From 381699f6514145b783870b246bbdd153dcb00167 Mon Sep 17 00:00:00 2001
From: atef
Date: Fri, 23 Feb 2024 13:08:19 +0100
Subject: [PATCH] Add extractS3data script

---
 .../opt/innovenergy/scripts/extractS3data.py | 212 ++++++++++++++++++
 1 file changed, 212 insertions(+)
 create mode 100644 firmware/opt/innovenergy/scripts/extractS3data.py

diff --git a/firmware/opt/innovenergy/scripts/extractS3data.py b/firmware/opt/innovenergy/scripts/extractS3data.py
new file mode 100644
index 000000000..41cd81f03
--- /dev/null
+++ b/firmware/opt/innovenergy/scripts/extractS3data.py
@@ -0,0 +1,212 @@
+import os
+import csv
+import subprocess
+import argparse
+import matplotlib.pyplot as plt
+from collections import defaultdict
+
+def extract_timestamp(filename):
+    # File names start with a 10-digit UNIX timestamp; fall back to 0 so
+    # non-conforming names sort first instead of raising.
+    timestamp_str = filename[:10]
+    try:
+        return int(timestamp_str)
+    except ValueError:
+        return 0
+
+def extract_values_by_key(csv_file, keys, exact_match):
+    # Collect matching rows, grouped by the last component of the signal
+    # path found in the first column.
+    matched_values = defaultdict(list)
+    with open(csv_file, 'r') as file:
+        reader = csv.reader(file)
+        for row in reader:
+            if row:
+                columns = row[0].split(';')
+                if len(columns) > 1:
+                    first_column = columns[0].strip()
+                    path_key = first_column.split('/')[-1]
+                    for key_item in keys:
+                        if exact_match:
+                            if key_item.lower() == path_key.lower():
+                                matched_values[path_key].append(row[0])
+                        else:
+                            if key_item.lower() in first_column.lower():
+                                matched_values[path_key].append(row[0])
+    # Concatenate all matched keys into a single key and merge all value
+    # lists, so the caller receives one entry covering every match.
+    final_key = ''.join(matched_values.keys())
+    combined_values = []
+    for values in matched_values.values():
+        combined_values.extend(values)
+    return {final_key: combined_values}
+
+def list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize):
+    # One file is written every 2 seconds, so step through the range in
+    # increments of 2 * sampling_stepsize seconds.
+    return [str(timestamp) for timestamp in range(start_timestamp, end_timestamp + 1, 2 * sampling_stepsize)]
+
+def check_s3_files_exist(bucket_number, filename):
+    s3cmd_ls_command = f"s3cmd ls s3://{bucket_number}-3e5b3069-214a-43ee-8d85-57d72000c19d/{filename}*"
+    try:
+        result = subprocess.run(s3cmd_ls_command, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        lines = result.stdout.decode().split('\n')[:-1]
+        filenames = [line.split()[-1].split('/')[-1] for line in lines]
+        return filenames
+    except subprocess.CalledProcessError as e:
+        print(f"Error checking S3 files: {e}")
+        return []
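+
+# A minimal sketch of the assumed S3 layout (the bucket suffix comes from the
+# s3cmd calls in this script; the timestamp is a hypothetical example): every
+# 2-second sample lives in its own object, named after its UNIX timestamp,
+#
+#   s3://42-3e5b3069-214a-43ee-8d85-57d72000c19d/1708689500.csv
+#
+# so a "{filename}*" prefix listing matches at most one sample file.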
+def download_files(bucket_number, filenames_to_download):
+    output_directory = f"S3cmdData_{bucket_number}"
+
+    if not os.path.exists(output_directory):
+        os.makedirs(output_directory)
+        print(f"Directory '{output_directory}' created.")
+
+    for filename in filenames_to_download:
+        stripfilename = filename.strip()
+        local_path = os.path.join(output_directory, stripfilename + ".csv")
+        if not os.path.exists(local_path):
+            s3cmd_command = f"s3cmd get s3://{bucket_number}-3e5b3069-214a-43ee-8d85-57d72000c19d/{stripfilename}.csv {output_directory}/"
+            try:
+                subprocess.run(s3cmd_command, shell=True, check=True)
+                downloaded_files = [file for file in os.listdir(output_directory) if file.startswith(stripfilename)]
+                if not downloaded_files:
+                    print(f"No matching files found for prefix '{stripfilename}'.")
+                else:
+                    print(f"Files with prefix '{stripfilename}' downloaded successfully.")
+            except subprocess.CalledProcessError as e:
+                print(f"Error downloading files: {e}")
+                continue
+        else:
+            print(f"File '{stripfilename}.csv' already exists locally. Skipping download.")
+
+def visualize_data(data, output_directory, start_timestamp, key):
+    # Note: currently unused; kept as a plotting helper for extracted data,
+    # where data is a list of (timestamp, value) pairs.
+    x_values = [int(entry[0]) for entry in data]
+    y_values = [float(entry[1]) for entry in data]
+
+    # Plot the value over time and save the figure next to the extracted CSV.
+    plt.plot(x_values, y_values, marker='o', linestyle='-', color='b')
+    plt.xlabel('Timestamp')
+    plt.ylabel(key)
+    plt.title(f'{key} over time')
+    plt.grid(True)
+    plt.savefig(os.path.join(output_directory, f"{start_timestamp}_{key}_plot.png"))
+    plt.close()  # Close the plot window
+
+    # Save the plotted data to CSV as well.
+    csv_file_path = os.path.join(output_directory, f"{start_timestamp}_{key}_extracted.csv")
+    with open(csv_file_path, 'w', newline='') as csvfile:
+        csv_writer = csv.writer(csvfile)
+        csv_writer.writerow(['Timestamp', 'Value'])
+        csv_writer.writerows(data)
+
+def get_last_component(path):
+    # Flatten the key path by removing the slashes, so it can be used as
+    # part of the output file name.
+    return path.replace('/', '')
+
+def download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, keys, booleans_as_numbers, exact_match):
+    output_directory = f"S3cmdData_{bucket_number}"
+
+    if not os.path.exists(output_directory):
+        os.makedirs(output_directory)
+        print(f"Directory '{output_directory}' created.")
+
+    filenames_to_check = list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize)
+
+    existing_files = [filename for filename in filenames_to_check if os.path.exists(os.path.join(output_directory, f"{filename}.csv"))]
+    files_to_download = set(filenames_to_check) - set(existing_files)
+
+    if os.listdir(output_directory):
+        print("Files already exist in the local folder. Skipping download.")
+    elif files_to_download:
+        download_files(bucket_number, files_to_download)
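+
+    # Sketch of the output written below (the signal path is hypothetical):
+    #
+    #   time,/Battery/Devices/1/Soc
+    #   1708689500,85.0
+    #   1708689502,85.1
+    #
+    # i.e. one row per downloaded file and one column per matched signal.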
Skipping download.") + else: + if files_to_download: + download_files(bucket_number, files_to_download) + + + # Process CSV files + csv_files = [file for file in os.listdir(output_directory) if file.endswith('.csv')] + csv_files.sort(key=extract_timestamp) + keypath = '' + for key_item in key: + keypath+= get_last_component(key_item) + output_csv_filename = f"{keypath}_{start_timestamp}_{bucket_number}.csv" + with open(output_csv_filename, 'w', newline='') as csvfile: + csv_writer = csv.writer(csvfile) + header = ['time'] + add_header = True + + for csv_file in csv_files: + file_path = os.path.join(output_directory, csv_file) + extracted_values = extract_values_by_key(file_path, key, exact_match) + if add_header: + add_header = False + for values in extracted_values.values(): + first_value = values + for first_val in first_value: + header.append(first_val.split(';')[0].strip()) + break + csv_writer.writerow(header) + if extracted_values: + for first_column, values in extracted_values.items(): + if booleans_as_numbers: + values = [1 if value.split(';')[1].strip() == "True" else 0 if value.split(';')[1].strip() == "False" else value.split(';')[1].strip() for value in values] + values_list = [] + values_list.append(csv_file.replace(".csv", "")) + for i, value in enumerate(values): + if value is None: + value = "No value provided" + else: + values_list.append(value.split(';')[1].strip()) + csv_writer.writerow(values_list) + + print(f"Extracted data saved in '{output_csv_filename}'.") + +def parse_keys(input_string): + # Split the input string by commas and strip whitespace + keys = [key.strip() for key in input_string.split(',')] + # Return keys as a list if more than one, else return the single key + #return keys if len(keys) > 1 else keys[0] + return keys + + +def main(): + parser = argparse.ArgumentParser(description='Download files from S3 using s3cmd and extract specific values from CSV files.') + parser.add_argument('start_timestamp', type=int, help='The start timestamp for the range (even number)') + parser.add_argument('end_timestamp', type=int, help='The end timestamp for the range (even number)') + #parser.add_argument('--key', type=str, required=True, help='The part to match from each CSV file') + parser.add_argument('--keys', type=parse_keys, required=True, help='The part to match from each CSV file, can be a single key or a comma-separated list of keys') + parser.add_argument('--bucket-number', type=int, required=True, help='The number of the bucket to download from') + parser.add_argument('--sampling_stepsize', type=int, required=False, default=1, help='The number of 2sec intervals, which define the length of the sampling interval in S3 file retrieval') + parser.add_argument('--booleans_as_numbers', action="store_true", required=False, help='If key used, then booleans are converted to numbers [0/1], if key not used, then booleans maintained as text [False/True]') + parser.add_argument('--exact_match', action="store_true", required=False, help='If key used, then key has to match exactly "=", else it is enough that key is found "in" text') + + + args = parser.parse_args(); + start_timestamp = args.start_timestamp + end_timestamp = args.end_timestamp + keys = args.keys + bucket_number = args.bucket_number + sampling_stepsize = args.sampling_stepsize + booleans_as_numbers = args.booleans_as_numbers + exact_match = args.exact_match + + + + # Check if start_timestamp is smaller than end_timestamp + if start_timestamp >= end_timestamp: + print("Error: start_timestamp must be smaller 
than end_timestamp.") + return + download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, keys, booleans_as_numbers, exact_match) + +if __name__ == "__main__": + main() +