From 381699f6514145b783870b246bbdd153dcb00167 Mon Sep 17 00:00:00 2001
From: atef
Date: Fri, 23 Feb 2024 13:08:19 +0100
Subject: [PATCH] Add extractS3data script

---
 .../opt/innovenergy/scripts/extractS3data.py | 212 ++++++++++++++++++
 1 file changed, 212 insertions(+)
 create mode 100644 firmware/opt/innovenergy/scripts/extractS3data.py

diff --git a/firmware/opt/innovenergy/scripts/extractS3data.py b/firmware/opt/innovenergy/scripts/extractS3data.py
new file mode 100644
index 000000000..41cd81f03
--- /dev/null
+++ b/firmware/opt/innovenergy/scripts/extractS3data.py
@@ -0,0 +1,212 @@
+import os
+import csv
+import subprocess
+import argparse
+import matplotlib.pyplot as plt
+from collections import defaultdict
+
+def extract_timestamp(filename):
+    # File names start with a 10-digit UNIX timestamp; fall back to 0 so
+    # non-conforming names sort first instead of raising.
+    timestamp_str = filename[:10]
+    try:
+        return int(timestamp_str)
+    except ValueError:
+        return 0
+
+def extract_values_by_key(csv_file, keys, exact_match):
+    # Collect matching rows, grouped by the last component of the signal
+    # path found in the first column.
+    matched_values = defaultdict(list)
+    with open(csv_file, 'r') as file:
+        reader = csv.reader(file)
+        for row in reader:
+            if row:
+                columns = row[0].split(';')
+                if len(columns) > 1:
+                    first_column = columns[0].strip()
+                    path_key = first_column.split('/')[-1]
+                    for key_item in keys:
+                        if exact_match:
+                            if key_item.lower() == path_key.lower():
+                                matched_values[path_key].append(row[0])
+                        else:
+                            if key_item.lower() in first_column.lower():
+                                matched_values[path_key].append(row[0])
+    # Concatenate all matched keys into a single key and merge all value
+    # lists, so the caller receives one entry covering every match.
+    final_key = ''.join(matched_values.keys())
+    combined_values = []
+    for values in matched_values.values():
+        combined_values.extend(values)
+    return {final_key: combined_values}
+
+def list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize):
+    # One file is written every 2 seconds, so step through the range in
+    # increments of 2 * sampling_stepsize seconds.
+    return [str(timestamp) for timestamp in range(start_timestamp, end_timestamp + 1, 2 * sampling_stepsize)]
+
+def check_s3_files_exist(bucket_number, filename):
+    s3cmd_ls_command = f"s3cmd ls s3://{bucket_number}-3e5b3069-214a-43ee-8d85-57d72000c19d/{filename}*"
+    try:
+        result = subprocess.run(s3cmd_ls_command, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        lines = result.stdout.decode().split('\n')[:-1]
+        filenames = [line.split()[-1].split('/')[-1] for line in lines]
+        return filenames
+    except subprocess.CalledProcessError as e:
+        print(f"Error checking S3 files: {e}")
+        return []
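+
+# A minimal sketch of the assumed S3 layout (the bucket suffix comes from the
+# s3cmd calls in this script; the timestamp is a hypothetical example): every
+# 2-second sample lives in its own object, named after its UNIX timestamp,
+#
+#   s3://42-3e5b3069-214a-43ee-8d85-57d72000c19d/1708689500.csv
+#
+# so a "{filename}*" prefix listing matches at most one sample file.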
+def download_files(bucket_number, filenames_to_download):
+    output_directory = f"S3cmdData_{bucket_number}"
+
+    if not os.path.exists(output_directory):
+        os.makedirs(output_directory)
+        print(f"Directory '{output_directory}' created.")
+
+    for filename in filenames_to_download:
+        stripfilename = filename.strip()
+        local_path = os.path.join(output_directory, stripfilename + ".csv")
+        if not os.path.exists(local_path):
+            s3cmd_command = f"s3cmd get s3://{bucket_number}-3e5b3069-214a-43ee-8d85-57d72000c19d/{stripfilename}.csv {output_directory}/"
+            try:
+                subprocess.run(s3cmd_command, shell=True, check=True)
+                downloaded_files = [file for file in os.listdir(output_directory) if file.startswith(stripfilename)]
+                if not downloaded_files:
+                    print(f"No matching files found for prefix '{stripfilename}'.")
+                else:
+                    print(f"Files with prefix '{stripfilename}' downloaded successfully.")
+            except subprocess.CalledProcessError as e:
+                print(f"Error downloading files: {e}")
+                continue
+        else:
+            print(f"File '{stripfilename}.csv' already exists locally. Skipping download.")
+
+def visualize_data(data, output_directory, start_timestamp, key):
+    # Note: currently unused; kept as a plotting helper for extracted data,
+    # where data is a list of (timestamp, value) pairs.
+    x_values = [int(entry[0]) for entry in data]
+    y_values = [float(entry[1]) for entry in data]
+
+    # Plot the value over time and save the figure next to the extracted CSV.
+    plt.plot(x_values, y_values, marker='o', linestyle='-', color='b')
+    plt.xlabel('Timestamp')
+    plt.ylabel(key)
+    plt.title(f'{key} over time')
+    plt.grid(True)
+    plt.savefig(os.path.join(output_directory, f"{start_timestamp}_{key}_plot.png"))
+    plt.close()  # Close the plot window
+
+    # Save the plotted data to CSV as well.
+    csv_file_path = os.path.join(output_directory, f"{start_timestamp}_{key}_extracted.csv")
+    with open(csv_file_path, 'w', newline='') as csvfile:
+        csv_writer = csv.writer(csvfile)
+        csv_writer.writerow(['Timestamp', 'Value'])
+        csv_writer.writerows(data)
+
+def get_last_component(path):
+    # Flatten the key path by removing the slashes, so it can be used as
+    # part of the output file name.
+    return path.replace('/', '')
+
+def download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, keys, booleans_as_numbers, exact_match):
+    output_directory = f"S3cmdData_{bucket_number}"
+
+    if not os.path.exists(output_directory):
+        os.makedirs(output_directory)
+        print(f"Directory '{output_directory}' created.")
+
+    filenames_to_check = list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize)
+
+    existing_files = [filename for filename in filenames_to_check if os.path.exists(os.path.join(output_directory, f"{filename}.csv"))]
+    files_to_download = set(filenames_to_check) - set(existing_files)
+
+    if os.listdir(output_directory):
+        print("Files already exist in the local folder. Skipping download.")
+    elif files_to_download:
+        download_files(bucket_number, files_to_download)
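+
+    # Sketch of the output written below (the signal path is hypothetical):
+    #
+    #   time,/Battery/Devices/1/Soc
+    #   1708689500,85.0
+    #   1708689502,85.1
+    #
+    # i.e. one row per downloaded file and one column per matched signal.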
Skipping download.") + else: + if files_to_download: + download_files(bucket_number, files_to_download) + + + # Process CSV files + csv_files = [file for file in os.listdir(output_directory) if file.endswith('.csv')] + csv_files.sort(key=extract_timestamp) + keypath = '' + for key_item in key: + keypath+= get_last_component(key_item) + output_csv_filename = f"{keypath}_{start_timestamp}_{bucket_number}.csv" + with open(output_csv_filename, 'w', newline='') as csvfile: + csv_writer = csv.writer(csvfile) + header = ['time'] + add_header = True + + for csv_file in csv_files: + file_path = os.path.join(output_directory, csv_file) + extracted_values = extract_values_by_key(file_path, key, exact_match) + if add_header: + add_header = False + for values in extracted_values.values(): + first_value = values + for first_val in first_value: + header.append(first_val.split(';')[0].strip()) + break + csv_writer.writerow(header) + if extracted_values: + for first_column, values in extracted_values.items(): + if booleans_as_numbers: + values = [1 if value.split(';')[1].strip() == "True" else 0 if value.split(';')[1].strip() == "False" else value.split(';')[1].strip() for value in values] + values_list = [] + values_list.append(csv_file.replace(".csv", "")) + for i, value in enumerate(values): + if value is None: + value = "No value provided" + else: + values_list.append(value.split(';')[1].strip()) + csv_writer.writerow(values_list) + + print(f"Extracted data saved in '{output_csv_filename}'.") + +def parse_keys(input_string): + # Split the input string by commas and strip whitespace + keys = [key.strip() for key in input_string.split(',')] + # Return keys as a list if more than one, else return the single key + #return keys if len(keys) > 1 else keys[0] + return keys + + +def main(): + parser = argparse.ArgumentParser(description='Download files from S3 using s3cmd and extract specific values from CSV files.') + parser.add_argument('start_timestamp', type=int, help='The start timestamp for the range (even number)') + parser.add_argument('end_timestamp', type=int, help='The end timestamp for the range (even number)') + #parser.add_argument('--key', type=str, required=True, help='The part to match from each CSV file') + parser.add_argument('--keys', type=parse_keys, required=True, help='The part to match from each CSV file, can be a single key or a comma-separated list of keys') + parser.add_argument('--bucket-number', type=int, required=True, help='The number of the bucket to download from') + parser.add_argument('--sampling_stepsize', type=int, required=False, default=1, help='The number of 2sec intervals, which define the length of the sampling interval in S3 file retrieval') + parser.add_argument('--booleans_as_numbers', action="store_true", required=False, help='If key used, then booleans are converted to numbers [0/1], if key not used, then booleans maintained as text [False/True]') + parser.add_argument('--exact_match', action="store_true", required=False, help='If key used, then key has to match exactly "=", else it is enough that key is found "in" text') + + + args = parser.parse_args(); + start_timestamp = args.start_timestamp + end_timestamp = args.end_timestamp + keys = args.keys + bucket_number = args.bucket_number + sampling_stepsize = args.sampling_stepsize + booleans_as_numbers = args.booleans_as_numbers + exact_match = args.exact_match + + + + # Check if start_timestamp is smaller than end_timestamp + if start_timestamp >= end_timestamp: + print("Error: start_timestamp must be smaller 
than end_timestamp.") + return + download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, keys, booleans_as_numbers, exact_match) + +if __name__ == "__main__": + main() +