From 98fb313a6fff773aef57e5632a6dccca9e412480 Mon Sep 17 00:00:00 2001
From: atef
Date: Wed, 4 Dec 2024 17:09:06 +0100
Subject: [PATCH] SM-17

---
 .../innovenergy/scripts/ExtractS3README.txt  | 127 -----------
 .../opt/innovenergy/scripts/extractS3data.py | 205 ------------------
 2 files changed, 332 deletions(-)
 delete mode 100644 firmware/opt/innovenergy/scripts/ExtractS3README.txt
 delete mode 100644 firmware/opt/innovenergy/scripts/extractS3data.py

diff --git a/firmware/opt/innovenergy/scripts/ExtractS3README.txt b/firmware/opt/innovenergy/scripts/ExtractS3README.txt
deleted file mode 100644
index 97854d4d7..000000000
--- a/firmware/opt/innovenergy/scripts/ExtractS3README.txt
+++ /dev/null
@@ -1,127 +0,0 @@
-This README provides a guide to a Python script for interacting with S3 storage,
-designed for downloading and processing data files based on a specified time range and key parameters.
-The script requires Python3 on your system and uses the s3cmd tool to access data in cloud storage.
-This file also shows how to configure s3cmd by creating a .s3cfg file with your access credentials.
-
-
-############ Create the .s3cfg file in your home directory ################
-
-nano .s3cfg
-
-Copy these lines into the file:
-
-[default]
-host_base = sos-ch-dk-2.exo.io
-host_bucket = %(bucket)s.sos-ch-dk-2.exo.io
-access_key = EXO4d838d1360ba9fb7d51648b0
-secret_key = _bmrp6ewWAvNwdAQoeJuC-9y02Lsx7NV6zD-WjljzCU
-use_https = True
-
-
-############ s3cmd installation ################
-
-Please install s3cmd to retrieve data from our cloud storage:
-
-sudo apt install s3cmd
-
-############ Python3 installation ################
-
-To check whether you already have Python3, run this command:
-
-    python3 --version
-
-To install it, you can use these commands:
-
-1) sudo apt update
-
-2) sudo apt install python3
-
-3) python3 --version (to check that Python3 installed correctly)
-
-
-############ Run extractRange.py ################
-
-usage: extractRange.py [-h] --keys KEYS --bucket-number BUCKET_NUMBER --product_type PRODUCT_TYPE start_timestamp end_timestamp
-
-KEYS: a key can be a single word or a path (one key, or several separated by commas)
-
-    for example: /DcDc/Devices/2/Status/Dc/Battery/voltage ==> this will provide the DC battery voltage of DcDc device 2
-    example : Dc/Battery/voltage ==> this will provide the voltage of every DcDc device (including the average voltage of all DcDc devices)
-    example : voltage ==> this will provide every voltage of every device in the Salimax
-
-BUCKET_NUMBER: the number of the bucket belonging to the installation
-
-    List of bucket numbers / installations:
-    1: Prototype
-    2: Marti Technik (Bern)
-    3: Schreinerei Schönthal (Thun)
-    4: Wittmann Kottingbrunn
-    5: Biohof Gubelmann (Walde)
-    6: Steakhouse Mettmenstetten
-    7: Andreas Ballif / Lerchenhof
-    8: Weidmann Oberwil (ZG)
-    9: Christian Huber (EBS Elektrotechnik)
-
-PRODUCT_TYPE: 0 for Salimax, 1 for Salidomo
-
-start_timestamp end_timestamp: these must be valid 10-digit Unix timestamps,
-and start_timestamp must be smaller than end_timestamp.
-
-PS: The data will be downloaded to a folder named S3cmdData_{BUCKET_NUMBER}. If this folder does not exist, it will be created.
-If the folder exists but contains no files, the script will download the data.
-If the folder exists and contains at least one file, the script will skip the download and only perform the data extraction.
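-
-For reference, one way to compute such a 10-digit timestamp is with Python's standard
-datetime module (a minimal sketch; substitute the date and time you need):
-
-python3 -c "from datetime import datetime, timezone; print(int(datetime(2024, 2, 5, 0, 0, tzinfo=timezone.utc).timestamp()))"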
-
-Example command:
-
-python3 extractRange.py 1707087500 1707091260 --keys ActivePowerImportT2 --bucket-number 1 --product_type 0
-
-
-################################ EXTENDED FEATURES FOR MORE ADVANCED USAGE ################################
-
-1) Multiple Keys Support:
-
-The script supports extracting data using multiple keys. Users can specify one or more keys, separated by commas, with the --keys parameter.
-This feature allows for more granular data extraction, catering to diverse data analysis requirements. For example, users can extract data for
-different metrics or parameters from the same or different CSV files within the specified range.
-
-2) Exact Match for Keys:
-
-With the --exact_match flag, the script offers an option to enforce exact matching of keys. This means that only rows containing a key that exactly
-matches the specified key(s) will be considered during the data extraction process. This option enhances the precision of the data extraction, making it
-particularly useful when dealing with CSV files that contain similar but distinct keys.
-
-3) Dynamic Header Generation:
-
-The script dynamically generates the header of the output CSV file based on the keys provided. This ensures that the output file accurately reflects the
-extracted data, providing a clear and understandable format for subsequent analysis. The header columns correspond to the keys used for data extraction,
-making it easy to identify and analyze the extracted data.
-
-4) Advanced Data Processing Capabilities:
-
-i) Booleans as Numbers: The --booleans_as_numbers flag converts boolean values (True/False) into numeric representations (1/0). This feature
-is particularly useful for analytical tasks that require numerical data processing.
-
-ii) Sampling Stepsize: The --sampling_stepsize parameter defines the granularity of the time range for data extraction. By specifying the number
-of 2-second intervals, users can adjust the sampling interval, allowing for flexible data retrieval over time.
-
-Example Command:
-
-python3 extractRange.py 1707087500 1707091260 --keys ActivePowerImportT2,Soc --bucket-number 1 --product_type 0 --exact_match --booleans_as_numbers
-
-This command extracts data for the ActivePowerImportT2 and Soc keys from bucket number 1, between the specified timestamps, with exact
-matching of keys and boolean values converted to numbers.
-
-Visualization and Data Analysis:
-
-After data extraction, the script facilitates data analysis by:
-
-i) Providing a visualization function to plot the extracted data. Users can modify this function to suit their specific analysis needs, adjusting
-plot labels, titles, and other matplotlib parameters.
-
-ii) Saving the extracted data in a CSV file, with a dynamically generated header based on the specified keys. This file can be used for further
-analysis or imported into data analysis tools.
-
-This Python script streamlines the process of data retrieval from S3 storage, offering flexible and powerful options for data extraction, visualization,
-and analysis. Its support for multiple keys, exact match filtering, and advanced processing capabilities makes it a valuable tool for data analysts and
-researchers working with time-series data or any dataset stored in S3 buckets.
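-
-As an illustration of point i), the following sketch plots one column of the generated
-output CSV. It is a minimal example, not a function built into the script: the file name
-and the column name below are placeholders, so copy the actual names from your own run
-(the column headers are the full signal paths matched by your keys).
-
-import csv
-import matplotlib.pyplot as plt
-
-times, values = [], []
-with open('ActivePowerImportT2_1707087500_1.csv') as f:
-    for row in csv.DictReader(f):
-        times.append(int(row['time']))                    # first column: file timestamp
-        values.append(float(row['ActivePowerImportT2']))  # placeholder column name
-
-plt.plot(times, values)
-plt.xlabel('Unix timestamp')
-plt.ylabel('ActivePowerImportT2')
-plt.title('Extracted S3 data')
-plt.show()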
diff --git a/firmware/opt/innovenergy/scripts/extractS3data.py b/firmware/opt/innovenergy/scripts/extractS3data.py
deleted file mode 100644
index 4aeb99ee7..000000000
--- a/firmware/opt/innovenergy/scripts/extractS3data.py
+++ /dev/null
@@ -1,205 +0,0 @@
-import os
-import csv
-import subprocess
-import argparse
-import zipfile
-import base64
-import shutil
-from collections import defaultdict
-
-
-def extract_timestamp(filename):
-    # File names begin with a 10-digit Unix timestamp; fall back to 0 so
-    # unexpected names sort first instead of crashing the sort.
-    timestamp_str = filename[:10]
-    try:
-        return int(timestamp_str)
-    except ValueError:
-        return 0
-
-
-def extract_values_by_key(csv_file, key, exact_match):
-    # Each data row looks like "signal/path;value[;unit]"; collect the rows
-    # whose path matches one of the requested keys.
-    matched_values = defaultdict(list)
-    with open(csv_file, 'r') as file:
-        reader = csv.reader(file)
-        for row in reader:
-            if row:
-                columns = row[0].split(';')
-                if len(columns) > 1:
-                    first_column = columns[0].strip()
-                    path_key = first_column.split('/')[-1]
-                    for key_item in key:
-                        if exact_match:
-                            if key_item.lower() == path_key.lower():
-                                matched_values[path_key].append(row[0])
-                        elif key_item.lower() in first_column.lower():
-                            matched_values[path_key].append(row[0])
-    final_key = ''.join(matched_values.keys())
-    combined_values = []
-    for values in matched_values.values():
-        combined_values.extend(values)
-    return {final_key: combined_values}
-
-
-def list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize):
-    # One file is written every 2 seconds, so the step is 2 * sampling_stepsize.
-    return [f"{timestamp:10d}" for timestamp in range(start_timestamp, end_timestamp + 1, 2 * sampling_stepsize)]
-
-
-def download_files(bucket_number, filenames_to_download, product_type):
-    if product_type == 0:
-        bucket_hash = "3e5b3069-214a-43ee-8d85-57d72000c19d"
-    elif product_type == 1:
-        bucket_hash = "c0436b6a-d276-4cd8-9c44-1eae86cf5d0e"
-    else:
-        raise ValueError("Invalid product type option. Use 0 or 1")
-    output_directory = f"S3cmdData_{bucket_number}"
-
-    if not os.path.exists(output_directory):
-        os.makedirs(output_directory)
-        print(f"Directory '{output_directory}' created.")
-
-    for filename in filenames_to_download:
-        stripfilename = filename.strip()
-        local_path = os.path.join(output_directory, stripfilename + ".csv")
-        if not os.path.exists(local_path):
-            s3cmd_command = f"s3cmd get s3://{bucket_number}-{bucket_hash}/{stripfilename}.csv {output_directory}/"
-            try:
-                subprocess.run(s3cmd_command, shell=True, check=True)
-                downloaded_files = [file for file in os.listdir(output_directory) if file.startswith(stripfilename)]
-                if not downloaded_files:
-                    print(f"No matching files found for prefix '{stripfilename}'.")
-                else:
-                    print(f"Files with prefix '{stripfilename}' downloaded successfully.")
-            except subprocess.CalledProcessError as e:
-                print(f"Error downloading files: {e}")
-                continue
-        else:
-            print(f"File '{stripfilename}.csv' already exists locally. Skipping download.")
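-
-
-# The objects fetched above are not plain CSV despite their .csv extension:
-# the script assumes each one is a base64-encoded zip archive that contains a
-# single file named data.csv. decompress_file() undoes both layers and renames
-# the result back to the timestamp-based name, e.g. (hypothetical name):
-#
-#   1707087500.csv --base64 decode--> temp.zip --unzip--> data.csv
-#                  --rename--> 1707087500.csv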
-def decompress_file(compressed_file, output_directory):
-    base_name = os.path.splitext(os.path.basename(compressed_file))[0]
-
-    with open(compressed_file, 'rb') as file:
-        compressed_data = file.read()
-
-    # Decode the base64 encoded content
-    decoded_data = base64.b64decode(compressed_data)
-
-    zip_path = os.path.join(output_directory, 'temp.zip')
-    with open(zip_path, 'wb') as zip_file:
-        zip_file.write(decoded_data)
-
-    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-        zip_ref.extractall(output_directory)
-
-    # Rename the extracted data.csv to the original timestamp-based name;
-    # os.rename overwrites the compressed source file of the same name.
-    extracted_csv_path = os.path.join(output_directory, 'data.csv')
-    if os.path.exists(extracted_csv_path):
-        new_csv_path = os.path.join(output_directory, f"{base_name}.csv")
-        os.rename(extracted_csv_path, new_csv_path)
-        print(f"Decompressed and renamed '{compressed_file}' to '{new_csv_path}'.")
-    else:
-        print(f"Warning: no data.csv found inside '{compressed_file}'.")
-
-    os.remove(zip_path)
-
-
-def get_last_component(path):
-    # Flattens a key path by removing the slashes; used only to build the
-    # output file name.
-    return path.replace('/', '')
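-
-
-# download_and_process_files() below writes one output row per input file: the
-# first column is the file's timestamp (header 'time') and the remaining
-# columns are the values of the matched signals, whose full paths make up the
-# rest of the header row. Sketch of the result (hypothetical values):
-#
-#   time,/DcDc/Devices/2/Status/Dc/Battery/voltage
-#   1707087500,52.1
-#   1707087502,52.0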
-def download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, key, booleans_as_numbers, exact_match, product_type):
-    output_directory = f"S3cmdData_{bucket_number}"
-
-    # Start from a clean directory: decompress_file() renames files in place,
-    # so leftovers from an earlier run can no longer be base64-decoded.
-    if os.path.exists(output_directory):
-        shutil.rmtree(output_directory)
-    os.makedirs(output_directory)
-    print(f"Directory '{output_directory}' created.")
-
-    filenames_to_download = list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize)
-    download_files(bucket_number, filenames_to_download, product_type)
-
-    # Decompress all downloaded .csv files (which are actually compressed).
-    compressed_files = [os.path.join(output_directory, file) for file in os.listdir(output_directory) if file.endswith('.csv')]
-    for compressed_file in compressed_files:
-        decompress_file(compressed_file, output_directory)
-
-    csv_files = [file for file in os.listdir(output_directory) if file.endswith('.csv')]
-    csv_files.sort(key=extract_timestamp)
-
-    keypath = ''.join(get_last_component(key_item) for key_item in key)
-    output_csv_filename = f"{keypath}_{start_timestamp}_{bucket_number}.csv"
-    with open(output_csv_filename, 'w', newline='') as csvfile:
-        csv_writer = csv.writer(csvfile)
-        header = ['time']
-        add_header = True
-
-        for csv_file in csv_files:
-            file_path = os.path.join(output_directory, csv_file)
-            extracted_values = extract_values_by_key(file_path, key, exact_match)
-            if add_header:
-                # Build the header once, from the signal paths matched in the first file.
-                add_header = False
-                for values in extracted_values.values():
-                    for value in values:
-                        header.append(value.split(';')[0].strip())
-                    break
-                csv_writer.writerow(header)
-            for values in extracted_values.values():
-                values_list = [csv_file.replace(".csv", "")]
-                for value in values:
-                    if value is None:
-                        values_list.append("No value provided")
-                        continue
-                    value = value.split(';')[1].strip()
-                    if booleans_as_numbers:
-                        # Map True/False to 1/0; other values pass through unchanged.
-                        value = {'True': '1', 'False': '0'}.get(value, value)
-                    values_list.append(value)
-                csv_writer.writerow(values_list)
-
-    print(f"Extracted data saved in '{output_csv_filename}'.")
-
-
-def parse_keys(input_string):
-    # Turn "--keys a,b,c" into ['a', 'b', 'c'].
-    return [key.strip() for key in input_string.split(',')]
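-
-
-# Example invocation (sketch based on the README; the timestamps, keys and
-# bucket number are placeholders to adapt):
-#
-#   python3 extractS3data.py 1707087500 1707091260 \
-#       --keys ActivePowerImportT2,Soc --bucket-number 1 --product_type 0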
-def main():
-    parser = argparse.ArgumentParser(description='Download files from S3 using s3cmd and extract specific values from CSV files.')
-    parser.add_argument('start_timestamp', type=int, help='The start timestamp of the range (even number)')
-    parser.add_argument('end_timestamp', type=int, help='The end timestamp of the range (even number)')
-    parser.add_argument('--keys', type=parse_keys, required=True, help='The part to match in each CSV file; a single key or a comma-separated list of keys')
-    parser.add_argument('--bucket-number', type=int, required=True, help='The number of the bucket to download from')
-    parser.add_argument('--sampling_stepsize', type=int, required=False, default=1, help='The number of 2-second intervals that defines the sampling interval of the S3 file retrieval')
-    parser.add_argument('--booleans_as_numbers', action="store_true", required=False, help='If set, booleans are converted to numbers [0/1]; otherwise they stay text [False/True]')
-    parser.add_argument('--exact_match', action="store_true", required=False, help='If set, a key must match exactly; otherwise it is enough that the key occurs in the text')
-    parser.add_argument('--product_type', type=int, required=True, help='Use 0 for Salimax and 1 for Salidomo')
-
-    args = parser.parse_args()
-
-    if args.start_timestamp >= args.end_timestamp:
-        print("Error: start_timestamp must be smaller than end_timestamp.")
-        return
-    download_and_process_files(args.bucket_number, args.start_timestamp, args.end_timestamp, args.sampling_stepsize,
-                               args.keys, args.booleans_as_numbers, args.exact_match, args.product_type)
-
-
-if __name__ == "__main__":
-    main()