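"""Download per-timestamp CSV files from S3 via s3cmd, decompress them
(base64-encoded zip archives containing a data.csv), extract the rows matching
the requested keys, and combine the values into a single time-series CSV."""
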
import os
import csv
import subprocess
import argparse
import matplotlib.pyplot as plt
from collections import defaultdict
import zipfile
import base64
import shutil


def extract_timestamp(filename):
    """Return the leading 10-digit Unix timestamp of a file name, or 0 if it cannot be parsed."""
    timestamp_str = filename[:10]
    try:
        return int(timestamp_str)
    except ValueError:
        return 0
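# Example: extract_timestamp("1700000000.csv") -> 1700000000;
# extract_timestamp("data.csv") -> 0 (non-numeric prefix).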


def extract_values_by_key(csv_file, key, exact_match):
    """Collect the rows of `csv_file` whose first column matches any of the given
    keys, grouped by the last path component of that column.

    Rows are expected in the form 'some/signal/path;value[;...]'.
    """
    matched_values = defaultdict(list)
    with open(csv_file, 'r') as file:
        reader = csv.reader(file)
        for row in reader:
            if not row:
                continue
            columns = row[0].split(';')
            if len(columns) > 1:
                first_column = columns[0].strip()
                path_key = first_column.split('/')[-1]
                for key_item in key:
                    if exact_match:
                        # Compare against the last path component only.
                        if key_item.lower() == row[0].split('/')[-1].split(';')[0].lower():
                            matched_values[path_key].append(row[0])
                    elif key_item.lower() in first_column.lower():
                        matched_values[path_key].append(row[0])
    # Collapse all matches into a single entry keyed by the concatenated path keys.
    final_key = ''.join(matched_values.keys())
    combined_values = []
    for values in matched_values.values():
        combined_values.extend(values)
    return {final_key: combined_values}


def list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize):
    # Files are written every 2 seconds, so step by 2 * sampling_stepsize seconds.
    filenames_in_range = [f"{timestamp:10d}" for timestamp in range(start_timestamp, end_timestamp + 1, 2 * sampling_stepsize)]
    return filenames_in_range
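# Example: list_files_in_range(1700000000, 1700000008, 2)
# -> ['1700000000', '1700000004', '1700000008']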


def download_files(bucket_number, filenames_to_download, product_type):
    """Fetch the given files from the product's S3 bucket via s3cmd."""
    if product_type == 0:
        bucket_hash = "3e5b3069-214a-43ee-8d85-57d72000c19d"
    elif product_type == 1:
        bucket_hash = "c0436b6a-d276-4cd8-9c44-1eae86cf5d0e"
    else:
        raise ValueError("Invalid product type option. Use 0 or 1.")

    output_directory = f"S3cmdData_{bucket_number}"
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
        print(f"Directory '{output_directory}' created.")

    for filename in filenames_to_download:
        stripfilename = filename.strip()
        local_path = os.path.join(output_directory, stripfilename + ".csv")
        if not os.path.exists(local_path):
            s3cmd_command = f"s3cmd get s3://{bucket_number}-{bucket_hash}/{stripfilename}.csv {output_directory}/"
            try:
                subprocess.run(s3cmd_command, shell=True, check=True)
                # Check by the stripped prefix; `filename` may carry format padding.
                downloaded_files = [file for file in os.listdir(output_directory) if file.startswith(stripfilename)]
                if not downloaded_files:
                    print(f"No matching files found for prefix '{stripfilename}'.")
                else:
                    print(f"Files with prefix '{stripfilename}' downloaded successfully.")
            except subprocess.CalledProcessError as e:
                print(f"Error downloading files: {e}")
                continue
        else:
            print(f"File '{stripfilename}.csv' already exists locally. Skipping download.")


def decompress_file(compressed_file, output_directory):
    """Decode a base64-encoded zip archive and extract its data.csv under the original timestamp-based name."""
    base_name = os.path.splitext(os.path.basename(compressed_file))[0]

    with open(compressed_file, 'rb') as file:
        compressed_data = file.read()

    # Decode the base64-encoded content.
    decoded_data = base64.b64decode(compressed_data)

    zip_path = os.path.join(output_directory, 'temp.zip')
    with open(zip_path, 'wb') as zip_file:
        zip_file.write(decoded_data)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(output_directory)

    # Rename the extracted data.csv file to the original timestamp-based name.
    extracted_csv_path = os.path.join(output_directory, 'data.csv')
    if os.path.exists(extracted_csv_path):
        new_csv_path = os.path.join(output_directory, f"{base_name}.csv")
        os.rename(extracted_csv_path, new_csv_path)
        # Report success only when the rename happened; otherwise `new_csv_path`
        # would be unbound here.
        print(f"Decompressed and renamed '{compressed_file}' to '{new_csv_path}'.")

    os.remove(zip_path)
    #os.remove(compressed_file)


def get_last_component(path):
    # Note: despite the name, this removes all slashes so the key path can be
    # embedded in a flat output file name.
    path_without_slashes = path.replace('/', '')
    return path_without_slashes
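# Example (illustrative key path): get_last_component('/Battery/Soc') -> 'BatterySoc'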


def download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, key, booleans_as_numbers, exact_match, product_type):
    output_directory = f"S3cmdData_{bucket_number}"

    # Start from a clean directory so stale files never leak into the output.
    if os.path.exists(output_directory):
        shutil.rmtree(output_directory)
    os.makedirs(output_directory)
    print(f"Directory '{output_directory}' created.")

    filenames_to_check = list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize)
    existing_files = [filename for filename in filenames_to_check if os.path.exists(os.path.join(output_directory, f"{filename}.csv"))]
    files_to_download = set(filenames_to_check) - set(existing_files)
    if os.listdir(output_directory):
        print("Files already exist in the local folder. Skipping download.")
    elif files_to_download:
        download_files(bucket_number, files_to_download, product_type)

    # Decompress all downloaded .csv files (which are actually base64-encoded zips).
    compressed_files = [os.path.join(output_directory, file) for file in os.listdir(output_directory) if file.endswith('.csv')]
    for compressed_file in compressed_files:
        decompress_file(compressed_file, output_directory)

    csv_files = [file for file in os.listdir(output_directory) if file.endswith('.csv')]
    csv_files.sort(key=extract_timestamp)

    keypath = ''
    for key_item in key:
        keypath += get_last_component(key_item)

    output_csv_filename = f"{keypath}_{start_timestamp}_{bucket_number}.csv"
    with open(output_csv_filename, 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        header = ['time']
        add_header = True
        for csv_file in csv_files:
            file_path = os.path.join(output_directory, csv_file)
            extracted_values = extract_values_by_key(file_path, key, exact_match)
            if add_header:
                add_header = False
                # Build the header from the signal paths found in the first file.
                for values in extracted_values.values():
                    for first_val in values:
                        header.append(first_val.split(';')[0].strip())
                    break
                csv_writer.writerow(header)
            if extracted_values:
                for first_column, values in extracted_values.items():
                    # First column is the timestamp taken from the file name.
                    values_list = [csv_file.replace(".csv", "")]
                    for value in values:
                        if value is None:
                            values_list.append("No value provided")
                        else:
                            extracted = value.split(';')[1].strip()
                            if booleans_as_numbers:
                                # Map True/False to 1/0; leave other values as text.
                                extracted = {"True": "1", "False": "0"}.get(extracted, extracted)
                            values_list.append(extracted)
                    csv_writer.writerow(values_list)
    print(f"Extracted data saved in '{output_csv_filename}'.")


def parse_keys(input_string):
    """Split a comma-separated key list into individual, whitespace-stripped keys."""
    return [key.strip() for key in input_string.split(',')]
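# Example (illustrative key names): parse_keys("Soc, ActivePower") -> ['Soc', 'ActivePower']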


def main():
    parser = argparse.ArgumentParser(description='Download files from S3 using s3cmd and extract specific values from CSV files.')
    parser.add_argument('start_timestamp', type=int, help='The start timestamp for the range (even number)')
    parser.add_argument('end_timestamp', type=int, help='The end timestamp for the range (even number)')
    parser.add_argument('--keys', type=parse_keys, required=True, help='The part to match from each CSV file; can be a single key or a comma-separated list of keys')
    parser.add_argument('--bucket-number', type=int, required=True, help='The number of the bucket to download from')
    parser.add_argument('--sampling_stepsize', type=int, required=False, default=1, help='The number of 2-second intervals that defines the sampling interval for S3 file retrieval')
    parser.add_argument('--booleans_as_numbers', action="store_true", required=False, help='If set, booleans are converted to numbers [0/1]; otherwise they are kept as text [False/True]')
    parser.add_argument('--exact_match', action="store_true", required=False, help='If set, the key has to match exactly ("="); otherwise it suffices that the key is found "in" the text')
    parser.add_argument('--product_type', required=True, help='Use 0 for Salimax and 1 for Salidomo')

    args = parser.parse_args()

    start_timestamp = args.start_timestamp
    end_timestamp = args.end_timestamp
    keys = args.keys
    bucket_number = args.bucket_number
    sampling_stepsize = args.sampling_stepsize
    booleans_as_numbers = args.booleans_as_numbers
    exact_match = args.exact_match
    product_type = int(args.product_type)

    if start_timestamp >= end_timestamp:
        print("Error: start_timestamp must be smaller than end_timestamp.")
        return

    download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, keys, booleans_as_numbers, exact_match, product_type)


if __name__ == "__main__":
    main()
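
# Example invocation (script name, bucket number, and keys are illustrative):
#   python this_script.py 1700000000 1700000600 --keys Soc --bucket-number 7 --product_type 0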