# Download files from S3 using s3cmd and extract specific values from CSV files.
import argparse
import base64
import csv
import io
import os
import shutil
import subprocess
import zipfile
from collections import defaultdict

import matplotlib.pyplot as plt
def extract_timestamp(filename):
    """Return the numeric timestamp prefix of *filename*.

    Args:
        filename: File name whose first 10 characters are expected to be
            digits, e.g. ``"1700000000.csv"``.

    Returns:
        The first 10 characters parsed as an ``int``, or ``0`` when they do
        not form a valid integer (including an empty or shorter name such as
        ``"123.csv"``).
    """
    try:
        return int(filename[:10])
    except ValueError:
        return 0
def extract_values_by_key(csv_file, key, exact_match):
    """Collect the rows of *csv_file* whose path matches any requested key.

    Only the first comma-separated field of each line is inspected. It is
    expected to look like ``<path>;<value>``, where ``<path>`` is a
    ``/``-separated name; lines without a ``;`` are ignored. Matching is
    case-insensitive.

    Args:
        csv_file: Path of the file to scan.
        key: Iterable of key strings to look for.
        exact_match: When true, a key must equal the last ``/`` component of
            the path; otherwise it only has to occur somewhere in the path.

    Returns:
        A dict with exactly one entry. Its key is the concatenation of the
        last path components of all matched rows (in first-seen order); its
        value is the list of matched ``<path>;<value>`` strings, grouped by
        that path component. When nothing matches the result is ``{'': []}``.
    """
    # Lower-case the keys once instead of once per row (also makes a one-shot
    # iterable safe to reuse across rows).
    wanted_keys = [key_item.lower() for key_item in key]
    matched_values = defaultdict(list)

    # newline='' is what the csv module expects for files it parses.
    with open(csv_file, 'r', newline='') as file:
        for row in csv.reader(file):
            if not row:
                continue
            record = row[0]
            columns = record.split(';')
            if len(columns) < 2:
                continue
            first_column = columns[0].strip()
            path_key = first_column.split('/')[-1]
            # Compare against the path only, never the value part, so a '/'
            # inside the value cannot break exact matching.
            candidate = path_key.lower() if exact_match else first_column.lower()
            for wanted in wanted_keys:
                if exact_match:
                    is_match = wanted == candidate
                else:
                    is_match = wanted in candidate
                # A row matching several keys is recorded once per matching key.
                if is_match:
                    matched_values[path_key].append(record)

    final_key = ''.join(matched_values)
    combined_values = [
        value for values in matched_values.values() for value in values
    ]
    return {final_key: combined_values}
def list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize):
    """List the file base names to fetch between two timestamps (inclusive).

    Names are generated every ``2 * sampling_stepsize`` starting at
    *start_timestamp*; ``sampling_stepsize`` counts 2-second intervals.

    Args:
        start_timestamp: First timestamp of the range.
        end_timestamp: Last timestamp of the range (included when it falls
            on the sampling grid).
        sampling_stepsize: Number of 2-second intervals between two names.

    Returns:
        The timestamps as decimal strings without extension. An empty list
        when the range is empty.

    Raises:
        ValueError: if *sampling_stepsize* is 0 (raised by ``range``).
    """
    step = 2 * sampling_stepsize
    # Plain str() instead of a width-10 format: padding short timestamps with
    # spaces yields names that never match "<name>.csv" on disk.
    return [str(timestamp) for timestamp in range(start_timestamp, end_timestamp + 1, step)]
def download_files(bucket_number, filenames_to_download, product_type):
    """Fetch ``<name>.csv`` objects from the product bucket with ``s3cmd``.

    Files are stored in ``S3cmdData_<bucket_number>`` (created when missing).
    A file that already exists locally is not downloaded again. A failed
    download is reported and the remaining files are still attempted.

    Args:
        bucket_number: Number that prefixes the S3 bucket name.
        filenames_to_download: Iterable of file base names (no extension);
            surrounding whitespace is ignored.
        product_type: ``0`` or ``1``; selects the bucket-name suffix.

    Raises:
        ValueError: if *product_type* is neither 0 nor 1.
    """
    if product_type == 0:
        bucket_hash = "3e5b3069-214a-43ee-8d85-57d72000c19d"
    elif product_type == 1:
        bucket_hash = "c0436b6a-d276-4cd8-9c44-1eae86cf5d0e"
    else:
        raise ValueError("Invalid product type option. Use 0 or 1")

    output_directory = f"S3cmdData_{bucket_number}"
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
        print(f"Directory '{output_directory}' created.")

    for filename in filenames_to_download:
        stripfilename = filename.strip()
        local_path = os.path.join(output_directory, stripfilename + ".csv")
        if os.path.exists(local_path):
            print(f"File '{filename}.csv' already exists locally. Skipping download.")
            continue

        # Argument list instead of a shell string: nothing is re-parsed by a shell.
        s3cmd_command = [
            "s3cmd",
            "get",
            f"s3://{bucket_number}-{bucket_hash}/{stripfilename}.csv",
            f"{output_directory}/",
        ]
        try:
            subprocess.run(s3cmd_command, check=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers a missing s3cmd executable, which a shell would
            # have reported as a non-zero exit status.
            print(f"Error downloading files: {e}")
            continue

        # Check against the stripped name: that is the name used on disk.
        downloaded_files = [
            file for file in os.listdir(output_directory)
            if file.startswith(stripfilename)
        ]
        if not downloaded_files:
            print(f"No matching files found for prefix '{filename}'.")
        else:
            print(f"Files with prefix '{filename}' downloaded successfully.")
def decompress_file(compressed_file, output_directory):
    """Decode a base64-wrapped zip archive and unpack it into *output_directory*.

    The file content is base64-decoded, the result is opened as a zip archive
    and all members are extracted into *output_directory*. If that leaves a
    ``data.csv`` there, it is renamed to ``<base name of compressed_file>.csv``,
    replacing any file of that name (typically the compressed input itself).

    Args:
        compressed_file: Path of the base64-encoded zip file.
        output_directory: Directory that receives the extracted files.

    Raises:
        binascii.Error: if the content is not valid base64.
        zipfile.BadZipFile: if the decoded bytes are not a zip archive.
    """
    base_name = os.path.splitext(os.path.basename(compressed_file))[0]

    with open(compressed_file, 'rb') as file:
        # Decode the base64 encoded content
        decoded_data = base64.b64decode(file.read())

    # Read the archive straight from memory so no scratch zip file is left
    # behind in the output directory.
    with zipfile.ZipFile(io.BytesIO(decoded_data)) as zip_ref:
        zip_ref.extractall(output_directory)

    # Rename the extracted data.csv file to the original timestamp-based name
    extracted_csv_path = os.path.join(output_directory, 'data.csv')
    if os.path.exists(extracted_csv_path):
        new_csv_path = os.path.join(output_directory, f"{base_name}.csv")
        # os.replace overwrites an existing destination on every platform;
        # os.rename raises on Windows when the target already exists.
        os.replace(extracted_csv_path, new_csv_path)
        print(f"Decompressed and renamed '{compressed_file}' to '{new_csv_path}'.")
def get_last_component(path):
    """Return *path* with every ``/`` removed.

    Despite its name, this does not pick the last path component: all
    components are kept and simply joined together, e.g.
    ``"/Battery/Soc"`` becomes ``"BatterySoc"``.

    Args:
        path: A ``/``-separated string.

    Returns:
        The same string without any ``/`` characters.
    """
    return path.replace('/', '')
def download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, key, booleans_as_numbers, exact_match, product_type):
# Drives the pipeline for one bucket: works in the local "S3cmdData_<bucket_number>"
# directory, downloads the files named by list_files_in_range(), runs decompress_file()
# on every .csv found there, calls extract_values_by_key() on each file, and opens
# "<keypath>_<start_timestamp>_<bucket_number>.csv" in the current directory for output.
# NOTE(review): as shown, this block has no indentation and several statements appear to
# be missing (flagged below), so it does not parse; restore them from the original file.
output_directory = f"S3cmdData_{bucket_number}"
# NOTE(review): the body of this 'if' is not visible here — presumably an existing
# directory is removed (shutil is imported but unused in the visible code); confirm.
if os.path.exists(output_directory):
# NOTE(review): only the message is visible; presumably the directory is created first.
if not os.path.exists(output_directory):
print(f"Directory '{output_directory}' created.")
# Work out which of the expected "<name>.csv" files are not present locally yet.
filenames_to_check = list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize)
existing_files = [filename for filename in filenames_to_check if os.path.exists(os.path.join(output_directory, f"{filename}.csv"))]
files_to_download = set(filenames_to_check) - set(existing_files)
# NOTE(review): an 'else:' seems to be missing between the message and the download
# below — confirm whether the download should run only when the directory is empty.
if os.listdir(output_directory):
print("Files already exist in the local folder. Skipping download.")
if files_to_download:
download_files(bucket_number, files_to_download, product_type)
# Decompress all downloaded .csv files (which are actually compressed)
compressed_files = [os.path.join(output_directory, file) for file in os.listdir(output_directory) if file.endswith('.csv')]
for compressed_file in compressed_files:
decompress_file(compressed_file, output_directory)
# List the .csv files again after decompress_file() has run on each of them.
csv_files = [file for file in os.listdir(output_directory) if file.endswith('.csv')]
# Output file name: every key with its '/' removed, concatenated, followed by the
# start timestamp and the bucket number.
keypath = ''
for key_item in key:
keypath += get_last_component(key_item)
output_csv_filename = f"{keypath}_{start_timestamp}_{bucket_number}.csv"
with open(output_csv_filename, 'w', newline='') as csvfile:
csv_writer = csv.writer(csvfile)
# The header starts with 'time'; add_header is cleared on the first file so the
# header logic runs only once.
header = ['time']
add_header = True
for csv_file in csv_files:
file_path = os.path.join(output_directory, csv_file)
extracted_values = extract_values_by_key(file_path, key, exact_match)
# NOTE(review): the statements that extend 'header' and write it out are not visible
# in the loops below.
if add_header:
add_header = False
for values in extracted_values.values():
first_value = values
for first_val in first_value:
if extracted_values:
for first_column, values in extracted_values.items():
# Takes the text after the first ';' of each entry: "True" -> 1, "False" -> 0,
# anything else -> that text with surrounding whitespace stripped.
if booleans_as_numbers:
values = [1 if value.split(';')[1].strip() == "True" else 0 if value.split(';')[1].strip() == "False" else value.split(';')[1].strip() for value in values]
# The first cell of the row is the file name with ".csv" removed.
values_list = []
values_list.append(csv_file.replace(".csv", ""))
# NOTE(review): the rest of this loop (appending to values_list) and the
# csv_writer.writerow(...) call are not visible here.
for i, value in enumerate(values):
if value is None:
value = "No value provided"
print(f"Extracted data saved in '{output_csv_filename}'.")
def parse_keys(input_string):
    """Split a comma-separated key list into individual keys.

    Whitespace around each key is removed. Empty entries (for example from a
    trailing comma) are dropped, because an empty key would match every row
    when keys are compared by substring.

    Args:
        input_string: A single key or several keys separated by commas.

    Returns:
        The list of non-empty keys in their original order.
    """
    stripped = (part.strip() for part in input_string.split(','))
    return [part for part in stripped if part]
def main():
    """Parse the command line and run the download/extract pipeline.

    Prints an error and returns without doing any work when the start
    timestamp is not smaller than the end timestamp.
    """
    parser = argparse.ArgumentParser(description='Download files from S3 using s3cmd and extract specific values from CSV files.')
    parser.add_argument('start_timestamp', type=int, help='The start timestamp for the range (even number)')
    parser.add_argument('end_timestamp', type=int, help='The end timestamp for the range (even number)')
    parser.add_argument('--keys', type=parse_keys, required=True, help='The part to match from each CSV file, can be a single key or a comma-separated list of keys')
    parser.add_argument('--bucket-number', type=int, required=True, help='The number of the bucket to download from')
    parser.add_argument('--sampling_stepsize', type=int, required=False, default=1, help='The number of 2sec intervals, which define the length of the sampling interval in S3 file retrieval')
    parser.add_argument('--booleans_as_numbers', action="store_true", required=False, help='If key used, then booleans are converted to numbers [0/1], if key not used, then booleans maintained as text [False/True]')
    parser.add_argument('--exact_match', action="store_true", required=False, help='If key used, then key has to match exactly "=", else it is enough that key is found "in" text')
    # Let argparse convert and validate the product type so a bad value is
    # rejected before any directory or network work starts.
    parser.add_argument('--product_type', type=int, choices=(0, 1), required=True, help='Use 0 for Salimax and 1 for Salidomo')
    args = parser.parse_args()

    if args.start_timestamp >= args.end_timestamp:
        print("Error: start_timestamp must be smaller than end_timestamp.")
        # Stop here: running the pipeline on an empty range is pointless.
        return

    download_and_process_files(
        args.bucket_number,
        args.start_timestamp,
        args.end_timestamp,
        args.sampling_stepsize,
        args.keys,
        args.booleans_as_numbers,
        args.exact_match,
        args.product_type,
    )
if __name__ == "__main__":