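"""Download per-timestamp CSV files from S3 via s3cmd, decompress them
(base64-encoded zip archives containing a data.csv), extract the rows matching
the requested keys, and combine the values into a single time-series CSV."""
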
import os
import csv
import subprocess
import argparse
import matplotlib.pyplot as plt
from collections import defaultdict
import zipfile
import base64
import shutil


def extract_timestamp(filename):
    """Return the leading 10-digit Unix timestamp of a file name, or 0 if it cannot be parsed."""
    timestamp_str = filename[:10]
    try:
        return int(timestamp_str)
    except ValueError:
        return 0
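# Example: extract_timestamp("1700000000.csv") -> 1700000000;
# extract_timestamp("data.csv") -> 0 (non-numeric prefix).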


def extract_values_by_key(csv_file, key, exact_match):
    """Collect the rows of `csv_file` whose first column matches any of the given
    keys, grouped by the last path component of that column.

    Rows are expected in the form 'some/signal/path;value[;...]'.
    """
    matched_values = defaultdict(list)
    with open(csv_file, 'r') as file:
        reader = csv.reader(file)
        for row in reader:
            if not row:
                continue
            columns = row[0].split(';')
            if len(columns) > 1:
                first_column = columns[0].strip()
                path_key = first_column.split('/')[-1]
                for key_item in key:
                    if exact_match:
                        # Compare against the last path component only.
                        if key_item.lower() == row[0].split('/')[-1].split(';')[0].lower():
                            matched_values[path_key].append(row[0])
                    elif key_item.lower() in first_column.lower():
                        matched_values[path_key].append(row[0])
    # Collapse all matches into a single entry keyed by the concatenated path keys.
    final_key = ''.join(matched_values.keys())
    combined_values = []
    for values in matched_values.values():
        combined_values.extend(values)
    return {final_key: combined_values}


def list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize):
    # Files are written every 2 seconds, so step by 2 * sampling_stepsize seconds.
    filenames_in_range = [f"{timestamp:10d}" for timestamp in range(start_timestamp, end_timestamp + 1, 2 * sampling_stepsize)]
    return filenames_in_range
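# Example: list_files_in_range(1700000000, 1700000008, 2)
# -> ['1700000000', '1700000004', '1700000008']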


def download_files(bucket_number, filenames_to_download, product_type):
    """Fetch the given files from the product's S3 bucket via s3cmd."""
    if product_type == 0:
        bucket_hash = "3e5b3069-214a-43ee-8d85-57d72000c19d"
    elif product_type == 1:
        bucket_hash = "c0436b6a-d276-4cd8-9c44-1eae86cf5d0e"
    else:
        raise ValueError("Invalid product type option. Use 0 or 1.")

    output_directory = f"S3cmdData_{bucket_number}"
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
        print(f"Directory '{output_directory}' created.")

    for filename in filenames_to_download:
        stripfilename = filename.strip()
        local_path = os.path.join(output_directory, stripfilename + ".csv")
        if not os.path.exists(local_path):
            s3cmd_command = f"s3cmd get s3://{bucket_number}-{bucket_hash}/{stripfilename}.csv {output_directory}/"
            try:
                subprocess.run(s3cmd_command, shell=True, check=True)
                # Check by the stripped prefix; `filename` may carry format padding.
                downloaded_files = [file for file in os.listdir(output_directory) if file.startswith(stripfilename)]
                if not downloaded_files:
                    print(f"No matching files found for prefix '{stripfilename}'.")
                else:
                    print(f"Files with prefix '{stripfilename}' downloaded successfully.")
            except subprocess.CalledProcessError as e:
                print(f"Error downloading files: {e}")
                continue
        else:
            print(f"File '{stripfilename}.csv' already exists locally. Skipping download.")


def decompress_file(compressed_file, output_directory):
    """Decode a base64-encoded zip archive and extract its data.csv under the original timestamp-based name."""
    base_name = os.path.splitext(os.path.basename(compressed_file))[0]

    with open(compressed_file, 'rb') as file:
        compressed_data = file.read()

    # Decode the base64-encoded content.
    decoded_data = base64.b64decode(compressed_data)

    zip_path = os.path.join(output_directory, 'temp.zip')
    with open(zip_path, 'wb') as zip_file:
        zip_file.write(decoded_data)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(output_directory)

    # Rename the extracted data.csv file to the original timestamp-based name.
    extracted_csv_path = os.path.join(output_directory, 'data.csv')
    if os.path.exists(extracted_csv_path):
        new_csv_path = os.path.join(output_directory, f"{base_name}.csv")
        os.rename(extracted_csv_path, new_csv_path)
        # Report success only when the rename happened; otherwise `new_csv_path`
        # would be unbound here.
        print(f"Decompressed and renamed '{compressed_file}' to '{new_csv_path}'.")

    os.remove(zip_path)
    #os.remove(compressed_file)


def get_last_component(path):
    # Note: despite the name, this removes all slashes so the key path can be
    # embedded in a flat output file name.
    path_without_slashes = path.replace('/', '')
    return path_without_slashes
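# Example (illustrative key path): get_last_component('/Battery/Soc') -> 'BatterySoc'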


def download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, key, booleans_as_numbers, exact_match, product_type):
    output_directory = f"S3cmdData_{bucket_number}"

    # Start from a clean directory so stale files never leak into the output.
    if os.path.exists(output_directory):
        shutil.rmtree(output_directory)
    os.makedirs(output_directory)
    print(f"Directory '{output_directory}' created.")

    filenames_to_check = list_files_in_range(start_timestamp, end_timestamp, sampling_stepsize)
    existing_files = [filename for filename in filenames_to_check if os.path.exists(os.path.join(output_directory, f"{filename}.csv"))]
    files_to_download = set(filenames_to_check) - set(existing_files)
    if os.listdir(output_directory):
        print("Files already exist in the local folder. Skipping download.")
    elif files_to_download:
        download_files(bucket_number, files_to_download, product_type)

    # Decompress all downloaded .csv files (which are actually base64-encoded zips).
    compressed_files = [os.path.join(output_directory, file) for file in os.listdir(output_directory) if file.endswith('.csv')]
    for compressed_file in compressed_files:
        decompress_file(compressed_file, output_directory)

    csv_files = [file for file in os.listdir(output_directory) if file.endswith('.csv')]
    csv_files.sort(key=extract_timestamp)

    keypath = ''
    for key_item in key:
        keypath += get_last_component(key_item)

    output_csv_filename = f"{keypath}_{start_timestamp}_{bucket_number}.csv"
    with open(output_csv_filename, 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        header = ['time']
        add_header = True
        for csv_file in csv_files:
            file_path = os.path.join(output_directory, csv_file)
            extracted_values = extract_values_by_key(file_path, key, exact_match)
            if add_header:
                add_header = False
                # Build the header from the signal paths found in the first file.
                for values in extracted_values.values():
                    for first_val in values:
                        header.append(first_val.split(';')[0].strip())
                    break
                csv_writer.writerow(header)
            if extracted_values:
                for first_column, values in extracted_values.items():
                    # First column is the timestamp taken from the file name.
                    values_list = [csv_file.replace(".csv", "")]
                    for value in values:
                        if value is None:
                            values_list.append("No value provided")
                        else:
                            extracted = value.split(';')[1].strip()
                            if booleans_as_numbers:
                                # Map True/False to 1/0; leave other values as text.
                                extracted = {"True": "1", "False": "0"}.get(extracted, extracted)
                            values_list.append(extracted)
                    csv_writer.writerow(values_list)
    print(f"Extracted data saved in '{output_csv_filename}'.")


def parse_keys(input_string):
    """Split a comma-separated key list into individual, whitespace-stripped keys."""
    return [key.strip() for key in input_string.split(',')]
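# Example (illustrative key names): parse_keys("Soc, ActivePower") -> ['Soc', 'ActivePower']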


def main():
    parser = argparse.ArgumentParser(description='Download files from S3 using s3cmd and extract specific values from CSV files.')
    parser.add_argument('start_timestamp', type=int, help='The start timestamp for the range (even number)')
    parser.add_argument('end_timestamp', type=int, help='The end timestamp for the range (even number)')
    parser.add_argument('--keys', type=parse_keys, required=True, help='The part to match from each CSV file; can be a single key or a comma-separated list of keys')
    parser.add_argument('--bucket-number', type=int, required=True, help='The number of the bucket to download from')
    parser.add_argument('--sampling_stepsize', type=int, required=False, default=1, help='The number of 2-second intervals that defines the sampling interval for S3 file retrieval')
    parser.add_argument('--booleans_as_numbers', action="store_true", required=False, help='If set, booleans are converted to numbers [0/1]; otherwise they are kept as text [False/True]')
    parser.add_argument('--exact_match', action="store_true", required=False, help='If set, the key has to match exactly ("="); otherwise it suffices that the key is found "in" the text')
    parser.add_argument('--product_type', required=True, help='Use 0 for Salimax and 1 for Salidomo')

    args = parser.parse_args()

    start_timestamp = args.start_timestamp
    end_timestamp = args.end_timestamp
    keys = args.keys
    bucket_number = args.bucket_number
    sampling_stepsize = args.sampling_stepsize
    booleans_as_numbers = args.booleans_as_numbers
    exact_match = args.exact_match
    product_type = int(args.product_type)

    if start_timestamp >= end_timestamp:
        print("Error: start_timestamp must be smaller than end_timestamp.")
        return

    download_and_process_files(bucket_number, start_timestamp, end_timestamp, sampling_stepsize, keys, booleans_as_numbers, exact_match, product_type)


if __name__ == "__main__":
    main()
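
# Example invocation (script name, bucket number, and keys are illustrative):
#   python this_script.py 1700000000 1700000600 --keys Soc --bucket-number 7 --product_type 0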