jmacura
/
sdm-dih-bridge


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
							# coding: utf-8
from csv import reader, writer
import numpy as np

# Input CSV with the raw values and output CSV for the normalized values
fname, outname = (
	"../src/assets/data/all_data.csv",
	"../src/assets/data/normalized_data.csv",
)

# Raw rows (without the year column) grouped by year, and their normalized counterparts
vars_by_year, updated_by_year = dict(), dict()
# Header row of the input file
headers = list()
# Row labels taken from the MODEL, DOMAIN and SCENARIO columns of the input
regions, domains, scenarios = list(), list(), list()

# Read the input CSV and group its data rows by year (first column).
# newline="" leaves line-ending handling to the csv module, as its
# documentation requires for file objects handed to reader().
with open(fname, encoding="utf-8", newline="") as fh:
	for row in reader(fh, delimiter=','):
		if not row:
			# A blank line yields an empty row which has no year column; skip it
			continue
		if not headers:
			# Save the headers for later use
			headers = row
			# and also skip the headers line from processing
			continue
		# First column is the year; the remaining columns are kept as the data
		vars_by_year.setdefault(row[0], []).append(row[1:])

#print(vars_by_year)

# This step shall not be necessary as Runar is already eliminating years which are not fully covered by all pilots
# Collect the years with fewer than 9 rows, i.e. without data for each pilot region ...
incomplete_years = [
	year
	for (year, rows) in vars_by_year.items()
	if np.matrix(rows).shape[0] < 9
]
# ... and remove them only after the scan, so the dict is never modified while iterated
for year in incomplete_years:
	del vars_by_year[year]

# Min-max normalize every dataset (column) separately within each year.
# For all the datasets we assume that the higher the value the higher its region's attractiveness
# TODO: the above statement should be made optional via some (positive/negative) effect switch
for (year, data) in vars_by_year.items():
	# Save all pilot region names (can't be read elsewhere), as well as domain and scenario names.
	# They are taken from the first processed year only.
	if not regions:
		# Region name is stored in the "MODEL" column which is 3rd from the end
		regions = [row[-3] for row in data]
		print(regions)
		# Domain name is stored in the "DOMAIN" column which is 2nd from the end
		domains = [row[-2] for row in data]
		# Scenario name is stored in the "SCENARIO" column which is 1st from the end
		scenarios = [row[-1] for row in data]
	# Drop last three columns (MODEL, DOMAIN, SCENARIO) and convert the value's datatype.
	# The values stay an np.matrix: the output step indexes rows with matrix semantics.
	y_mat = np.matrix(data)[:, :-3].astype(float)
	# Minimum of each column (each dataset)
	mins = y_mat.min(axis=0)
	# Value range of each column (each dataset)
	spans = y_mat.max(axis=0) - mins
	# A column whose values are all equal has a zero range; dividing by it would
	# produce NaN. Divide by 1 instead so that such a column normalizes to 0.
	spans[spans == 0] = 1.0
	# Shift each column (dataset) so it starts at 0, then scale it into range 0 to 1
	updated_by_year[year] = (y_mat - mins) / spans

# Write the normalized values: the year goes back in front of each row and the
# MODEL, DOMAIN and SCENARIO labels are appended at its end.
# The context manager closes the file even when writing a row fails.
with open(outname, 'w', encoding="utf-8", newline="") as out:
	cw = writer(out, delimiter=',', quotechar='"')
	cw.writerow(headers)
	# Iterate over as many row positions as labels were collected instead of a
	# hard-coded 9*10, so the row count follows the input data.
	# NOTE(review): assumes every year holds its rows in the same order and
	# count as the year the labels were taken from - confirm against the input.
	for (i, labels) in enumerate(zip(regions, domains, scenarios)):
		for (year, data) in updated_by_year.items():
			# data is an np.matrix, so one row converts to a nested list; unwrap it
			values = data[i, :].astype(str).tolist()[0]
			cw.writerow([year, *values, *labels])