# coding: utf-8
"""Min-max normalize the per-year indicator values of the pilot regions.

The input CSV is expected to have a header row followed by data rows whose
first field is the year, whose last field is the region name and whose
remaining fields are numeric. For every year that has a row for each pilot
region, each numeric column is rescaled to the range [0, 1] using that
year's own minimum and maximum. The result is written with the same header,
grouped by region and then by year.
"""
from csv import reader, writer

import numpy as np

fname = "../public/data/all_data.csv"
outname = "../public/data/normalized_data.csv"

# Number of pilot regions. A year is only usable when it has at least this
# many rows, and exactly this many rows are written per year.
REGION_COUNT = 9


def group_rows_by_year(rows):
    """Split CSV rows into the header row and the data rows of each year.

    Args:
        rows: Iterable of rows (lists of strings). The first non-empty row
            is taken as the header; in every later row the first field is
            the year.

    Returns:
        A ``(headers, rows_by_year)`` tuple. ``rows_by_year`` maps each year
        string to the list of its rows with the year field removed, in
        input order.
    """
    headers = []
    rows_by_year = {}
    for row in rows:
        if not row:
            # Blank lines (e.g. a trailing newline) carry no data and would
            # otherwise fail on row[0].
            continue
        if not headers:
            # Keep the header for the output file and exclude it from the
            # data.
            headers = row
            continue
        rows_by_year.setdefault(row[0], []).append(row[1:])
    return headers, rows_by_year


def normalize_columns(rows):
    """Rescale every column of ``rows`` to [0, 1] (min-max normalization).

    Args:
        rows: Rectangular sequence of rows holding numbers or numeric
            strings.

    Returns:
        A 2-D float ``numpy.ndarray`` of the same shape. A column whose
        values are all equal is mapped to 0.0 instead of NaN.
    """
    values = np.array(rows).astype(float)
    mins = values.min(axis=0)
    spans = values.max(axis=0) - mins
    # A constant column has a zero span; dividing by it would yield NaN
    # (0/0). Dividing by 1 instead leaves the already-zero numerator as 0.0.
    safe_spans = np.where(spans == 0, 1.0, spans)
    return (values - mins) / safe_spans


def build_output_rows(normalized_by_year, regions, region_count=REGION_COUNT):
    """Assemble the output rows, grouped by region and then by year.

    Args:
        normalized_by_year: Mapping of year to a 2-D array with one row per
            region.
        regions: Region names; ``regions[i]`` labels row ``i`` of every
            year's array.
        region_count: How many regions (leading rows of each array) to
            emit.

    Returns:
        A list of rows of the form ``[year, value, ..., region]`` with all
        values rendered as strings.
    """
    out_rows = []
    for index in range(region_count):
        for year, values in normalized_by_year.items():
            cells = values[index].astype(str).tolist()
            out_rows.append([year, *cells, regions[index]])
    return out_rows


def main():
    """Read ``fname``, normalize each complete year and write ``outname``."""
    with open(fname, encoding="utf-8") as fh:
        headers, vars_by_year = group_rows_by_year(reader(fh, delimiter=','))
    print(vars_by_year)

    regions = []
    updated_by_year = {}
    for year, rows in vars_by_year.items():
        # Skip years for which we do not have data for each pilot region.
        if len(rows) < REGION_COUNT:
            continue
        if not regions:
            # The region names are taken from the first complete year only.
            # NOTE(review): this assumes every year lists the regions in the
            # same order - confirm against the input data.
            regions = [row[-1] for row in rows]
        updated_by_year[year] = normalize_columns([row[:-1] for row in rows])

    with open(outname, 'w', encoding="utf-8", newline="") as out:
        cw = writer(out, delimiter=',', quotechar='"')
        cw.writerow(headers)
        cw.writerows(build_output_rows(updated_by_year, regions))


if __name__ == "__main__":
    main()