# coding: utf-8
"""normalize_input.py -- min-max normalise the numeric columns of a CSV, per year.

Input layout (taken from the column handling below): the first column is the
year, the last three columns are the labels MODEL (pilot region), DOMAIN and
SCENARIO, and every column in between is numeric.  For each year, every
numeric column is rescaled to [0, 1] across that year's rows and the result is
written to ``outname`` under the original header line.
"""
from csv import reader, writer

import numpy as np

fname = "../src/assets/data/all_data.csv"
outname = "../src/assets/data/normalized_data.csv"

# Number of trailing label columns: MODEL (region), DOMAIN, SCENARIO.
LABEL_COLUMNS = 3
# A year with fewer rows than this is considered incomplete and is dropped.
MIN_ROWS_PER_YEAR = 9


def read_rows_by_year(path):
    """Read the CSV at *path* and group its data rows by year.

    Returns a ``(headers, rows_by_year)`` tuple.  ``headers`` is the first
    non-blank row of the file (an empty list if there is none).
    ``rows_by_year`` maps the year string (first column) to the list of that
    year's rows with the year column removed, in file order.
    """
    headers = []
    rows_by_year = {}
    # newline="" is what the csv module documents for files handed to reader().
    with open(path, encoding="utf-8", newline="") as fh:
        for row in reader(fh, delimiter=','):
            if not row:
                # Blank line: nothing to index, skip it instead of crashing.
                continue
            if not headers:
                # Keep the header line for the output; it is not data.
                headers = row
                continue
            rows_by_year.setdefault(row[0], []).append(row[1:])
    return headers, rows_by_year


def drop_incomplete_years(rows_by_year, min_rows=MIN_ROWS_PER_YEAR):
    """Return a copy of *rows_by_year* without years that have < *min_rows* rows.

    NOTE: per the original author this step should not be necessary, because
    years not fully covered by all pilots are already eliminated upstream.
    """
    return {
        year: rows
        for year, rows in rows_by_year.items()
        if len(rows) >= min_rows
    }


def split_values_and_labels(rows):
    """Split each row into its numeric cells and its trailing label cells.

    Returns ``(values, labels)``: two lists parallel to *rows*, where each
    ``labels`` entry is ``[region, domain, scenario]``.
    """
    values = [row[:-LABEL_COLUMNS] for row in rows]
    labels = [row[-LABEL_COLUMNS:] for row in rows]
    return values, labels


def normalize_columns(values):
    """Min-max normalise every column of the 2-D table *values*.

    *values* may hold numbers or numeric strings; it is converted to a float
    array.  Each column is shifted by its minimum and divided by its span
    (max - min), so the result lies in [0, 1].

    NOTE: a column whose values are all equal has span 0; the division then
    yields NaN (NumPy emits a RuntimeWarning).  This is left as-is so the
    produced file does not silently change.
    """
    table = np.asarray(values, dtype=float)
    mins = table.min(axis=0)
    spans = table.max(axis=0) - mins
    return (table - mins) / spans


def write_normalized(path, headers, normalized_by_year, labels_by_year):
    """Write the normalised data to the CSV file at *path*.

    Rows are emitted grouped by row index first and by year second (row 0 of
    every year, then row 1 of every year, ...).  Each output row is the year,
    the normalised values as strings, then that row's own region, domain and
    scenario labels.
    """
    # The number of rows comes from the data; a year that is shorter than the
    # longest one simply contributes nothing for the missing indices.
    row_count = max((len(rows) for rows in labels_by_year.values()), default=0)
    with open(path, 'w', encoding="utf-8", newline="") as out:
        cw = writer(out, delimiter=',', quotechar='"')
        cw.writerow(headers)
        for i in range(row_count):
            for year, table in normalized_by_year.items():
                labels = labels_by_year[year]
                if i >= len(labels):
                    continue
                cells = table[i].astype(str).tolist()
                cw.writerow([year] + cells + labels[i])


def main(in_path=fname, out_path=outname):
    """Normalise *in_path* per year and write the result to *out_path*."""
    headers, rows_by_year = read_rows_by_year(in_path)
    rows_by_year = drop_incomplete_years(rows_by_year)

    normalized_by_year = {}
    labels_by_year = {}
    for year, rows in rows_by_year.items():
        values, labels = split_values_and_labels(rows)
        if not labels_by_year:
            # Show the region names of the first processed year.
            print([label[0] for label in labels])
        normalized_by_year[year] = normalize_columns(values)
        # Labels are kept per year so a row is never tagged with the labels
        # of a different year's row.
        labels_by_year[year] = labels

    write_normalized(out_path, headers, normalized_by_year, labels_by_year)


if __name__ == "__main__":
    main()