normalize_input.py 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  1. # coding: utf-8
from collections import defaultdict
from csv import reader, writer

import numpy as np
  4. fname = "../src/assets/data/all_data.csv"
  5. outname = "../src/assets/data/normalized_data.csv"
  6. vars_by_year = {}
  7. updated_by_year = {}
  8. headers = []
  9. regions = []
  10. domains = []
  11. scenarios = []
  12. with open(fname, encoding="utf-8") as fh:
  13. for row in reader(fh, delimiter=','):
  14. if len(headers) == 0:
  15. # Save the headers for later use
  16. headers = row
  17. # and also skip the headers line from processing
  18. continue
  19. year = row[0]
  20. #print(year)
  21. if year not in vars_by_year:
  22. vars_by_year[year] = []
  23. vars_by_year[year].append(row[1:])
  24. #print(', '.join(row))
  25. #print(vars_by_year)
  26. # This step shall not be necessary as Runar is already eliminating years which are not fully covered by all pilots
  27. for (year, data) in vars_by_year.copy().items():
  28. y_mat = np.matrix(data)
  29. # Remove years for which we do not have data for each pilot region
  30. if (y_mat.shape[0] < 9):
  31. del vars_by_year[year]
  32. continue
  33. #print(year)
  34. #print(y_mat.shape)
  35. # For all the datasets we assume that the higher the value the higher its region's attractiveness
  36. # TODO: the above statement should be made optional via some (positive/negative) effect switch
  37. for (year, data) in vars_by_year.items():
  38. y_mat = np.matrix(data)
  39. # Save all pilot region names (can't be read elsewhere), as well as domain and scenario names
  40. if len(regions) == 0:
  41. # Region name is stored in the "MODEL" column which is 3rd from the end
  42. regions = np.squeeze(y_mat[:, -3]).tolist()[0]
  43. print(regions)
  44. # Domain name is stored in the "DOMAIN" column which is 2nd from the end
  45. domains = np.squeeze(y_mat[:, -2]).tolist()[0]
  46. # Scenario name is stored in the "SCENARIO" column which is 1st from the end
  47. scenarios = np.squeeze(y_mat[:, -1]).tolist()[0]
  48. # Drop last three columns (MODEL, DOMAIN, SCENARIO) and convert the value's datatype
  49. y_mat = y_mat[:, :-3].astype(float)
  50. #print(y_mat.dtype)
  51. # Maximum of each column (each dataset)
  52. maxs = y_mat.max(axis=0)
  53. # Minimum of each column (each dataset)
  54. mins = y_mat.min(axis=0)
  55. # Value range of each column (each dataset)
  56. spans = maxs-mins
  57. # Align the range of each column (dataset) so it starts at 0
  58. y_mat = y_mat - mins
  59. # Align the range of each column (dataset) into range 0 to 1
  60. y_mat = y_mat / spans
  61. updated_by_year[year] = y_mat
  62. out = open(outname, 'w', encoding="utf-8", newline="")
  63. cw = writer(out, delimiter=',', quotechar='"')
  64. cw.writerow(headers)
  65. for i in range(0, 9*10): #FIXME: [Apulia, Apulia, Apulia, ...]
  66. for (year, data) in updated_by_year.items():
  67. row = data[i, :].astype(str).tolist()[0]
  68. row.insert(0, year)
  69. row.append(regions[i])
  70. row.append(domains[i])
  71. row.append(scenarios[i])
  72. #print(row)
  73. cw.writerow(row)
  74. out.close()