# normalize_input.py (1.6 KB)
  1. # coding: utf-8
  2. from csv import reader, writer
  3. import numpy as np
  4. fname = "../src/assets/data/all_data.csv"
  5. outname = "../src/assets/data/normalized_data.csv"
  6. vars_by_year = {}
  7. updated_by_year = {}
  8. headers = []
  9. regions = []
  10. with open(fname, encoding="utf-8") as fh:
  11. for row in reader(fh, delimiter=','):
  12. if len(headers) == 0:
  13. # Save the headers for later use
  14. headers = row
  15. # and also skip the headers line from processing
  16. continue
  17. year = row[0]
  18. #print(year)
  19. if year not in vars_by_year:
  20. vars_by_year[year] = []
  21. vars_by_year[year].append(row[1:])
  22. #print(', '.join(row))
  23. print(vars_by_year)
  24. for (year, data) in vars_by_year.copy().items():
  25. y_mat = np.matrix(data)
  26. # Remove years for which we do not have data for each pilot region
  27. if (y_mat.shape[0] < 9):
  28. del vars_by_year[year]
  29. continue
  30. #print(year)
  31. #print(y_mat.shape)
  32. for (year, data) in vars_by_year.items():
  33. y_mat = np.matrix(data)
  34. # Save all pilot region names (can't be read elsewhere)
  35. if len(regions) == 0:
  36. regions = np.squeeze(y_mat[:, -1]).tolist()[0]
  37. #print(regions)
  38. y_mat = y_mat[:, :-1].astype(float)
  39. #print(y_mat.dtype)
  40. maxs = y_mat.max(axis=0)
  41. #print(maxs)
  42. mins = y_mat.min(axis=0)
  43. #print(mins)
  44. spans = maxs-mins
  45. #print(spans)
  46. y_mat = y_mat - mins
  47. y_mat = y_mat / spans
  48. #print(y_mat)
  49. updated_by_year[year] = y_mat
  50. out = open(outname, 'w', encoding="utf-8", newline="")
  51. cw = writer(out, delimiter=',', quotechar='"')
  52. cw.writerow(headers)
  53. for i in range(0, 8):
  54. for (year, data) in updated_by_year.items():
  55. row = data[i, :].astype(str).tolist()[0]
  56. row.insert(0, year)
  57. row.append(regions[i])
  58. #print(row)
  59. cw.writerow(row)
  60. out.close()