import csv
import hashlib
import json


def map_employee(input_row):
    """Map one raw employee row to a JSON record string.

    Args:
        input_row: Mapping of column name to value for a single employee.
            Must contain every column read below, including the derived
            'employer_id' key (a missing column raises KeyError).

    Returns:
        The mapped record serialized with ``json.dumps``.
    """
    json_data = {
        #--required attributes
        'DATA_SOURCE': 'EMPLOYEE',
        'ENTITY_TYPE': 'GENERIC',
        'RECORD_ID': input_row['emp_num'],
        #--attributes used for resolution
        'PRIMARY_NAME_LAST': input_row['last_name'],
        'PRIMARY_NAME_FIRST': input_row['first_name'],
        'PRIMARY_NAME_MIDDLE': input_row['middle_name'],
        'HOME_ADDR_LINE1': input_row['addr1'],
        'HOME_ADDR_CITY': input_row['city'],
        'HOME_ADDR_STATE': input_row['state'],
        'HOME_ADDR_POSTAL_CODE': input_row['postal_code'],
        'HOME_PHONE_NUMBER': input_row['home_phone'],
        'DATE_OF_BIRTH': input_row['dob'],
        'SSN_NUMBER': input_row['ssn'],
    }

    #--classify other id: only mapped when an id number is present
    other_id_number = input_row['other_id_number']
    if other_id_number:
        other_id_type = input_row['other_id_type']
        other_id_country = input_row['other_id_country']
        if other_id_type == 'DL':
            json_data['DRIVERS_LICENSE_NUMBER'] = other_id_number
            #--the "country" column is written to the license state attribute
            json_data['DRIVERS_LICENSE_STATE'] = other_id_country
        elif other_id_type == 'PP':
            json_data['PASSPORT_NUMBER'] = other_id_number
            json_data['PASSPORT_COUNTRY'] = other_id_country
        else:
            json_data['OTHER_ID_TYPE'] = other_id_type.upper()
            json_data['OTHER_ID_NUMBER'] = other_id_number
            json_data['OTHER_ID_COUNTRY'] = other_id_country

    #--a single column may hold several comma separated card numbers
    #--NOTE(review): the item key is spelled 'SHERRIFS_CARD' while the list key
    #--is 'SHERIFFS_CARD_LIST'; both are kept as-is, confirm against the
    #--consumer's configuration before changing either spelling
    if input_row['sherrifs_card']:
        json_data['SHERIFFS_CARD_LIST'] = [
            {"SHERRIFS_CARD": id_number}
            for id_number in input_row['sherrifs_card'].split(',')
        ]

    #--payload attributes
    json_data['job_category'] = input_row['job_category']
    json_data['job_title'] = input_row['job_title']
    json_data['hire_date'] = input_row['hire_date']

    #--relationships
    relationship_list = [
        #--add an anchor so others can point to me
        {
            'REL_ANCHOR_DOMAIN': 'EMPLOYEE_NUM',
            'REL_ANCHOR_KEY': input_row['emp_num'],
        },
        #--point to my employer
        {
            'REL_POINTER_DOMAIN': 'EMPLOYER_ID',
            'REL_POINTER_KEY': input_row['employer_id'],
            'REL_POINTER_ROLE': input_row['job_category'],
        },
    ]

    #--point to my manager
    if input_row['manager_id']:
        relationship_list.append({
            'REL_POINTER_DOMAIN': 'EMPLOYEE_NUM',
            'REL_POINTER_KEY': input_row['manager_id'],
            'REL_POINTER_ROLE': 'MANAGED_BY',
        })

    json_data['RELATIONSHIP_LIST'] = relationship_list

    return json.dumps(json_data)


def map_employer(input_row):
    """Map the employer columns of a raw row to a JSON record string.

    Args:
        input_row: Mapping containing 'employer_id', 'employer_name' and
            'employer_addr' (a missing column raises KeyError).

    Returns:
        The mapped record serialized with ``json.dumps``.
    """
    json_data = {
        #--required
        'DATA_SOURCE': 'EMPLOYER',
        'ENTITY_TYPE': 'GENERIC',
        'RECORD_ID': input_row['employer_id'],
        #--senzing attributes
        'PRIMARY_NAME_ORG': input_row['employer_name'],
        'BUSINESS_ADDR_FULL': input_row['employer_addr'],
        #--relationships
        #--add an anchor so others can point to me
        'REL_ANCHOR_DOMAIN': 'EMPLOYER_ID',
        'REL_ANCHOR_KEY': input_row['employer_id'],
    }

    return json.dumps(json_data)


def main(input_path='us-small-employee-raw.csv',
         employee_output_path='us-small-employee-mapped.json',
         employer_output_path='us-small-employer-mapped.json'):
    """Read the raw employee CSV and write employee and employer JSON lines.

    Every input row produces one employee line. An employer line is written
    only the first time a given employer id is seen.

    Args:
        input_path: CSV file to read (UTF-8, with a header row).
        employee_output_path: File to receive one employee JSON per line.
        employer_output_path: File to receive one employer JSON per line.

    Returns:
        The number of input rows processed.
    """
    employers_mapped = set()  #--to help eliminate duplicate companies
    input_row_count = 0

    #--the with statement closes all three files even if a row fails to map;
    #--newline='' lets the csv module handle line endings inside quoted fields
    with open(input_path, 'r', encoding='utf-8', newline='') as input_file, \
            open(employee_output_path, 'w', encoding='utf-8') as output_file1_employee, \
            open(employer_output_path, 'w', encoding='utf-8') as output_file2_employer:

        for input_row in csv.DictReader(input_file):
            input_row_count += 1

            #--create any new fields needed: the employer id is the md5 hex
            #--digest of the employer name concatenated with its address
            string_to_hash = input_row['employer_name'] + input_row['employer_addr']
            input_row['employer_id'] = hashlib.md5(string_to_hash.encode('utf-8')).hexdigest()

            #--map the employee
            output_file1_employee.write(map_employee(input_row) + '\n')

            #--map the employer
            if input_row['employer_id'] not in employers_mapped:
                output_file2_employer.write(map_employer(input_row) + '\n')
                employers_mapped.add(input_row['employer_id'])

    print('%s rows processed' % input_row_count)
    return input_row_count


if __name__ == '__main__':
    main()