Spaces:
Build error
Build error
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : run.py
# @Author: nixin
# @Date : 2021/11/26
"""Scrape details for a list of patents, then clean and count the results.

Pipeline (every file lives in ``path_to_data``):

1. Read the patent numbers from the ``patent_number`` column of
   ``results (18).csv``.
2. (Re)create ``edison_patents.csv`` containing only the header row.
3. Run ``single_process_scraper`` for every patent number in a pool of worker
   processes; each worker is handed the path of ``edison_patents.csv``.
4. Pass the scraped table through ``clean_patent`` and ``count_patent`` and
   save the outcome as ``cleaned_count_patents.csv``.

All side effects live behind the ``__main__`` guard: with the ``spawn`` start
method every worker process re-imports this module, so any top-level file
handling would be repeated (and the output file truncated) in each worker.
"""
import csv
import multiprocessing as mp
import os
from functools import partial

import pandas as pd

# NOTE(review): the wildcard import is kept because poolcontext, init,
# single_process_scraper, clean_patent and count_patent are not defined in
# this file and are presumably provided by it -- confirm against functions.py.
from functions import *

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
# ~~~ Parameters for data_patent_details file ~~~ #
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
path_to_data = "/Users/nixin/PycharmProjects/PatentSolver_demonstrator/MCDA/data/"  #### don't forget to change

# Column order of the csv file that stores the scraped patent details.
data_column_order = ['inventor_name',
                     'assignee_name_orig',
                     'assignee_name_current',
                     'pub_date',
                     'priority_date',
                     'grant_date',
                     'filing_date',
                     'forward_cite_no_family',
                     'forward_cite_yes_family',
                     'backward_cite_no_family',
                     'backward_cite_yes_family',
                     'patent',
                     'url',
                     'abstract_text']


def _read_patent_numbers(csv_path: str) -> list:
    """Return the values of the ``patent_number`` column of *csv_path*."""
    df = pd.read_csv(csv_path)
    print(df.columns)
    patent_number = df['patent_number'].tolist()
    print(patent_number)
    return patent_number


def _create_output_csv(csv_path: str, header: list) -> None:
    """Create *csv_path* holding only *header*, replacing any previous file."""
    # Mode 'w' truncates an existing file, so no explicit delete is needed.
    with open(csv_path, 'w', newline='') as file:
        csv.writer(file).writerow(header)


def _scrape_patents(patent_numbers: list, csv_path: str) -> None:
    """Run ``single_process_scraper`` over *patent_numbers* in a process pool."""
    ## Create lock to prevent collisions when processes try to write on same file
    lock = mp.Lock()

    ## Use a pool of workers where the number of processes is equal to
    ## the number of cpus - 1 (but never fewer than one: a pool of zero
    ## processes raises ValueError on single-CPU machines).
    worker_count = max(1, mp.cpu_count() - 1)
    scrape_one = partial(single_process_scraper,
                         path_to_data_file=csv_path,
                         data_column_order=data_column_order)
    with poolcontext(processes=worker_count, initializer=init, initargs=(lock,)) as pool:
        pool.map(scrape_one, patent_number_list(patent_numbers))


def patent_number_list(patent_numbers):
    """Return *patent_numbers* as a list so ``pool.map`` can chunk it."""
    return list(patent_numbers)


def _clean_and_count(raw_csv_path: str, cleaned_csv_path: str) -> None:
    """Clean the scraped table, add the counts and save it to *cleaned_csv_path*."""
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
    # ~~~ clean raw data_patent_details ~~~ #
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
    ## read the Google scraper's results
    table = pd.read_csv(raw_csv_path)
    # clean raw patent results
    results = clean_patent(table)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
    # ~~~ count number ~~~ #
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
    results = count_patent(results)
    print(results.columns)
    results.to_csv(cleaned_csv_path, index=False)


def main() -> None:
    """Run the whole pipeline: read numbers, scrape, clean, count, save."""
    # os.path.join leaves the trailing slash of path_to_data untouched, so the
    # resulting strings are identical to plain concatenation.
    input_csv = os.path.join(path_to_data, 'results (18).csv')
    scraped_csv = os.path.join(path_to_data, 'edison_patents.csv')
    cleaned_csv = os.path.join(path_to_data, 'cleaned_count_patents.csv')

    patent_number = _read_patent_numbers(input_csv)
    _create_output_csv(scraped_csv, data_column_order)
    _scrape_patents(patent_number, scraped_csv)
    _clean_and_count(scraped_csv, cleaned_csv)


########### Run pool process #############
if __name__ == "__main__":
    main()