# (extraction residue removed: a stray "Spaces:" header and two duplicate
#  "Build error" lines preceded the module source in the original capture)
import hashlib
import json
import os
import re
import uuid
from collections import OrderedDict

import Levenshtein
import nltk

from App.bin import constants
from App.bin.ClassifierWithIncr import ClassifyWithIncr_it
from App.bin.FiguresCleaner import FiguresCleaner
from App.bin.ParameterExtractor import ParameterExtractor
from App.bin.SharpClassifier import SharpClassifier
class InformationExtractorClaims(object):
    """Extract claim "concept" sentences from a patent claims section.

    The section is sentence-split with a Punkt tokenizer extended with
    patent-specific abbreviations, filtered through a claims lexicon,
    stripped of claim boilerplate ("wherein", "characterized in that", ...),
    classified, and serialized as one JSON object per retained concept.
    """

    def __init__(self, section, input_folder, file_extension, file_name):
        """Store inputs and build the patent-aware sentence splitter.

        section        -- raw text of the claims section
        input_folder   -- folder the source document came from
        file_extension -- extension pattern, e.g. "*.txt" (leading '*' is
                          stripped when the output path is built)
        file_name      -- publication identifier, e.g. "EP1234567A1"
        """
        self.section = section
        self.input_folder = input_folder
        self.file_extension = file_extension
        self.file_name = file_name
        # Use a context manager so the asset file is closed (the original
        # leaked the handle). Abbreviations such as "fig." must not be
        # treated as sentence boundaries by the Punkt tokenizer.
        with open(constants.ASSETS + "abbreviation_sentence_splitter") as fh:
            patent_abbreviations = fh.read().split()
        sentence_finder = nltk.data.load('tokenizers/punkt/english.pickle')
        sentence_finder._params.abbrev_types.update(patent_abbreviations)
        self.sentence_finder = sentence_finder

    def clean_data(self, sentence):
        """Normalise one sentence and return it.

        Lower-cases, drops empty "( , )" figure-reference leftovers, tidies
        comma spacing, strips a leading claim number, collapses whitespace.
        """
        sentence = str(sentence.lower())
        sentence = re.sub(r'\(\s,?\s?\)', '', sentence)  # "( , )" artefacts
        sentence = re.sub(r'\s+,', ',', sentence)        # "word ," -> "word,"
        sentence = re.sub(r'^\d+', '', sentence)         # leading claim number
        sentence = re.sub(r'\s+', ' ', sentence)         # collapse whitespace
        # A str is never None; return unconditionally (the original guarded
        # with "if sentence is not None", which could never be False).
        return sentence

    def truncate_data(self, sentence):
        """Strip claim boilerplate from a sentence and return it.

        Fix: longer phrases are removed before their prefixes. In the
        original, 'where said' and 'characterized in that' could never
        match because 'where' / 'characterized' had already been
        substituted away.

        NOTE(review): these patterns have no word boundaries, so e.g.
        "whereby" loses its "where" prefix -- confirm whether \\b anchors
        are wanted.
        """
        sentence = str(sentence.lower())
        # Longest phrases first so shorter prefixes do not shadow them.
        sentence = re.sub(r'characterized in that said\s*|characterised in that said?\s*', '', sentence)
        sentence = re.sub(r'characterized in that\s*|characterised in that\s*', '', sentence)
        sentence = re.sub(r'characterized\s*|characterised\s*', '', sentence)
        sentence = re.sub(r'wherein said\s*', '', sentence)
        sentence = re.sub(r'wherein\s*', '', sentence)
        sentence = re.sub(r'where said\s*', '', sentence)
        sentence = re.sub(r'where\s*', '', sentence)
        sentence = re.sub(r'further comprising', 'the system or method comprises', sentence)
        sentence = re.sub(r'.*thereof\s*\,?', '', sentence)  # drop "... thereof," prefixes
        sentence = re.sub(r'^\s+', '', sentence)             # leading whitespace
        sentence = re.sub(r'\s+\.$', '', sentence)           # dangling final " ."
        return sentence

    def selectLines(self, line, lexic):
        """Return the lexicon-matched fragment of *line*, or None.

        *lexic* names an asset file holding one regex alternative per line;
        the alternatives are OR-joined and searched in *line*.

        NOTE(review): m.group(1) assumes every lexicon entry defines a
        capturing group; an entry without one would raise IndexError --
        confirm against the 'claims_indices' asset.
        """
        with open(constants.ASSETS + lexic) as n:
            inclusion_list = n.read().splitlines()
        claims_words = re.compile('|'.join(inclusion_list))
        m = re.search(claims_words, line)
        if m is not None:
            return m.group(1)
        return None

    def main(self):
        """Process the claims section.

        Returns (json_fragment, total_sentences_number) where json_fragment
        is a comma-joined sequence of JSON objects (no surrounding brackets;
        the caller is expected to wrap it) and total_sentences_number is the
        number of candidate concepts found.
        """
        output_result = []
        compt_Id = 50        # NOTE(review): never incremented, so every concept
                             # in one document shares the same id suffix -- confirm.
        count_concept = 3    # keep at most 3 lexicon-matched concepts
        clean_content_list = []
        concept_list = []
        parameters_list = []

        section = self.section
        input_folder = self.input_folder
        file_name = self.file_name
        file_extension = self.file_extension

        output_file_name = input_folder + "/" + file_name + file_extension.strip("*")
        root_img_url = 'https://worldwide.espacenet.com/espacenetImage.jpg?flavour=firstPageClipping&locale=en_EP&FT=D&'
        root_pdf_url = 'https://worldwide.espacenet.com/publicationDetails/originalDocument?'

        # NOTE(review): if file_name were None, urlImg/urlPDF would be
        # undefined in the loop below (NameError) -- confirm file_name is
        # always set by the caller.
        if file_name is not None:
            # Split e.g. "EP1234567 A1" into country code / number / kind code.
            # NOTE(review): group(2) also contains the kind code, so NR below
            # is number+kind -- confirm this matches Espacenet's URL format.
            match = re.search(r'(^[a-zA-Z]+)(([0-9]+)\s?([a-zA-Z0-9_]+$))', file_name)
            CC = match.group(1)                       # country code
            NR = re.sub(r'\s', '', match.group(2))    # number (plus kind code)
            KC = match.group(4)                       # kind code
            urlImg = root_img_url + '&CC=' + CC + '&NR=' + NR + '&KC=' + KC
            urlPDF = (root_pdf_url + 'CC=' + CC + '&NR=' + NR + '&KC=' + KC
                      + '&FT=D&ND=3&date=' + '&DB=&locale=en_EP#')

        # Fresh random document id per run; concept ids derive from it.
        # (The original computed an md5 of file_name and immediately
        # overwrote it with uuid4 -- the dead hash has been removed.)
        graphItemIdValue = str(uuid.uuid4())

        # Sentence-split, then normalise; keep non-empty sentences.
        for sentence in self.sentence_finder.tokenize(section.strip()):
            sentence = self.clean_data(sentence)
            if sentence != '':
                clean_content_list.append(sentence)

        # Keep up to `count_concept` lexicon-matched, boilerplate-stripped lines.
        for line in clean_content_list:
            if re.match(r'^\s*$', line):
                continue
            line = self.selectLines(line, 'claims_indices')
            if line is not None and count_concept > 0:
                line = self.truncate_data(line)
                line = re.sub(r'in that', '', line)
                concept_list.append(line)
                count_concept -= 1
        count_concept = 3

        # The original guarded with "len(concept_list) is not None", which is
        # always true; assign directly.
        total_sentences_number = len(concept_list)

        for concept in concept_list:
            # Skip empty/comma-led fragments and overly long sentences.
            if concept is not None and not re.match(r'^\s,', concept) and len(concept.split()) < 50:
                classifyT = ClassifyWithIncr_it()
                polarite = classifyT.main(concept)
                get_parameters = ParameterExtractor(concept)
                parameters = get_parameters.extract_parameters()
                # NOTE(review): parameters_list accumulates across concepts,
                # so each JSON object carries the parameters of all previous
                # concepts too -- confirm this is intended.
                parameters_list.extend(parameters)
                values = OrderedDict({
                    "concept": {
                        "type": polarite,
                        "id": graphItemIdValue + str(compt_Id),
                        "sentence": concept,
                        "source": output_file_name,
                        "parameters": parameters_list,
                        "image": urlImg,
                        "pdf": urlPDF
                    }
                })
                # sort_keys expects a bool; the original passed the OrderedDict
                # class, which is merely truthy. Same behavior, explicit flag.
                json_string = json.dumps(values, sort_keys=True, indent=4, separators=(',', ': '))
                output_result.append(json_string)

        # Deduplicate identical concept objects before joining.
        output_result = list(set(output_result))
        output_json = ",".join(output_result)
        return output_json, total_sentences_number