# (extraction residue removed: a stray "Spaces:" header and two duplicate
#  "Build error" lines preceded the module source in the original capture)
import hashlib
import json
import os
import re
import uuid
from collections import OrderedDict

import Levenshtein
import nltk

from App.bin import constants
from App.bin.ClassifierWithIncr import ClassifyWithIncr_it
from App.bin.FiguresCleaner import FiguresCleaner
from App.bin.ParameterExtractor import ParameterExtractor
from App.bin.SharpClassifier import SharpClassifier
class InformationExtractorClaims(object):
    """Extract claim "concept" sentences from a patent claims section.

    The section is sentence-split with a Punkt tokenizer extended with
    patent-specific abbreviations, filtered through a claims lexicon,
    stripped of claim boilerplate ("wherein", "characterized in that", ...),
    classified, and serialized as one JSON object per retained concept.
    """

    def __init__(self, section, input_folder, file_extension, file_name):
        """Store inputs and build the patent-aware sentence splitter.

        section        -- raw text of the claims section
        input_folder   -- folder the source document came from
        file_extension -- extension pattern, e.g. "*.txt" (leading '*' is
                          stripped when the output path is built)
        file_name      -- publication identifier, e.g. "EP1234567A1"
        """
        self.section = section
        self.input_folder = input_folder
        self.file_extension = file_extension
        self.file_name = file_name
        # Use a context manager so the asset file is closed (the original
        # leaked the handle). Abbreviations such as "fig." must not be
        # treated as sentence boundaries by the Punkt tokenizer.
        with open(constants.ASSETS + "abbreviation_sentence_splitter") as fh:
            patent_abbreviations = fh.read().split()
        sentence_finder = nltk.data.load('tokenizers/punkt/english.pickle')
        sentence_finder._params.abbrev_types.update(patent_abbreviations)
        self.sentence_finder = sentence_finder

    def clean_data(self, sentence):
        """Normalise one sentence and return it.

        Lower-cases, drops empty "( , )" figure-reference leftovers, tidies
        comma spacing, strips a leading claim number, collapses whitespace.
        """
        sentence = str(sentence.lower())
        sentence = re.sub(r'\(\s,?\s?\)', '', sentence)  # "( , )" artefacts
        sentence = re.sub(r'\s+,', ',', sentence)        # "word ," -> "word,"
        sentence = re.sub(r'^\d+', '', sentence)         # leading claim number
        sentence = re.sub(r'\s+', ' ', sentence)         # collapse whitespace
        # A str is never None; return unconditionally (the original guarded
        # with "if sentence is not None", which could never be False).
        return sentence

    def truncate_data(self, sentence):
        """Strip claim boilerplate from a sentence and return it.

        Fix: longer phrases are removed before their prefixes. In the
        original, 'where said' and 'characterized in that' could never
        match because 'where' / 'characterized' had already been
        substituted away.

        NOTE(review): these patterns have no word boundaries, so e.g.
        "whereby" loses its "where" prefix -- confirm whether \\b anchors
        are wanted.
        """
        sentence = str(sentence.lower())
        # Longest phrases first so shorter prefixes do not shadow them.
        sentence = re.sub(r'characterized in that said\s*|characterised in that said?\s*', '', sentence)
        sentence = re.sub(r'characterized in that\s*|characterised in that\s*', '', sentence)
        sentence = re.sub(r'characterized\s*|characterised\s*', '', sentence)
        sentence = re.sub(r'wherein said\s*', '', sentence)
        sentence = re.sub(r'wherein\s*', '', sentence)
        sentence = re.sub(r'where said\s*', '', sentence)
        sentence = re.sub(r'where\s*', '', sentence)
        sentence = re.sub(r'further comprising', 'the system or method comprises', sentence)
        sentence = re.sub(r'.*thereof\s*\,?', '', sentence)  # drop "... thereof," prefixes
        sentence = re.sub(r'^\s+', '', sentence)             # leading whitespace
        sentence = re.sub(r'\s+\.$', '', sentence)           # dangling final " ."
        return sentence

    def selectLines(self, line, lexic):
        """Return the lexicon-matched fragment of *line*, or None.

        *lexic* names an asset file holding one regex alternative per line;
        the alternatives are OR-joined and searched in *line*.

        NOTE(review): m.group(1) assumes every lexicon entry defines a
        capturing group; an entry without one would raise IndexError --
        confirm against the 'claims_indices' asset.
        """
        with open(constants.ASSETS + lexic) as n:
            inclusion_list = n.read().splitlines()
        claims_words = re.compile('|'.join(inclusion_list))
        m = re.search(claims_words, line)
        if m is not None:
            return m.group(1)
        return None

    def main(self):
        """Process the claims section.

        Returns (json_fragment, total_sentences_number) where json_fragment
        is a comma-joined sequence of JSON objects (no surrounding brackets;
        the caller is expected to wrap it) and total_sentences_number is the
        number of candidate concepts found.
        """
        output_result = []
        compt_Id = 50        # NOTE(review): never incremented, so every concept
                             # in one document shares the same id suffix -- confirm.
        count_concept = 3    # keep at most 3 lexicon-matched concepts
        clean_content_list = []
        concept_list = []
        parameters_list = []

        section = self.section
        input_folder = self.input_folder
        file_name = self.file_name
        file_extension = self.file_extension

        output_file_name = input_folder + "/" + file_name + file_extension.strip("*")
        root_img_url = 'https://worldwide.espacenet.com/espacenetImage.jpg?flavour=firstPageClipping&locale=en_EP&FT=D&'
        root_pdf_url = 'https://worldwide.espacenet.com/publicationDetails/originalDocument?'

        # NOTE(review): if file_name were None, urlImg/urlPDF would be
        # undefined in the loop below (NameError) -- confirm file_name is
        # always set by the caller.
        if file_name is not None:
            # Split e.g. "EP1234567 A1" into country code / number / kind code.
            # NOTE(review): group(2) also contains the kind code, so NR below
            # is number+kind -- confirm this matches Espacenet's URL format.
            match = re.search(r'(^[a-zA-Z]+)(([0-9]+)\s?([a-zA-Z0-9_]+$))', file_name)
            CC = match.group(1)                       # country code
            NR = re.sub(r'\s', '', match.group(2))    # number (plus kind code)
            KC = match.group(4)                       # kind code
            urlImg = root_img_url + '&CC=' + CC + '&NR=' + NR + '&KC=' + KC
            urlPDF = (root_pdf_url + 'CC=' + CC + '&NR=' + NR + '&KC=' + KC
                      + '&FT=D&ND=3&date=' + '&DB=&locale=en_EP#')

        # Fresh random document id per run; concept ids derive from it.
        # (The original computed an md5 of file_name and immediately
        # overwrote it with uuid4 -- the dead hash has been removed.)
        graphItemIdValue = str(uuid.uuid4())

        # Sentence-split, then normalise; keep non-empty sentences.
        for sentence in self.sentence_finder.tokenize(section.strip()):
            sentence = self.clean_data(sentence)
            if sentence != '':
                clean_content_list.append(sentence)

        # Keep up to `count_concept` lexicon-matched, boilerplate-stripped lines.
        for line in clean_content_list:
            if re.match(r'^\s*$', line):
                continue
            line = self.selectLines(line, 'claims_indices')
            if line is not None and count_concept > 0:
                line = self.truncate_data(line)
                line = re.sub(r'in that', '', line)
                concept_list.append(line)
                count_concept -= 1
        count_concept = 3

        # The original guarded with "len(concept_list) is not None", which is
        # always true; assign directly.
        total_sentences_number = len(concept_list)

        for concept in concept_list:
            # Skip empty/comma-led fragments and overly long sentences.
            if concept is not None and not re.match(r'^\s,', concept) and len(concept.split()) < 50:
                classifyT = ClassifyWithIncr_it()
                polarite = classifyT.main(concept)
                get_parameters = ParameterExtractor(concept)
                parameters = get_parameters.extract_parameters()
                # NOTE(review): parameters_list accumulates across concepts,
                # so each JSON object carries the parameters of all previous
                # concepts too -- confirm this is intended.
                parameters_list.extend(parameters)
                values = OrderedDict({
                    "concept": {
                        "type": polarite,
                        "id": graphItemIdValue + str(compt_Id),
                        "sentence": concept,
                        "source": output_file_name,
                        "parameters": parameters_list,
                        "image": urlImg,
                        "pdf": urlPDF
                    }
                })
                # sort_keys expects a bool; the original passed the OrderedDict
                # class, which is merely truthy. Same behavior, explicit flag.
                json_string = json.dumps(values, sort_keys=True, indent=4, separators=(',', ': '))
                output_result.append(json_string)

        # Deduplicate identical concept objects before joining.
        output_result = list(set(output_result))
        output_json = ",".join(output_result)
        return output_json, total_sentences_number