Spaces:
Sleeping
Sleeping
| # add module | |
| import os | |
| import shutil | |
| import sys | |
| from subprocess import call | |
| from grobid_client.grobid_client import GrobidClient | |
# Put the project root on the import path before importing from `core`
# (presumably the `core` package lives under /project -- verify).
module_path = os.path.abspath('/project')
if module_path not in sys.path:
    sys.path.append(module_path)

from core.tei import single_entry

# Working directory; wiped by remove_temp_directory() at the start of each run.
temp_dir = '/project/temp'
# pdffigures2 checkout; used as the cwd of the sbt figure-extraction call.
pdffigures2_home = '/opt/pdffigures2'
# GROBID install location (not referenced by the functions in this file).
grobid_home = '/opt/grobid'
# Config file handed to GrobidClient(config_path=...).
grobid_python_config_pth = '/opt/grobid_client_python/config.json'
def remove_temp_directory():
    """Delete ``temp_dir`` and everything below it; do nothing if it is absent."""
    if not os.path.exists(temp_dir):
        return
    shutil.rmtree(temp_dir)
def grobid_clident():
    """Return a new ``GrobidClient`` configured from ``grobid_python_config_pth``."""
    grobid = GrobidClient(config_path=grobid_python_config_pth)
    return grobid
def _run_pdffigures2(pdf_dir):
    """Run the pdffigures2 batch CLI over *pdf_dir* and return its output directory.

    Output is written below ``<pdffigures2_home>/figure`` (images in
    ``figure/image``, per-PDF JSON in ``figure/json``, statistics in
    ``figure/stat.json``). The exit status of ``sbt`` is not checked.
    """
    fig_prefix = 'figure'
    img_prefix = 'figure/image'
    json_prefix = 'figure/json'

    fig_dir = os.path.join(pdffigures2_home, fig_prefix)
    # A previous run that failed before its output was moved away would leave
    # stale figures here; clear them so they are not mixed into this paper.
    shutil.rmtree(fig_dir, ignore_errors=True)
    for prefix in (fig_prefix, img_prefix, json_prefix):
        os.makedirs(os.path.join(pdffigures2_home, prefix), exist_ok=True)

    # sbt takes the whole "runMain ..." invocation as ONE argument.
    # NOTE(review): a PDF directory containing spaces would presumably be split
    # by sbt's own argument parsing -- confirm before accepting such names.
    sbt_command = ' '.join([
        'runMain org.allenai.pdffigures2.FigureExtractorBatchCli',
        '-e',
        '-q', os.path.abspath(pdf_dir) + '/',
        '-m', './' + img_prefix + '/',
        '-d', './' + json_prefix + '/',
        '-s', './' + fig_prefix + '/stat.json',
    ])
    call(['sbt', '-J-Xmx4G', sbt_command], cwd=pdffigures2_home)
    return fig_dir


def _section_texts(paragraphs, headers):
    """Return one string per header: its paragraphs' ``'string'`` values joined by spaces.

    Each header's ``'start'``/``'end'`` are paragraph indices with ``'end'``
    inclusive; the end is clamped to the number of paragraphs.
    """
    total = len(paragraphs)
    return [
        ' '.join(
            paragraphs[idx]['string']
            for idx in range(head['start'], min(head['end'] + 1, total))
        )
        for head in headers
    ]


def process_pdf(pdf_pth: str, file_name: "str | None" = None):
    """Preprocess one PDF into ``temp_dir``: TEI XML, figures and merged JSON.

    Steps, all below ``<temp_dir>/<name>`` where ``name`` is *file_name*
    without its extension:

    1. wipe ``temp_dir`` and copy the PDF into ``pdf/``;
    2. run GROBID ``processFulltextDocument`` on ``pdf/`` with output to ``xml/``;
    3. run pdffigures2 and move its ``figure`` directory here;
    4. merge XML and figure JSON via ``single_entry`` and write
       ``json/<name>.json`` (title, abstract, text, headers, figures);
    5. write ``<name>.preprocessed_text.json``, a list with one joined text
       per header.

    Args:
        pdf_pth: Path of the PDF to process.
        file_name: File name of the PDF, used to derive the output names.
            Defaults to the basename of *pdf_pth*.
    """
    # json is not among this file's module-level imports, so import it locally.
    import json

    if file_name is None:
        file_name = os.path.basename(pdf_pth)
    # splitext instead of a fixed [:-4] slice: do not assume a 4-char extension.
    # NOTE(review): the XML and figure-JSON paths below assume the tools name
    # their outputs after the copied PDF's stem -- verify against the tools.
    name = os.path.splitext(file_name)[0]

    client = grobid_clident()
    remove_temp_directory()

    paper_dir = os.path.join(temp_dir, name)
    temp_pdf_dir = os.path.join(paper_dir, 'pdf')
    temp_xml_dir = os.path.join(paper_dir, 'xml')
    temp_json_dir = os.path.join(paper_dir, 'json')
    for directory in (temp_pdf_dir, temp_xml_dir, temp_json_dir):
        os.makedirs(directory, exist_ok=True)

    # copy pdf to temp dir
    shutil.copy(pdf_pth, temp_pdf_dir)

    # process to xml
    client.process(
        'processFulltextDocument',
        temp_pdf_dir,
        tei_coordinates=True,
        force=True,
        verbose=True,
        output=temp_xml_dir,
    )
    xml_pth = os.path.join(temp_xml_dir, name + '.tei.xml')

    # extract figures; paper_dir already exists, so the moved directory
    # ends up as <paper_dir>/figure
    fig_dir = _run_pdffigures2(temp_pdf_dir)
    shutil.move(fig_dir, paper_dir)
    figure_json_pth = os.path.join(paper_dir, 'figure/json', name + '.json')

    # merge to single json
    _, title, abstract, text, headers, figures = single_entry(
        '', xml_pth=xml_pth, fig_json_pth=figure_json_pth
    )
    json_data = {
        'title': title,
        'abstract': abstract,
        'text': text,
        'headers': headers,
        'figures': figures,
    }
    json_pth = os.path.join(temp_json_dir, name + '.json')
    with open(json_pth, 'w') as f:
        json.dump(json_data, f, indent=4)

    # per-section plain text, built from the in-memory data instead of
    # re-reading the file that was just written
    preprocessed_pth = os.path.join(paper_dir, name + '.preprocessed_text.json')
    with open(preprocessed_pth, 'w') as f:
        json.dump(_section_texts(text, headers), f, indent=4)
if __name__ == '__main__':
    # process_pdf takes the PDF path and its file name; pass both, since the
    # one-argument call fails against the two-required-argument signature.
    example_pdf = '/project/example/example.pdf'
    process_pdf(example_pdf, os.path.basename(example_pdf))