|
|
import gradio as gr |
|
|
import fitz |
|
|
import json |
|
|
import torch |
|
|
import transformers |
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline |
|
|
import re |
|
|
import os |
|
|
|
|
|
|
|
|
# --- Model setup -----------------------------------------------------------
MODEL_NAME = "google/gemma-2-2b-it"
DEVICE = "cpu"  # informational only; actual placement is decided by device_map below
# HF access token for the gated Gemma weights (set in the Space/host env).
hf_token = os.environ.get("HF_TOKEN")

# NOTE: trust_remote_code=True executes code shipped inside the model repo;
# acceptable here only because this is an official Google release.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token, trust_remote_code=True)

# Fix: the original passed use_auth_token=True, which is deprecated and
# ignores the explicit HF_TOKEN read above; pass the same token used for
# the tokenizer and model so all three downloads authenticate consistently.
model_config = transformers.AutoConfig.from_pretrained(MODEL_NAME, token=hf_token)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    config=model_config,
    device_map="auto",
    token=hf_token,
    torch_dtype=torch.float32,  # full precision: CPU has no fast fp16 path
    trust_remote_code=True,
)

# Shared generation pipeline; pad_token falls back to EOS because Gemma
# defines no dedicated pad token.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)
|
|
|
|
|
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Returns an "Error: ..." string (the caller checks via ``startswith``)
    when the document contains no extractable text, e.g. a scanned PDF.
    """
    # Context manager guarantees the document handle is closed; the original
    # leaked the fitz.Document on every call.
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text("text") for page in doc)
    return text if text.strip() else "Error: No extractable text found in PDF."
|
|
|
|
|
def build_prompt(text):
    """Wrap paper text in a Gemma chat-format metadata-extraction prompt.

    Only the first 2000 characters of *text* are embedded, to bound the
    prompt length. Returns the full prompt string, ending with the model
    turn marker so generation continues as the assistant.
    """
    instruction = f"""
You are an AI that extracts structured metadata from research papers.
Return ONLY valid JSON with the following structure and no extra text:
{{
"Title": "Paper title",
"Authors": ["Author 1", "Author 2"],
"DOI": "DOI if available",
"Keywords": ["Keyword1", "Keyword2"],
"Abstract": "Abstract text"
}}
Here is the paper content:
{text[:2000]}
"""
    # Assemble the two chat turns explicitly: the stripped instruction as the
    # user turn, then an open model turn for the generation to fill.
    user_turn = "<start_of_turn>user\n" + instruction.strip()
    model_turn = "\n<end_of_turn>\n<start_of_turn>model\n"
    return user_turn + model_turn
|
|
|
|
|
def extract_json(text):
    """Parse the first JSON object out of a raw model generation.

    *text* is the full pipeline output (prompt + completion). Everything
    before the model turn marker is discarded, Markdown code fences are
    stripped, and the first JSON object is decoded.

    Returns the parsed dict on success, or a ``{"Error": ...}`` dict
    describing the failure.
    """
    # Locate the assistant section; matches both "<start_of_turn>model" and
    # any partially-emitted variant of the tag.
    assistant_start = text.find("start_of_turn>model")
    if assistant_start == -1:
        return {"Error": "No assistant section found in output"}

    assistant_text = text[assistant_start:]
    # Drop Markdown code fences (``` or ```json) the model may wrap around
    # its answer. ```(?:json)? already matches a bare ``` as well.
    assistant_text = re.sub(r"```(?:json)?", "", assistant_text).strip()

    start = assistant_text.find('{')
    if start == -1:
        return {"Error": "No opening '{' found in assistant section"}

    # Fix: the original counted braces character-by-character, which breaks
    # when a JSON string value itself contains an unbalanced '{' or '}'
    # (e.g. in an abstract). raw_decode is string-aware and stops exactly at
    # the end of the first complete JSON value, ignoring trailing text.
    try:
        obj, _ = json.JSONDecoder().raw_decode(assistant_text[start:])
        return obj
    except json.JSONDecodeError as e:
        return {"Error": f"JSON parse failed: {e}"}
|
|
|
|
|
def extract_metadata(paper_text):
    """Run the LLM over *paper_text* and return the parsed metadata dict."""
    prompt = build_prompt(paper_text)
    # Greedy decoding (do_sample=False) keeps the output deterministic.
    result = generator(prompt, max_new_tokens=1000, do_sample=False, temperature=0)
    return extract_json(result[0]["generated_text"])
|
|
|
|
|
def process_pdf(pdf_file):
    """Gradio handler: extract text from the uploaded PDF, then metadata.

    Returns a dict for the JSON output component — either the extracted
    metadata or a ``{"Error": ...}`` dict.
    """
    if pdf_file is None:
        return {"Error": "No file uploaded."}
    # Robustness: depending on the gradio version, gr.File hands the handler
    # either a tempfile-like object (with .name) or a plain filepath string.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    extracted_text = extract_text_from_pdf(pdf_path)
    if extracted_text.startswith("Error:"):
        return {"Error": "No extractable text found in the PDF."}
    return extract_metadata(extracted_text)
|
|
|
|
|
|
|
|
# --- Gradio UI -------------------------------------------------------------
# Single-page interface: upload a PDF, get the extracted metadata rendered
# with gradio's built-in JSON viewer.
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs="json",
    title="Metadata Extractor",
    description="Upload a PDF to extract structured metadata such as title, authors, abstract, and more."
)

# Launches at import time (no __main__ guard) — expected for a hosted Space
# entry point; NOTE(review): add a guard if this module is ever imported.
iface.launch()