tejasreereddy's picture
Update app.py
3baa01d verified
import gradio as gr
import fitz # PyMuPDF
import json
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import re
import os
# Constants
MODEL_NAME = "google/gemma-2-2b-it"
# NOTE(review): DEVICE is not referenced anywhere in the visible code; the
# model load below uses device_map="auto" instead. Kept for compatibility.
DEVICE = "cpu"
# Hugging Face access token read from the environment; None when unset.
hf_token = os.environ.get("HF_TOKEN")

# Load model at global scope so the tokenizer, config, model and pipeline are
# built once at import time and reused by every request.
# NOTE(review): trust_remote_code=True permits running code shipped with the
# model repository -- confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    token=hf_token,
    trust_remote_code=True,
)
# Authenticate with the same explicit token as the tokenizer and model loads;
# the older `use_auth_token` keyword is deprecated in favour of `token`.
model_config = transformers.AutoConfig.from_pretrained(MODEL_NAME, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    config=model_config,
    device_map="auto",  # or "cpu" if no GPU is available
    token=hf_token,
    torch_dtype=torch.float32,
    trust_remote_code=True,
)
# Text-generation pipeline; the EOS token id doubles as the padding token id.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page in a PDF, joined by newlines.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The concatenated page text, or the sentinel string
        "Error: No extractable text found in PDF." when the document yields
        only whitespace (or nothing at all).
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text("text") for page in doc)
    return text if text.strip() else "Error: No extractable text found in PDF."
def build_prompt(text):
    """Build the single-turn chat prompt asking the model for paper metadata.

    Only the first 2000 characters of ``text`` are embedded. The instruction
    body is stripped of surrounding whitespace and then wrapped in the
    ``<start_of_turn>`` / ``<end_of_turn>`` markers, ending with an open
    model turn.

    Args:
        text: Raw text of the paper.

    Returns:
        The complete prompt string.
    """
    excerpt = text[:2000]
    # The empty first and last entries reproduce the newlines that frame the
    # instruction body before it is stripped.
    instruction_lines = [
        "",
        "You are an AI that extracts structured metadata from research papers.",
        "Return ONLY valid JSON with the following structure and no extra text:",
        "{",
        '"Title": "Paper title",',
        '"Authors": ["Author 1", "Author 2"],',
        '"DOI": "DOI if available",',
        '"Keywords": ["Keyword1", "Keyword2"],',
        '"Abstract": "Abstract text"',
        "}",
        "Here is the paper content:",
        excerpt,
        "",
    ]
    body = "\n".join(instruction_lines).strip()
    return f"<start_of_turn>user\n{body}\n<end_of_turn>\n<start_of_turn>model\n"
def extract_json(text):
    """Parse the first JSON object that follows the model-turn marker.

    The text before the first ``start_of_turn>model`` marker is ignored, so
    any JSON template that appears earlier is never picked up. Markdown code
    fences are removed from the remaining text, and the first
    brace-balanced ``{...}`` span is handed to ``json.loads``.

    Braces that occur inside JSON string values (for example an abstract
    containing ``}``) are ignored while balancing, so they cannot cut the
    candidate object short.

    Args:
        text: Full generated text, including the prompt.

    Returns:
        The decoded JSON object on success, otherwise a dict with a single
        ``"Error"`` key describing what went wrong.
    """
    assistant_start = text.find("start_of_turn>model")
    if assistant_start == -1:
        return {"Error": "No assistant section found in output"}

    assistant_text = text[assistant_start:]
    assistant_text = re.sub(r"```(?:json)?|```", "", assistant_text).strip()

    start = assistant_text.find('{')
    if start == -1:
        return {"Error": "No opening '{' found in assistant section"}

    depth = 0
    in_string = False  # currently inside a double-quoted JSON string
    escaped = False    # previous character was a backslash inside a string
    for i, ch in enumerate(assistant_text[start:], start):
        if in_string:
            # Inside a string only the closing quote matters; a backslash
            # protects whatever character follows it.
            if escaped:
                escaped = False
            elif ch == '\\':
                escaped = True
            elif ch == '"':
                in_string = False
            continue

        if ch == '"':
            in_string = True
        elif ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                json_str = assistant_text[start:i + 1]
                try:
                    return json.loads(json_str)
                except (ValueError, RecursionError) as e:
                    # json.JSONDecodeError is a ValueError subclass.
                    return {"Error": f"JSON parse failed: {e}"}

    # Ran out of text before the outermost brace was closed.
    return {"Error": "No complete JSON block found"}
def extract_metadata(paper_text):
    """Run the text-generation pipeline on a paper and parse its JSON answer.

    Args:
        paper_text: Text of the paper; it is turned into a prompt by
            ``build_prompt``.

    Returns:
        Whatever ``extract_json`` returns for the first generated sequence:
        the decoded metadata, or a dict with an ``"Error"`` key.
    """
    outputs = generator(
        build_prompt(paper_text),
        max_new_tokens=1000,
        do_sample=False,
        temperature=0,
    )
    # Only the first returned sequence is inspected.
    return extract_json(outputs[0]["generated_text"])
def process_pdf(pdf_file):
    """Extract metadata from an uploaded PDF.

    Args:
        pdf_file: The upload handed over by the UI. May be a filesystem path
            (``str`` or ``os.PathLike``), an object exposing the path as its
            ``name`` attribute, or ``None`` when nothing was uploaded.

    Returns:
        The metadata produced by ``extract_metadata``, or a dict with an
        ``"Error"`` key when no file was supplied or no text could be
        extracted from the PDF.
    """
    if pdf_file is None:
        return {"Error": "No file uploaded."}

    # Accept both a plain path and a file-like wrapper carrying the path in
    # `.name`, so the handler works whichever form the upload arrives in.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    extracted_text = extract_text_from_pdf(pdf_path)
    # extract_text_from_pdf signals failure with a string starting "Error:".
    if extracted_text.startswith("Error:"):
        return {"Error": "No extractable text found in the PDF."}
    return extract_metadata(extracted_text)
# Gradio Interface
# One file-upload input wired to process_pdf; the returned dict is rendered
# by the "json" output component.
iface = gr.Interface(
    title="Metadata Extractor",
    description="Upload a PDF to extract structured metadata such as title, authors, abstract, and more.",
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs="json",
)
# Start the web app as soon as this module is executed.
iface.launch()