Spaces:

tejasreereddy
/

MetadataExtractor

Build error

File size: 3,500 Bytes

import gradio as gr
import fitz  # PyMuPDF for PDF text extraction
import json
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import re

# Constants
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
DEVICE = "cpu"  # Change to "cuda" if GPU is enabled in Space

# Load the tokenizer, config and weights once, at import time, so that every
# request handled by the app reuses the same in-memory model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model_config = transformers.AutoConfig.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    config=model_config,
    # Honour DEVICE: previously "auto" was hard-coded here, so editing the
    # DEVICE constant above had no effect on where the model was placed.
    device_map=DEVICE,
    torch_dtype=torch.float32,
    trust_remote_code=True,
)

# Text-generation pipeline shared by all requests. The EOS token id doubles
# as the padding id, and each call may generate up to 1000 new tokens.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    max_new_tokens=1000,
)

def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, joined by newlines.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The concatenated page text, or the sentinel string
        "Error: No extractable text found in PDF." when the extracted text
        is empty or whitespace only.
    """
    # Use the document as a context manager so it is closed (releasing the
    # underlying file) even if extracting a page raises.
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text("text") for page in doc)
    return text if text.strip() else "Error: No extractable text found in PDF."

def build_prompt(text):
    """Build the chat-formatted prompt asking the model for paper metadata.

    Only the first 3000 characters of ``text`` are embedded. The user message
    is stripped of surrounding whitespace before being wrapped in the
    ``<|im_start|>`` / ``<|im_end|>`` chat markers, and the prompt ends with
    an open assistant turn for the model to complete.

    Args:
        text: Raw paper text.

    Returns:
        The full prompt string (system turn, user turn, open assistant turn).
    """
    # JSON skeleton shown to the model as the expected answer shape.
    schema_lines = [
        "{",
        '  "Title": "Paper title",',
        '  "Authors": ["Author 1", "Author 2"],',
        '  "DOI": "DOI if available",',
        '  "Keywords": ["Keyword1", "Keyword2"],',
        '  "Abstract": "Abstract text"',
        "}",
    ]
    excerpt = text[:3000]
    user_lines = [
        "You are an AI that extracts structured metadata from research papers.",
        "",
        "Extract the following fields and return ONLY valid JSON (no extra text, no markdown, no explanations):",
        "",
        *schema_lines,
        "",
        "Here is the paper content:",
        f"{excerpt}",
    ]
    # Stripping also removes trailing whitespace contributed by the excerpt.
    user_message = "\n".join(user_lines).strip()

    segments = [
        "<|im_start|>system\n",
        "You are a helpful assistant that extracts structured metadata from scientific papers.\n",
        "<|im_end|>\n",
        "<|im_start|>user\n",
        user_message + "\n",
        "<|im_end|>\n",
        "<|im_start|>assistant",
    ]
    return "".join(segments)

def extract_json(text):
    """Extract the first JSON object from the assistant part of a model output.

    The search starts at the first "<|im_start|>assistant" marker. Markdown
    code fences are removed, then the first balanced ``{...}`` block is
    located and parsed. Braces that occur inside JSON string values (for
    example in an abstract) are ignored while balancing, so they can neither
    cut the block short nor prevent it from closing.

    Args:
        text: Full generated text, including the assistant marker.

    Returns:
        The parsed JSON object on success, otherwise a dict with a single
        "Error" key describing what went wrong.
    """
    assistant_start = text.find("<|im_start|>assistant")
    if assistant_start == -1:
        return {"Error": "No assistant section found in output"}

    # Drop ``` / ```json fences the model may have wrapped around the JSON.
    assistant_text = re.sub(r"```(?:json)?", "", text[assistant_start:]).strip()

    start = assistant_text.find("{")
    if start == -1:
        return {"Error": "No opening '{' found in assistant section"}

    depth = 0
    in_string = False  # True while the scan is inside a double-quoted string
    escaped = False    # True when the previous in-string char was a backslash
    for i, ch in enumerate(assistant_text[start:], start):
        if in_string:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            continue

        if ch == '"':
            in_string = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                try:
                    return json.loads(assistant_text[start:i + 1])
                except (ValueError, RecursionError) as e:
                    # json.JSONDecodeError is a ValueError; report instead of raising.
                    return {"Error": f"JSON parse failed: {e}"}

    return {"Error": "No complete JSON block found"}

def extract_metadata(paper_text):
    """Run the model on ``paper_text`` and return the metadata it produced.

    Args:
        paper_text: Text of the paper to analyse.

    Returns:
        Whatever ``extract_json`` yields for the first generated sequence:
        the parsed JSON object, or a dict with an "Error" key.
    """
    outputs = generator(
        build_prompt(paper_text),
        max_new_tokens=1000,
        do_sample=False,
        temperature=0,
    )
    # Only the first returned sequence is inspected.
    first_sequence = outputs[0]
    return extract_json(first_sequence["generated_text"])

def process_pdf(pdf_file):
    """Extract metadata from an uploaded PDF (Gradio callback).

    Args:
        pdf_file: The uploaded file as handed over by the Gradio File input.
            Accepted forms are a filesystem path (``str`` or path-like) or an
            object exposing the path as ``.name``. ``None`` means nothing was
            uploaded.

    Returns:
        The extracted metadata dict, or a dict with a single "Error" key when
        no file was provided or the PDF has no extractable text.
    """
    import os  # local import: only needed for the path-like check below

    if pdf_file is None:
        # Submitting the form without a file would otherwise raise
        # AttributeError on ``.name``.
        return {"Error": "No PDF file was uploaded."}

    # A path is passed through unchanged; otherwise fall back to the
    # file-wrapper form whose ``.name`` holds the path.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    extracted_text = extract_text_from_pdf(pdf_path)
    # extract_text_from_pdf signals "no text" with a string starting "Error:".
    if extracted_text.startswith("Error:"):
        return {"Error": "No extractable text found in the PDF."}
    return extract_metadata(extracted_text)

# Web UI: one PDF upload as input, the resulting metadata rendered as JSON.
iface = gr.Interface(
    title="Metadata Extractor",
    description="Upload a research PDF to extract structured metadata fields.",
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs="json",
)

# Start the app when this module is executed.
iface.launch()