"""Gradio app that extracts structured metadata from a research-paper PDF.

The PDF text is pulled out with PyMuPDF, wrapped in a chat-style prompt, sent
to a locally loaded causal language model, and the first JSON object in the
model's reply is returned to the UI.
"""

import json
import re

import fitz  # PyMuPDF for PDF text extraction
import gradio as gr
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Constants
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
# NOTE: DEVICE is not referenced below; placement is decided by
# device_map="auto" in the from_pretrained call.
DEVICE = "cpu"  # Change to "cuda" if GPU is enabled in Space
MAX_PAPER_CHARS = 3000  # leading characters of the paper placed in the prompt
MAX_NEW_TOKENS = 1000  # generation budget for the JSON answer

ASSISTANT_MARKER = "<|im_start|>assistant"
NO_TEXT_MESSAGE = "Error: No extractable text found in PDF."
# Matches a Markdown code-fence delimiter, with or without a "json" tag.
_CODE_FENCE_RE = re.compile(r"```(?:json)?")

# Load model once
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model_config = transformers.AutoConfig.from_pretrained(
    MODEL_NAME, trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    config=model_config,
    device_map="auto",
    torch_dtype=torch.float32,
    trust_remote_code=True,
)
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    max_new_tokens=MAX_NEW_TOKENS,
)


def _read_pdf_text(pdf_path: str) -> str:
    """Return the text of every page of *pdf_path*, joined by newlines.

    The document is opened in a ``with`` block so its file handle is released
    even if text extraction fails part-way through.
    """
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text("text") for page in doc)


def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the plain text of a PDF.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The concatenated page text, or the sentinel string
        ``"Error: No extractable text found in PDF."`` when the document
        contains only whitespace (or nothing at all).
    """
    text = _read_pdf_text(pdf_path)
    return text if text.strip() else NO_TEXT_MESSAGE


def build_prompt(text: str) -> str:
    """Build the chat-formatted prompt asking the model for paper metadata.

    Only the first ``MAX_PAPER_CHARS`` characters of *text* are included.

    Args:
        text: Raw text of the paper.

    Returns:
        A prompt with system, user and (open) assistant sections delimited by
        ``<|im_start|>`` / ``<|im_end|>`` markers.
    """
    instruction = f"""
You are an AI that extracts structured metadata from research papers.
Extract the following fields and return ONLY valid JSON (no extra text, no markdown, no explanations):

{{
  "Title": "Paper title",
  "Authors": ["Author 1", "Author 2"],
  "DOI": "DOI if available",
  "Keywords": ["Keyword1", "Keyword2"],
  "Abstract": "Abstract text"
}}

Here is the paper content:

{text[:MAX_PAPER_CHARS]}
"""
    return (
        "<|im_start|>system\n"
        "You are a helpful assistant that extracts structured metadata from scientific papers.\n"
        "<|im_end|>\n"
        "<|im_start|>user\n"
        f"{instruction.strip()}\n"
        "<|im_end|>\n"
        "<|im_start|>assistant"
    )


def extract_json(text: str) -> dict:
    """Parse the first JSON object found in the assistant section of *text*.

    Args:
        text: Full generated text, expected to contain the
            ``<|im_start|>assistant`` marker followed by the model's reply.

    Returns:
        The decoded JSON object on success; otherwise a dict with a single
        ``"Error"`` key describing what went wrong.
    """
    # Use the LAST marker: the prompt embeds arbitrary paper text, which could
    # itself contain the marker string ahead of the real assistant section.
    marker_pos = text.rfind(ASSISTANT_MARKER)
    if marker_pos == -1:
        return {"Error": "No assistant section found in output"}

    # Drop Markdown code fences the model may have wrapped around the JSON.
    assistant_text = _CODE_FENCE_RE.sub("", text[marker_pos:]).strip()

    start = assistant_text.find("{")
    if start == -1:
        return {"Error": "No opening '{' found in assistant section"}
    if assistant_text.find("}", start) == -1:
        # Typically the reply was cut off before the object was closed.
        return {"Error": "No complete JSON block found"}

    # raw_decode parses exactly one JSON value starting at `start` and ignores
    # whatever follows it. Unlike counting braces, it is not confused by
    # '{' or '}' characters that appear inside string values.
    try:
        parsed, _end = json.JSONDecoder().raw_decode(assistant_text, start)
    except json.JSONDecodeError as exc:
        return {"Error": f"JSON parse failed: {exc}"}
    return parsed


def extract_metadata(paper_text: str) -> dict:
    """Run the model on *paper_text* and return the metadata it produced.

    Args:
        paper_text: Raw text of the paper.

    Returns:
        The parsed metadata dict, or an ``{"Error": ...}`` dict when the
        model's reply contains no parseable JSON object.
    """
    prompt = build_prompt(paper_text)
    # Greedy decoding (do_sample=False). No temperature is passed because it
    # only applies to sampling-based generation.
    response = generator(prompt, max_new_tokens=MAX_NEW_TOKENS, do_sample=False)
    raw_output = response[0]["generated_text"]
    return extract_json(raw_output)


def process_pdf(pdf_file) -> dict:
    """Gradio handler: turn an uploaded PDF into a metadata dict.

    Args:
        pdf_file: The value supplied by the ``gr.File`` input. Either an
            object exposing the file path as ``.name`` or the path itself;
            ``None`` when nothing was uploaded.

    Returns:
        The extracted metadata, or an ``{"Error": ...}`` dict if no file was
        given, the file could not be read, or it contains no text.
    """
    if pdf_file is None:
        return {"Error": "No PDF file was uploaded."}

    # Accept both an upload wrapper with a .name attribute and a plain path.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    try:
        extracted_text = _read_pdf_text(pdf_path)
    except RuntimeError as exc:
        # PyMuPDF documents its open failures (missing, empty or corrupt
        # file) as RuntimeError subclasses; report them in the UI instead
        # of letting the handler crash.
        return {"Error": f"Could not read PDF: {exc}"}

    # Test the text itself rather than an "Error:" prefix, so a paper whose
    # text genuinely begins with "Error:" is still processed.
    if not extracted_text.strip():
        return {"Error": "No extractable text found in the PDF."}

    return extract_metadata(extracted_text)


# Gradio interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs="json",
    title="Metadata Extractor",
    description="Upload a research PDF to extract structured metadata fields.",
)

iface.launch()