import gradio as gr import fitz # PyMuPDF import json import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline import re import os # Constants MODEL_NAME = "google/gemma-2-2b-it" DEVICE = "cpu" hf_token = os.environ.get("HF_TOKEN") # Load model at global scope tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token, trust_remote_code=True) model_config = transformers.AutoConfig.from_pretrained(MODEL_NAME, use_auth_token=True) model = AutoModelForCausalLM.from_pretrained( MODEL_NAME, config=model_config, device_map="auto", # or "cpu" if no GPU is available token=hf_token, torch_dtype=torch.float32, trust_remote_code=True ) generator = pipeline( "text-generation", model=model, tokenizer=tokenizer, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.eos_token_id ) def extract_text_from_pdf(pdf_path): doc = fitz.open(pdf_path) text = "\n".join(page.get_text("text") for page in doc) return text if text.strip() else "Error: No extractable text found in PDF." def build_prompt(text): instruction = f""" You are an AI that extracts structured metadata from research papers. Return ONLY valid JSON with the following structure and no extra text: {{ "Title": "Paper title", "Authors": ["Author 1", "Author 2"], "DOI": "DOI if available", "Keywords": ["Keyword1", "Keyword2"], "Abstract": "Abstract text" }} Here is the paper content: {text[:2000]} """ return ( "user\n" + instruction.strip() + "\n\nmodel\n" ) def extract_json(text): assistant_start = text.find("start_of_turn>model") if assistant_start == -1: return {"Error": "No assistant section found in output"} assistant_text = text[assistant_start:] assistant_text = re.sub(r"```(?:json)?|```", "", assistant_text).strip() start = assistant_text.find('{') if start == -1: return {"Error": "No opening '{' found in assistant section"} brace_count = 0 for i in range(start, len(assistant_text)): if assistant_text[i] == '{': brace_count += 1 elif assistant_text[i] == '}': brace_count -= 1 if brace_count == 0: json_str = assistant_text[start:i+1] try: return json.loads(json_str) except Exception as e: return {"Error": f"JSON parse failed: {e}"} return {"Error": "No complete JSON block found"} def extract_metadata(paper_text): prompt = build_prompt(paper_text) response = generator(prompt, max_new_tokens=1000, do_sample=False, temperature=0) raw_output = response[0]["generated_text"] return extract_json(raw_output) def process_pdf(pdf_file): extracted_text = extract_text_from_pdf(pdf_file.name) if extracted_text.startswith("Error:"): return {"Error": "No extractable text found in the PDF."} metadata = extract_metadata(extracted_text) return metadata # Gradio Interface iface = gr.Interface( fn=process_pdf, inputs=gr.File(label="Upload PDF"), outputs="json", title="Metadata Extractor", description="Upload a PDF to extract structured metadata such as title, authors, abstract, and more." ) iface.launch()