# Hugging Face Space: research-paper metadata extractor (Gradio app)
# Standard library
import json
import re

# Third-party (provided by the Space runtime)
import torch
import transformers
import fitz  # PyMuPDF for PDF text extraction
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Hugging Face model id used for metadata extraction.
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
# Target device; change to "cuda" if GPU is enabled in Space.
DEVICE = "cpu"
# Load model once at startup so every request reuses the same in-memory
# tokenizer, model, and pipeline instances.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model_config = transformers.AutoConfig.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    config=model_config,
    device_map="auto",  # let accelerate decide weight placement
    torch_dtype=torch.float32,  # full precision; safest default on CPU
    trust_remote_code=True
)
# Shared text-generation pipeline. EOS doubles as the pad token because
# this model ships without a dedicated pad token.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    max_new_tokens=1000,
)
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    On a PDF with no extractable text, returns an "Error: ..." sentinel
    string (process_pdf checks for this prefix) instead of raising.
    """
    # Context manager guarantees the document handle is closed even if a
    # page read raises; the original left the fitz.Document open (leak).
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text("text") for page in doc)
    return text if text.strip() else "Error: No extractable text found in PDF."
def build_prompt(text):
    """Wrap *text* in a ChatML-style prompt asking the model for metadata JSON.

    Only the first 3000 characters of the paper are included so the prompt
    stays inside the model's context budget.
    """
    instruction = f"""
You are an AI that extracts structured metadata from research papers.
Extract the following fields and return ONLY valid JSON (no extra text, no markdown, no explanations):
{{
"Title": "Paper title",
"Authors": ["Author 1", "Author 2"],
"DOI": "DOI if available",
"Keywords": ["Keyword1", "Keyword2"],
"Abstract": "Abstract text"
}}
Here is the paper content:
{text[:3000]}
"""
    # Assemble the ChatML turns; the final assistant tag is left open so
    # the model continues from there.
    segments = [
        "<|im_start|>system",
        "You are a helpful assistant that extracts structured metadata from scientific papers.",
        "<|im_end|>",
        "<|im_start|>user",
        instruction.strip(),
        "<|im_end|>",
        "<|im_start|>assistant",
    ]
    return "\n".join(segments)
def extract_json(text):
    """Pull the first balanced {...} JSON object out of the assistant turn.

    Returns the parsed dict on success, otherwise a {"Error": ...} dict
    describing why extraction failed.
    """
    _, marker, tail = text.partition("<|im_start|>assistant")
    if not marker:
        return {"Error": "No assistant section found in output"}
    # Drop markdown code fences the model may have wrapped around the JSON.
    cleaned = re.sub(r"```(?:json)?|```", "", tail).strip()
    open_idx = cleaned.find('{')
    if open_idx == -1:
        return {"Error": "No opening '{' found in assistant section"}
    depth = 0
    # Brace-count forward from the first '{' until it balances.
    for pos, ch in enumerate(cleaned[open_idx:], start=open_idx):
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                candidate = cleaned[open_idx:pos + 1]
                try:
                    return json.loads(candidate)
                except Exception as e:
                    return {"Error": f"JSON parse failed: {e}"}
    return {"Error": "No complete JSON block found"}
def extract_metadata(paper_text):
    """Run the LLM over *paper_text* and return the parsed metadata dict.

    Any failure to produce valid JSON is reported by extract_json as a
    {"Error": ...} dict rather than raised.
    """
    prompt = build_prompt(paper_text)
    # Greedy decoding: do_sample=False is already deterministic. The
    # original also passed temperature=0, which recent transformers
    # versions reject ("temperature has to be a strictly positive
    # float"), so it is dropped here.
    response = generator(prompt, max_new_tokens=1000, do_sample=False)
    raw_output = response[0]["generated_text"]
    return extract_json(raw_output)
def process_pdf(pdf_file):
    """Gradio handler: uploaded PDF -> metadata dict (rendered as JSON).

    Accepts either a plain filepath string (the gradio >= 4 default for
    gr.File) or a tempfile-like object exposing .name (older gradio);
    the original crashed with AttributeError on the string case.
    """
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    extracted_text = extract_text_from_pdf(pdf_path)
    if extracted_text.startswith("Error:"):
        return {"Error": "No extractable text found in the PDF."}
    return extract_metadata(extracted_text)
# Wire the handler into a one-input / one-output Gradio UI and serve it.
pdf_input = gr.File(label="Upload PDF")

iface = gr.Interface(
    process_pdf,
    inputs=pdf_input,
    outputs="json",
    title="Metadata Extractor",
    description="Upload a research PDF to extract structured metadata fields.",
)

iface.launch()