# Hugging Face Space: research-paper metadata extractor (Gradio app)
# Standard library
import json
import re

# Third-party (provided by the Space runtime)
import torch
import transformers
import fitz  # PyMuPDF for PDF text extraction
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Hugging Face model id used for metadata extraction.
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
# Target device; change to "cuda" if GPU is enabled in Space.
DEVICE = "cpu"
# Load model once at startup so every request reuses the same in-memory
# tokenizer, model, and pipeline instances.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model_config = transformers.AutoConfig.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    config=model_config,
    device_map="auto",  # let accelerate decide weight placement
    torch_dtype=torch.float32,  # full precision; safest default on CPU
    trust_remote_code=True
)
# Shared text-generation pipeline. EOS doubles as the pad token because
# this model ships without a dedicated pad token.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    max_new_tokens=1000,
)
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    On a PDF with no extractable text, returns an "Error: ..." sentinel
    string (process_pdf checks for this prefix) instead of raising.
    """
    # Context manager guarantees the document handle is closed even if a
    # page read raises; the original left the fitz.Document open (leak).
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text("text") for page in doc)
    return text if text.strip() else "Error: No extractable text found in PDF."
def build_prompt(text):
    """Wrap *text* in a ChatML-style prompt asking the model for metadata JSON.

    Only the first 3000 characters of the paper are included so the prompt
    stays inside the model's context budget.
    """
    instruction = f"""
You are an AI that extracts structured metadata from research papers.
Extract the following fields and return ONLY valid JSON (no extra text, no markdown, no explanations):
{{
"Title": "Paper title",
"Authors": ["Author 1", "Author 2"],
"DOI": "DOI if available",
"Keywords": ["Keyword1", "Keyword2"],
"Abstract": "Abstract text"
}}
Here is the paper content:
{text[:3000]}
"""
    # Assemble the ChatML turns; the final assistant tag is left open so
    # the model continues from there.
    segments = [
        "<|im_start|>system",
        "You are a helpful assistant that extracts structured metadata from scientific papers.",
        "<|im_end|>",
        "<|im_start|>user",
        instruction.strip(),
        "<|im_end|>",
        "<|im_start|>assistant",
    ]
    return "\n".join(segments)
def extract_json(text):
    """Pull the first balanced {...} JSON object out of the assistant turn.

    Returns the parsed dict on success, otherwise a {"Error": ...} dict
    describing why extraction failed.
    """
    _, marker, tail = text.partition("<|im_start|>assistant")
    if not marker:
        return {"Error": "No assistant section found in output"}
    # Drop markdown code fences the model may have wrapped around the JSON.
    cleaned = re.sub(r"```(?:json)?|```", "", tail).strip()
    open_idx = cleaned.find('{')
    if open_idx == -1:
        return {"Error": "No opening '{' found in assistant section"}
    depth = 0
    # Brace-count forward from the first '{' until it balances.
    for pos, ch in enumerate(cleaned[open_idx:], start=open_idx):
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                candidate = cleaned[open_idx:pos + 1]
                try:
                    return json.loads(candidate)
                except Exception as e:
                    return {"Error": f"JSON parse failed: {e}"}
    return {"Error": "No complete JSON block found"}
def extract_metadata(paper_text):
    """Run the LLM over *paper_text* and return the parsed metadata dict.

    Any failure to produce valid JSON is reported by extract_json as a
    {"Error": ...} dict rather than raised.
    """
    prompt = build_prompt(paper_text)
    # Greedy decoding: do_sample=False is already deterministic. The
    # original also passed temperature=0, which recent transformers
    # versions reject ("temperature has to be a strictly positive
    # float"), so it is dropped here.
    response = generator(prompt, max_new_tokens=1000, do_sample=False)
    raw_output = response[0]["generated_text"]
    return extract_json(raw_output)
def process_pdf(pdf_file):
    """Gradio handler: uploaded PDF -> metadata dict (rendered as JSON).

    Accepts either a plain filepath string (the gradio >= 4 default for
    gr.File) or a tempfile-like object exposing .name (older gradio);
    the original crashed with AttributeError on the string case.
    """
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    extracted_text = extract_text_from_pdf(pdf_path)
    if extracted_text.startswith("Error:"):
        return {"Error": "No extractable text found in the PDF."}
    return extract_metadata(extracted_text)
# Wire the handler into a one-input / one-output Gradio UI and serve it.
pdf_input = gr.File(label="Upload PDF")

iface = gr.Interface(
    process_pdf,
    inputs=pdf_input,
    outputs="json",
    title="Metadata Extractor",
    description="Upload a research PDF to extract structured metadata fields.",
)

iface.launch()