tejasreereddy committed on
Commit
483a56f
·
verified ·
1 Parent(s): 1de0109

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -0
app.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import fitz # PyMuPDF for PDF text extraction
3
+ import json
4
+ import torch
5
+ import transformers
6
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
7
+ import re
8
+
9
# Constants
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
DEVICE = "cpu"  # Change to "cuda" if GPU is enabled in Space

# Load the tokenizer, config and weights once at import time so that every
# request reuses the same in-memory model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model_config = transformers.AutoConfig.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    config=model_config,
    # Place the whole model on DEVICE so that editing the constant above
    # actually takes effect (a hard-coded "auto" would ignore it).
    device_map=DEVICE,
    torch_dtype=torch.float32,
    trust_remote_code=True,
)

# Text-generation pipeline shared by all requests; the generation settings
# given here act as defaults for each call.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    eos_token_id=tokenizer.eos_token_id,
    # The EOS id is reused as the padding id.
    pad_token_id=tokenizer.eos_token_id,
    max_new_tokens=1000,
)
31
+
32
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, joined by newlines.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        The concatenated page text, or the sentinel string
        ``"Error: No extractable text found in PDF."`` when the extracted
        text is empty or whitespace-only.
    """
    # The context manager closes the document once the text has been read.
    # The join must finish inside the block, while the pages are still usable.
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text("text") for page in doc)
    return text if text.strip() else "Error: No extractable text found in PDF."
36
+
37
def build_prompt(text, max_chars=3000):
    """Build the ChatML prompt that asks the model for paper metadata as JSON.

    Args:
        text: Raw text of the paper.
        max_chars: Number of leading characters of ``text`` embedded in the
            prompt. Defaults to 3000.

    Returns:
        One prompt string made of a system turn, a user turn carrying the
        instructions plus the paper excerpt, and an opened assistant turn.
    """
    request_lines = (
        "You are an AI that extracts structured metadata from research papers.",
        "Extract the following fields and return ONLY valid JSON "
        "(no extra text, no markdown, no explanations):",
        "{",
        '"Title": "Paper title",',
        '"Authors": ["Author 1", "Author 2"],',
        '"DOI": "DOI if available",',
        '"Keywords": ["Keyword1", "Keyword2"],',
        '"Abstract": "Abstract text"',
        "}",
        "Here is the paper content:",
        text[:max_chars],
    )
    # strip() removes trailing whitespace of the excerpt so the turn ends cleanly.
    instruction = "\n".join(request_lines).strip()

    # ChatML layout: each turn is "<|im_start|>role\n" + content + "<|im_end|>\n",
    # and the generation prompt is "<|im_start|>assistant\n" (with the newline).
    return (
        "<|im_start|>system\n"
        "You are a helpful assistant that extracts structured metadata from scientific papers."
        "<|im_end|>\n"
        "<|im_start|>user\n"
        f"{instruction}"
        "<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
60
+
61
def extract_json(text):
    """Parse the first JSON object that follows the assistant marker.

    The search starts at the first ``<|im_start|>assistant`` in ``text``.
    Markdown code fences are removed, then the first balanced ``{...}`` span
    is located and decoded. Braces inside JSON string values are ignored
    while balancing, so values containing ``{`` or ``}`` do not cut the
    object short.

    Args:
        text: Full model output, including the assistant marker.

    Returns:
        The decoded JSON object, or a dict with a single ``"Error"`` key when
        the marker is missing, no ``{`` follows it, the braces never balance,
        or the balanced span is not valid JSON.
    """
    assistant_start = text.find("<|im_start|>assistant")
    if assistant_start == -1:
        return {"Error": "No assistant section found in output"}

    # Drop code fences (with or without a "json" tag) the model may have added.
    assistant_text = re.sub(r"```(?:json)?", "", text[assistant_start:]).strip()

    start = assistant_text.find("{")
    if start == -1:
        return {"Error": "No opening '{' found in assistant section"}

    depth = 0
    in_string = False
    escaped = False
    for i in range(start, len(assistant_text)):
        ch = assistant_text[i]
        if in_string:
            # Inside a string only an unescaped quote matters; braces are data.
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            continue
        if ch == '"':
            in_string = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                try:
                    return json.loads(assistant_text[start:i + 1])
                except (ValueError, RecursionError) as e:
                    return {"Error": f"JSON parse failed: {e}"}

    return {"Error": "No complete JSON block found"}
87
+
88
def extract_metadata(paper_text):
    """Run the model on the paper text and return the parsed metadata.

    Args:
        paper_text: Raw text of the paper; forwarded to ``build_prompt``.

    Returns:
        Whatever ``extract_json`` returns for the generated text: the decoded
        metadata dict, or a dict with an ``"Error"`` key.
    """
    prompt = build_prompt(paper_text)
    # Greedy decoding (do_sample=False) is already deterministic. Temperature
    # is a sampling-only setting, so it is not passed here.
    response = generator(prompt, max_new_tokens=1000, do_sample=False)
    raw_output = response[0]["generated_text"]
    return extract_json(raw_output)
93
+
94
def process_pdf(pdf_file):
    """Extract metadata from an uploaded PDF; entry point for the Gradio UI.

    Args:
        pdf_file: Value supplied by the file input: an object whose ``name``
            attribute holds the file path, a plain path string, or ``None``
            when nothing was uploaded.

    Returns:
        The result of ``extract_metadata`` for the PDF text, or a dict with a
        single ``"Error"`` key when no file was given or no text was found.
    """
    if pdf_file is None:
        return {"Error": "No file uploaded."}

    # Accept both file-like wrappers (path in .name) and plain path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    extracted_text = extract_text_from_pdf(pdf_path)
    if extracted_text.startswith("Error:"):
        return {"Error": "No extractable text found in the PDF."}
    return extract_metadata(extracted_text)
100
+
101
# Gradio interface: one PDF upload as input, the result of process_pdf shown as JSON.
iface = gr.Interface(
    process_pdf,
    gr.File(label="Upload PDF"),
    "json",
    title="Metadata Extractor",
    description="Upload a research PDF to extract structured metadata fields.",
)

# Start the web app.
iface.launch()