tejasreereddy committed on
Commit
fc590c9
·
verified ·
1 Parent(s): 3917bbb

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -0
app.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import fitz # PyMuPDF for PDF text extraction
3
+ import json
4
+ import torch
5
+ import transformers
6
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
7
+ import re
8
+
9
# Constants
# Hugging Face Hub identifier of the checkpoint loaded below.
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
# Change to "cuda" if GPU is enabled in Space.
# NOTE(review): DEVICE is not referenced by the code visible here; placement
# is decided by device_map="auto" below -- confirm whether it is still needed.
DEVICE = "cpu"

# Load model once, at import time, so every request reuses the same objects.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model_config = transformers.AutoConfig.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    config=model_config,
    torch_dtype=torch.float32,
    device_map="auto",
    trust_remote_code=True,
)

# Text-generation pipeline shared by all requests. The EOS token doubles as
# the padding token, and generation is capped at 1000 new tokens by default.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=1000,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)
31
+
32
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, joined by newlines.

    Args:
        pdf_path: Filesystem path of the PDF, opened with PyMuPDF (``fitz``).

    Returns:
        The concatenated page text, or the sentinel string
        ``"Error: No extractable text found in PDF."`` when the extracted
        text is empty or whitespace only.
    """
    # Use the document as a context manager so it is closed even when text
    # extraction raises part-way through; the original never closed it.
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text("text") for page in doc)
    return text if text.strip() else "Error: No extractable text found in PDF."
36
+
37
def build_prompt(text, max_chars=3000):
    """Build the ChatML prompt asking the model for paper metadata as JSON.

    Args:
        text: Raw text extracted from the paper.
        max_chars: Maximum number of leading characters of ``text`` embedded
            in the prompt. Defaults to 3000.

    Returns:
        A prompt string with a system turn, a user turn containing the
        instructions plus the truncated paper text, and an opened assistant
        turn for the model to complete.
    """
    # Example of the JSON object the model is asked to return. Built from
    # explicit lines so its layout does not depend on source indentation.
    schema_example = "\n".join(
        [
            "{",
            '    "Title": "Paper title",',
            '    "Authors": ["Author 1", "Author 2"],',
            '    "DOI": "DOI if available",',
            '    "Keywords": ["Keyword1", "Keyword2"],',
            '    "Abstract": "Abstract text",',
            '    "Document Type": "Research Paper, Thesis, etc.",',
            '    "Number of References": 10',
            "}",
        ]
    )
    instruction = (
        "You are an AI that extracts structured metadata from research papers.\n"
        "\n"
        "Extract the following fields and return ONLY valid JSON "
        "(no extra text, no markdown, no explanations):\n"
        "\n"
        f"{schema_example}\n"
        "\n"
        "Here is the paper content:\n"
        f"{text[:max_chars]}"
    ).strip()
    return (
        "<|im_start|>system\n"
        "You are a helpful assistant that extracts structured metadata from scientific papers.\n"
        "<|im_end|>\n"
        "<|im_start|>user\n"
        f"{instruction}\n"
        "<|im_end|>\n"
        # ChatML role headers are followed by a newline; ending the prompt
        # with it lets the model start directly with its answer.
        "<|im_start|>assistant\n"
    )
65
+
66
def extract_json(text):
    """Extract the first JSON object from the assistant part of model output.

    The text before the ``<|im_start|>assistant`` marker is ignored, Markdown
    code fences are removed, and the first balanced ``{...}`` block is parsed.
    Braces inside JSON string values (e.g. LaTeX in an abstract) are skipped
    when balancing, so they cannot cut the block short.

    Args:
        text: Full generated text, including the assistant marker.

    Returns:
        The parsed JSON object on success, otherwise a dict with a single
        ``"Error"`` key describing what went wrong.
    """
    marker_pos = text.find("<|im_start|>assistant")
    if marker_pos == -1:
        return {"Error": "No assistant section found in output"}

    reply = re.sub(r"```(?:json)?|```", "", text[marker_pos:]).strip()

    start = reply.find("{")
    if start == -1:
        return {"Error": "No opening '{' found in assistant section"}

    depth = 0
    in_string = False  # currently inside a double-quoted JSON string
    escaped = False  # previous character was a backslash inside a string
    for index, char in enumerate(reply[start:], start):
        if in_string:
            # Inside a string only the closing quote matters; braces and
            # escaped quotes must not affect the nesting depth.
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                in_string = False
            continue
        if char == '"':
            in_string = True
        elif char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if depth == 0:
                candidate = reply[start:index + 1]
                try:
                    return json.loads(candidate)
                except (json.JSONDecodeError, RecursionError) as exc:
                    return {"Error": f"JSON parse failed: {exc}"}

    return {"Error": "No complete JSON block found"}
92
+
93
def extract_metadata(paper_text):
    """Ask the language model for the paper's metadata and parse its reply.

    Args:
        paper_text: Text of the paper, passed to ``build_prompt``.

    Returns:
        Whatever ``extract_json`` returns for the generated text: the parsed
        metadata, or a dict with an ``"Error"`` key.
    """
    generations = generator(
        build_prompt(paper_text),
        max_new_tokens=1000,
        do_sample=False,
        temperature=0,
    )
    # Only the first returned generation is used.
    return extract_json(generations[0]["generated_text"])
98
+
99
def process_pdf(pdf_file):
    """Gradio callback: extract metadata from an uploaded PDF.

    Args:
        pdf_file: The value supplied by the ``gr.File`` input. It may be
            ``None`` (nothing uploaded), a filesystem path string, or a
            file-like object exposing the path as ``.name``.

    Returns:
        The metadata produced by ``extract_metadata``, or a dict with an
        ``"Error"`` key when no file was given or no text could be extracted.
    """
    if pdf_file is None:
        return {"Error": "No file uploaded."}

    # NOTE(review): newer Gradio releases appear to pass a plain path string
    # while older ones pass a temp-file wrapper with `.name` -- both are
    # accepted here; confirm against the Gradio version pinned for the Space.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    extracted_text = extract_text_from_pdf(pdf_path)
    # extract_text_from_pdf signals failure with a string starting "Error:".
    if extracted_text.startswith("Error:"):
        return {"Error": "No extractable text found in the PDF."}
    return extract_metadata(extracted_text)
105
+
106
# Gradio interface: a single PDF upload in, a JSON view of the metadata out.
iface = gr.Interface(
    title="Metadata Extractor",
    description="Upload a research PDF to extract structured metadata fields.",
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs="json",
)

# Start the web app when this file is executed.
iface.launch()