tejasreereddy committed on
Commit
19d9bcd
·
verified ·
1 Parent(s): 62f47d7

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +108 -0
app.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import fitz # PyMuPDF
3
+ import json
4
+ import torch
5
+ import transformers
6
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
7
+ import re
8
+ import os
9
+
10
# Configuration
# Hugging Face Hub identifier of the checkpoint loaded below.
MODEL_NAME = "google/gemma-2-2b-it"
# NOTE(review): DEVICE is not referenced anywhere in the visible code — confirm
# whether it is still needed.
DEVICE = "cpu"
# Hub access token taken from the environment; None when HF_TOKEN is unset.
hf_token = os.getenv("HF_TOKEN")
14
+
15
# Load the tokenizer, config, model and generation pipeline once at import
# time so that every request reuses the same objects.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    token=hf_token,
    trust_remote_code=True,
)
# Pass the token explicitly, like the sibling calls: `use_auth_token` is the
# deprecated spelling of `token`.
model_config = transformers.AutoConfig.from_pretrained(MODEL_NAME, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    config=model_config,
    device_map="auto",  # or "cpu" if no GPU is available
    token=hf_token,
    torch_dtype=torch.float32,
    trust_remote_code=True,
)
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    eos_token_id=tokenizer.eos_token_id,
    # Padding uses the tokenizer's EOS id.
    pad_token_id=tokenizer.eos_token_id,
)
33
+
34
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The concatenated page text, or the sentinel string
        "Error: No extractable text found in PDF." when the document yields
        only whitespace.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text("text") for page in doc)
    return text if text.strip() else "Error: No extractable text found in PDF."
38
+
39
def build_prompt(text, max_chars=2000):
    """Build the chat-formatted prompt asking the model for paper metadata.

    Args:
        text: Full text extracted from the paper.
        max_chars: Number of leading characters of ``text`` to embed in the
            prompt (default 2000, the previously hard-coded limit). ``None``
            embeds the whole text.

    Returns:
        A single string holding a user turn (instructions, JSON template and
        paper excerpt) followed by the opening marker of the model turn.
    """
    # Only the head of the paper is sent, which keeps the prompt bounded.
    excerpt = text[:max_chars]
    instruction = "\n".join(
        (
            "You are an AI that extracts structured metadata from research papers.",
            "Return ONLY valid JSON with the following structure and no extra text:",
            "{",
            '"Title": "Paper title",',
            '"Authors": ["Author 1", "Author 2"],',
            '"DOI": "DOI if available",',
            '"Keywords": ["Keyword1", "Keyword2"],',
            '"Abstract": "Abstract text"',
            "}",
            "Here is the paper content:",
            excerpt,
        )
    )
    return (
        "<start_of_turn>user\n"
        + instruction.strip()
        + "\n<end_of_turn>\n<start_of_turn>model\n"
    )
58
+
59
def extract_json(text):
    """Parse the first JSON object found in the model's part of ``text``.

    The search starts at the first "start_of_turn>model" marker, Markdown
    code fences are removed, and the first brace-balanced ``{...}`` span is
    handed to ``json.loads``.

    Args:
        text: Raw generated text containing the model-turn marker.

    Returns:
        The decoded JSON object on success, otherwise a dict with a single
        "Error" key describing which step failed.
    """
    assistant_start = text.find("start_of_turn>model")
    if assistant_start == -1:
        return {"Error": "No assistant section found in output"}

    # Drop ``` and ```json fence markers that may wrap the JSON.
    assistant_text = re.sub(r"```(?:json)?", "", text[assistant_start:]).strip()

    start = assistant_text.find("{")
    if start == -1:
        return {"Error": "No opening '{' found in assistant section"}

    depth = 0
    in_string = False
    escaped = False
    for i, ch in enumerate(assistant_text[start:], start):
        if in_string:
            # Braces inside a string value (e.g. LaTeX in an abstract) must
            # not affect the nesting depth.
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            continue
        if ch == '"':
            in_string = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                try:
                    return json.loads(assistant_text[start:i + 1])
                except (ValueError, RecursionError) as e:
                    return {"Error": f"JSON parse failed: {e}"}

    return {"Error": "No complete JSON block found"}
85
+
86
def extract_metadata(paper_text):
    """Run the language model on a paper's text and return its metadata.

    Args:
        paper_text: Text extracted from the paper.

    Returns:
        Whatever ``extract_json`` returns for the generated text: the parsed
        metadata dict, or a dict with an "Error" key.
    """
    prompt = build_prompt(paper_text)
    # Greedy decoding. `temperature` only applies when sampling, so it is not
    # passed alongside do_sample=False.
    response = generator(prompt, max_new_tokens=1000, do_sample=False)
    # NOTE(review): extract_json searches for the model-turn marker, so this
    # relies on the pipeline returning the prompt together with the
    # completion — confirm return_full_text is left at its default.
    raw_output = response[0]["generated_text"]
    return extract_json(raw_output)
91
+
92
def process_pdf(pdf_file):
    """Gradio handler: extract metadata from an uploaded PDF.

    Args:
        pdf_file: The value supplied by the file-upload component: an object
            exposing the file path as ``.name``, a plain path string, or
            ``None`` when nothing was uploaded.

    Returns:
        The metadata dict produced by ``extract_metadata``, or a dict with an
        "Error" key when no file was given or the PDF has no extractable text.
    """
    if pdf_file is None:
        return {"Error": "No PDF file was uploaded."}

    # Accept both an object carrying the path in `.name` and a bare path.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    extracted_text = extract_text_from_pdf(pdf_path)
    # extract_text_from_pdf signals "no text" with a string starting "Error:".
    if extracted_text.startswith("Error:"):
        return {"Error": "No extractable text found in the PDF."}
    return extract_metadata(extracted_text)
98
+
99
# Gradio interface: one PDF upload in, the extracted metadata rendered as JSON.
iface = gr.Interface(
    fn=process_pdf,
    # Restrict the picker to PDFs, the only format process_pdf handles.
    inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
    outputs="json",
    title="Metadata Extractor",
    description="Upload a PDF to extract structured metadata such as title, authors, abstract, and more.",
)

# Start the server only when run as a script, so importing this module does
# not block on a running web server.
if __name__ == "__main__":
    iface.launch()