tejasreereddy committed on
Commit
30ebb61
·
verified ·
1 Parent(s): 37da5bc

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +112 -0
app.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import os
import re

import fitz  # PyMuPDF
import gradio as gr
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
8
+
9
# Constants
MODEL_NAME = "google/gemma-2-2b-it"
DEVICE = "cpu"
# Access token read from the HF_TOKEN environment variable; None when unset.
hf_token = os.environ.get("HF_TOKEN")

# Load tokenizer, config and model once at import time so every request
# handled by the app reuses the same in-memory pipeline.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    token=hf_token,
    trust_remote_code=True,
)
# Pass the same explicit token as the other from_pretrained calls instead of
# the deprecated `use_auth_token=True`, which ignores `hf_token`.
model_config = transformers.AutoConfig.from_pretrained(MODEL_NAME, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    config=model_config,
    device_map="auto",  # or "cpu" if no GPU is available
    token=hf_token,
    torch_dtype=torch.float32,
    trust_remote_code=True,
)
# The EOS token doubles as the padding token for generation.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)
32
+
33
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, joined by newlines.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

    Returns:
        The extracted text, or the sentinel string
        ``"Error: No extractable text found in PDF."`` when the pages yield
        nothing but whitespace.
    """
    # Open as a context manager so the document handle is released even if
    # text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text("text") for page in doc)
    return text if text.strip() else "Error: No extractable text found in PDF."
37
+
38
def build_prompt(text, max_chars=2000):
    """Build the single-turn chat prompt asking the model for paper metadata.

    Args:
        text: Full text of the paper.
        max_chars: Maximum number of leading characters of ``text`` to embed
            in the prompt (default 2000). ``None`` embeds the whole text.

    Returns:
        The prompt string: a user turn holding the instructions, the expected
        JSON structure and the paper excerpt, followed by an opened model
        turn for the model to complete.
    """
    # The prompt is assembled from explicit lines so its content does not
    # depend on how this function happens to be indented in the source file.
    instruction_lines = [
        "You are an AI that extracts structured metadata from research papers.",
        "",
        "Return ONLY valid JSON with the following structure and no extra text:",
        "",
        "{",
        '"Title": "Paper title",',
        '"Authors": ["Author 1", "Author 2"],',
        '"DOI": "DOI if available",',
        '"Keywords": ["Keyword1", "Keyword2"],',
        '"Abstract": "Abstract text",',
        '"Document Type": "Research Paper, Thesis, etc.",',
        '"Number of References": 10',
        "}",
        "",
        "Here is the paper content:",
        text[:max_chars],
    ]
    instruction = "\n".join(instruction_lines).strip()
    return (
        "<start_of_turn>user\n"
        + instruction
        + "\n<end_of_turn>\n<start_of_turn>model\n"
    )
62
+
63
+ def extract_json(text):
64
+ assistant_start = text.find("start_of_turn>model")
65
+ if assistant_start == -1:
66
+ return {"Error": "No assistant section found in output"}
67
+
68
+ assistant_text = text[assistant_start:]
69
+ assistant_text = re.sub(r"```(?:json)?|```", "", assistant_text).strip()
70
+
71
+ start = assistant_text.find('{')
72
+ if start == -1:
73
+ return {"Error": "No opening '{' found in assistant section"}
74
+
75
+ brace_count = 0
76
+ for i in range(start, len(assistant_text)):
77
+ if assistant_text[i] == '{':
78
+ brace_count += 1
79
+ elif assistant_text[i] == '}':
80
+ brace_count -= 1
81
+ if brace_count == 0:
82
+ json_str = assistant_text[start:i+1]
83
+ try:
84
+ return json.loads(json_str)
85
+ except Exception as e:
86
+ return {"Error": f"JSON parse failed: {e}"}
87
+
88
+ return {"Error": "No complete JSON block found"}
89
+
90
def extract_metadata(paper_text):
    """Run the language model on a paper's text and return its metadata.

    Args:
        paper_text: Text of the paper; ``build_prompt`` embeds a leading
            excerpt of it in the prompt.

    Returns:
        The result of ``extract_json`` applied to the generated text: the
        decoded metadata, or a dict with an ``"Error"`` key.
    """
    prompt = build_prompt(paper_text)
    # do_sample=False selects greedy decoding. Temperature only applies when
    # sampling, so passing temperature=0 here had no effect beyond triggering
    # a generation-config warning; it is intentionally omitted.
    response = generator(prompt, max_new_tokens=1000, do_sample=False)
    raw_output = response[0]["generated_text"]
    return extract_json(raw_output)
95
+
96
def process_pdf(pdf_file):
    """Gradio handler: extract metadata from an uploaded PDF.

    Args:
        pdf_file: The upload as delivered by the ``gr.File`` input. May be a
            file path string, an object exposing the path as ``.name``, or
            ``None`` when nothing was uploaded.

    Returns:
        The metadata produced by ``extract_metadata``, or a dict with an
        ``"Error"`` key when no file was given or no text could be extracted.
    """
    if pdf_file is None:
        return {"Error": "No PDF file provided."}

    # Accept both upload styles: a temp-file wrapper carrying the path in
    # `.name`, or the path itself (plain strings have no `.name` attribute).
    pdf_path = getattr(pdf_file, "name", pdf_file)

    extracted_text = extract_text_from_pdf(pdf_path)
    # extract_text_from_pdf signals failure with an "Error:"-prefixed string.
    if extracted_text.startswith("Error:"):
        return {"Error": "No extractable text found in the PDF."}
    return extract_metadata(extracted_text)
102
+
103
# Gradio UI: one PDF upload wired to process_pdf, with the result rendered
# as JSON.
iface = gr.Interface(
    title="Metadata Extractor",
    description="Upload a PDF to extract structured metadata such as title, authors, abstract, and more.",
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs="json",
)

# Start serving the interface.
iface.launch()