File size: 3,360 Bytes
30ebb61
 
 
 
 
 
 
ccd0657
30ebb61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3baa01d
30ebb61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import gradio as gr
import fitz  # PyMuPDF
import json
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import re
import os

# Constants
MODEL_NAME = "google/gemma-2-2b-it"
DEVICE = "cpu"
# Hugging Face access token (Gemma is a gated model); read from the environment
# so the token is never hard-coded in source.
hf_token = os.environ.get("HF_TOKEN")

# Load model at global scope so the (expensive) download/initialization
# happens once at startup, not per request.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token, trust_remote_code=True)
# FIX: was `use_auth_token=True`, which is deprecated and ignored HF_TOKEN in
# favor of the locally cached login; use the same `token=hf_token` as the
# tokenizer/model loads above and below for consistency.
model_config = transformers.AutoConfig.from_pretrained(MODEL_NAME, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    config=model_config,
    device_map="auto",  # or "cpu" if no GPU is available
    token=hf_token,
    torch_dtype=torch.float32,
    trust_remote_code=True
)
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    eos_token_id=tokenizer.eos_token_id,
    # Gemma has no dedicated pad token; reuse EOS to silence padding warnings.
    pad_token_id=tokenizer.eos_token_id
)

def extract_text_from_pdf(pdf_path):
    """Extract all text from a PDF file.

    Args:
        pdf_path: Filesystem path to the PDF.

    Returns:
        The concatenated text of every page, or an ``"Error: ..."`` string
        when the PDF has no extractable text (e.g. a scanned image PDF).
    """
    # FIX: the original never closed the document (file-handle leak).
    # fitz.Document supports the context-manager protocol, which also
    # releases the handle if get_text() raises.
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text("text") for page in doc)
    return text if text.strip() else "Error: No extractable text found in PDF."

def build_prompt(text):
    """Build a Gemma chat-format prompt asking the model for metadata JSON.

    Args:
        text: Full extracted paper text; only the first 2000 characters are
            included to keep the prompt within the model's context budget.

    Returns:
        A single prompt string wrapped in Gemma ``<start_of_turn>`` markers,
        ending with the model turn open so generation continues from there.
    """
    excerpt = text[:2000]
    task = f"""
You are an AI that extracts structured metadata from research papers.

Return ONLY valid JSON with the following structure and no extra text:

{{
  "Title": "Paper title",
  "Authors": ["Author 1", "Author 2"],
  "DOI": "DOI if available",
  "Keywords": ["Keyword1", "Keyword2"],
  "Abstract": "Abstract text"
}}

Here is the paper content:
{excerpt}
"""
    return f"<start_of_turn>user\n{task.strip()}\n<end_of_turn>\n<start_of_turn>model\n"

def extract_json(text):
    """Locate and parse the first balanced JSON object in the model's turn.

    Args:
        text: Raw generated text, expected to contain a
            ``start_of_turn>model`` marker followed by a JSON object,
            possibly wrapped in markdown code fences.

    Returns:
        The parsed dict on success, otherwise a ``{"Error": ...}`` dict
        describing which stage of extraction failed.
    """
    marker = text.find("start_of_turn>model")
    if marker == -1:
        return {"Error": "No assistant section found in output"}

    # Drop markdown ``` / ```json fences the model sometimes wraps JSON in.
    section = re.sub(r"```(?:json)?|```", "", text[marker:]).strip()

    opening = section.find('{')
    if opening == -1:
        return {"Error": "No opening '{' found in assistant section"}

    # Scan forward, tracking brace depth, until the object closes.
    depth = 0
    for idx, ch in enumerate(section[opening:], start=opening):
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                candidate = section[opening:idx + 1]
                try:
                    return json.loads(candidate)
                except Exception as e:
                    return {"Error": f"JSON parse failed: {e}"}

    return {"Error": "No complete JSON block found"}

def extract_metadata(paper_text):
    """Run the LLM over the paper text and return parsed metadata.

    Args:
        paper_text: Extracted plain text of the paper.

    Returns:
        A dict of metadata fields, or a ``{"Error": ...}`` dict from
        ``extract_json`` if the model output could not be parsed.
    """
    prompt = build_prompt(paper_text)
    # FIX: dropped `temperature=0`. `do_sample=False` already selects greedy
    # (deterministic) decoding; recent transformers releases warn on — or
    # reject — a zero temperature passed alongside it.
    response = generator(prompt, max_new_tokens=1000, do_sample=False)
    raw_output = response[0]["generated_text"]
    return extract_json(raw_output)

def process_pdf(pdf_file):
    """Gradio handler: extract structured metadata from an uploaded PDF.

    Args:
        pdf_file: The value delivered by ``gr.File`` — depending on the
            Gradio version/config this is a tempfile-like object with a
            ``.name`` attribute, a plain filepath string, or ``None`` when
            nothing was uploaded.

    Returns:
        A metadata dict, or a ``{"Error": ...}`` dict on failure.
    """
    # FIX: the original assumed `.name` always exists and crashed with
    # AttributeError on filepath-string inputs (newer Gradio) and on None.
    if pdf_file is None:
        return {"Error": "No file uploaded."}
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    extracted_text = extract_text_from_pdf(pdf_path)
    if extracted_text.startswith("Error:"):
        return {"Error": "No extractable text found in the PDF."}
    return extract_metadata(extracted_text)

# Gradio Interface
# Wires the PDF-upload handler into a simple single-input/single-output UI;
# the handler's dict return is rendered by the "json" output component.
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs="json",
    title="Metadata Extractor",
    description="Upload a PDF to extract structured metadata such as title, authors, abstract, and more."
)

# Starts the local web server (blocking call); share/auth options not used.
iface.launch()