Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -22,22 +22,21 @@ logging.basicConfig(
|
|
| 22 |
)
|
| 23 |
logger = logging.getLogger("pdf_processor")
|
| 24 |
|
| 25 |
-
#
|
| 26 |
try:
|
| 27 |
from unstructured.partition.pdf import partition_pdf
|
| 28 |
UNSTRUCTURED_AVAILABLE = True
|
| 29 |
except ImportError:
|
| 30 |
UNSTRUCTURED_AVAILABLE = False
|
| 31 |
-
logger.warning("unstructured.partition.pdf not available; skipping that
|
| 32 |
|
| 33 |
-
# Load API key from
|
| 34 |
API_KEY = os.getenv("GOOGLE_API_KEY")
|
| 35 |
if API_KEY:
|
| 36 |
genai.configure(api_key=API_KEY)
|
| 37 |
else:
|
| 38 |
logger.warning("GOOGLE_API_KEY not set in environment.")
|
| 39 |
|
| 40 |
-
# Globals to store state
|
| 41 |
EXTRACTED_TEXT = ""
|
| 42 |
PDF_SECTIONS = []
|
| 43 |
EXTRACTION_METHOD = ""
|
|
@@ -45,21 +44,25 @@ EXTRACTION_METHOD = ""
|
|
| 45 |
|
| 46 |
# --- Extraction Functions ---
|
| 47 |
def extract_text_with_unstructured(pdf_path):
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
|
| 65 |
def extract_text_with_pypdf(pdf_path):
|
|
@@ -76,7 +79,6 @@ def extract_text_with_pypdf(pdf_path):
|
|
| 76 |
{"title": parts[i].strip(), "content": parts[i + 1].strip()}
|
| 77 |
for i in range(1, len(parts), 2)
|
| 78 |
]
|
| 79 |
-
# fallback single section
|
| 80 |
return [{"title": "Document", "content": full_text}]
|
| 81 |
|
| 82 |
|
|
@@ -100,7 +102,7 @@ def extract_text_with_tika(pdf_path):
|
|
| 100 |
return sections
|
| 101 |
|
| 102 |
|
| 103 |
-
# --- Gemini
|
| 104 |
def generate_greg_brockman_summary(content):
|
| 105 |
model = genai.GenerativeModel("gemini-1.5-pro")
|
| 106 |
prompt = f"""
|
|
@@ -110,14 +112,14 @@ You are an expert document analyst specializing in proposal evaluation.
|
|
| 110 |
1. GOAL: ...
|
| 111 |
... (rest of template) ...
|
| 112 |
|
| 113 |
-
CONTENT
|
| 114 |
{content}
|
| 115 |
"""
|
| 116 |
try:
|
| 117 |
resp = model.generate_content(prompt)
|
| 118 |
return resp.text, None
|
| 119 |
except Exception as e:
|
| 120 |
-
logger.error(f"Summary
|
| 121 |
return None, str(e)
|
| 122 |
|
| 123 |
|
|
@@ -135,11 +137,11 @@ QUESTION: {question}
|
|
| 135 |
resp = model.generate_content(prompt)
|
| 136 |
return resp.text, None
|
| 137 |
except Exception as e:
|
| 138 |
-
logger.error(f"Q&A
|
| 139 |
return None, str(e)
|
| 140 |
|
| 141 |
|
| 142 |
-
# ---
|
| 143 |
def process_pdf(pdf_file, progress=gr.Progress()):
|
| 144 |
global EXTRACTED_TEXT, PDF_SECTIONS, EXTRACTION_METHOD
|
| 145 |
|
|
@@ -148,13 +150,25 @@ def process_pdf(pdf_file, progress=gr.Progress()):
|
|
| 148 |
if pdf_file is None:
|
| 149 |
return None, None, "❌ No file uploaded.", ""
|
| 150 |
|
| 151 |
-
#
|
| 152 |
tmp_dir = tempfile.gettempdir()
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
methods = []
|
| 159 |
if UNSTRUCTURED_AVAILABLE:
|
| 160 |
methods.append(("unstructured", extract_text_with_unstructured))
|
|
@@ -164,6 +178,7 @@ def process_pdf(pdf_file, progress=gr.Progress()):
|
|
| 164 |
]
|
| 165 |
|
| 166 |
sections = None
|
|
|
|
| 167 |
for name, fn in methods:
|
| 168 |
try:
|
| 169 |
secs = fn(path)
|
|
@@ -172,45 +187,37 @@ def process_pdf(pdf_file, progress=gr.Progress()):
|
|
| 172 |
EXTRACTION_METHOD = name
|
| 173 |
break
|
| 174 |
except Exception as e:
|
| 175 |
-
|
|
|
|
|
|
|
| 176 |
if not sections:
|
| 177 |
-
return None, None, "❌ Extraction failed
|
| 178 |
|
| 179 |
-
# Combine &
|
| 180 |
-
combined = ""
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
structure += f"{idx}. {sec['title']}\n"
|
| 184 |
chunk = f"## {sec['title']}\n{sec['content']}\n\n"
|
| 185 |
-
if len(combined
|
| 186 |
-
combined += chunk
|
| 187 |
-
else:
|
| 188 |
-
combined += f"## {sec['title']}\n[Truncated]\n\n"
|
| 189 |
-
structure += " [Content truncated]\n"
|
| 190 |
EXTRACTED_TEXT = combined
|
| 191 |
PDF_SECTIONS = sections
|
| 192 |
|
| 193 |
-
# Generate summary
|
| 194 |
summary, err = generate_greg_brockman_summary(combined)
|
| 195 |
if err:
|
| 196 |
return None, structure, f"❌ {err}", combined
|
| 197 |
|
| 198 |
-
return summary, structure, "✅ PDF processed
|
| 199 |
-
|
| 200 |
|
| 201 |
def ask_question(question):
|
| 202 |
if not API_KEY:
|
| 203 |
return "❌ Set GOOGLE_API_KEY in Secrets."
|
| 204 |
if not EXTRACTED_TEXT:
|
| 205 |
-
return "❌
|
| 206 |
if not question.strip():
|
| 207 |
return "❌ Enter a question."
|
| 208 |
|
| 209 |
-
|
| 210 |
-
if err
|
| 211 |
-
return f"❌ {err}"
|
| 212 |
-
return answer
|
| 213 |
-
|
| 214 |
|
| 215 |
def view_log():
|
| 216 |
try:
|
|
@@ -218,7 +225,6 @@ def view_log():
|
|
| 218 |
except Exception as e:
|
| 219 |
return f"Error reading log: {e}"
|
| 220 |
|
| 221 |
-
|
| 222 |
def save_summary(summary):
|
| 223 |
if not summary:
|
| 224 |
return "❌ No summary to save."
|
|
@@ -227,7 +233,6 @@ def save_summary(summary):
|
|
| 227 |
f.write(summary)
|
| 228 |
return f"✅ Saved to {fn}"
|
| 229 |
|
| 230 |
-
|
| 231 |
def save_qa(question, answer):
|
| 232 |
if not question or not answer:
|
| 233 |
return "❌ Nothing to save."
|
|
@@ -243,28 +248,16 @@ with gr.Blocks(title="PDF Analyzer with Gemini API") as app:
|
|
| 243 |
gr.Markdown("Upload a PDF, get a Greg Brockman style summary, and ask questions.")
|
| 244 |
|
| 245 |
with gr.Tab("Setup"):
|
| 246 |
-
|
| 247 |
-
api_key_input = gr.Textbox(
|
| 248 |
-
label="Google Gemini API Key",
|
| 249 |
-
type="password",
|
| 250 |
-
placeholder="Set in Secrets (GOOGLE_API_KEY)"
|
| 251 |
-
)
|
| 252 |
-
api_button = gr.Button("Configure API")
|
| 253 |
-
api_status = gr.Markdown("⚠️ Using environment GOOGLE_API_KEY")
|
| 254 |
-
api_button.click(
|
| 255 |
-
fn=lambda key: (genai.configure(api_key=key) or "✅ API configured", None),
|
| 256 |
-
inputs=[api_key_input],
|
| 257 |
-
outputs=[api_status, gr.State()]
|
| 258 |
-
)
|
| 259 |
|
| 260 |
with gr.Tab("PDF Processing"):
|
| 261 |
with gr.Row():
|
| 262 |
pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"])
|
| 263 |
proc_btn = gr.Button("Process PDF", variant="primary")
|
| 264 |
-
status = gr.Markdown("Awaiting upload
|
| 265 |
summary_out = gr.Textbox(label="Summary", lines=15)
|
| 266 |
structure_out = gr.Textbox(label="Structure", lines=8)
|
| 267 |
-
log_info
|
| 268 |
proc_btn.click(
|
| 269 |
fn=process_pdf,
|
| 270 |
inputs=[pdf_file],
|
|
@@ -289,5 +282,4 @@ with gr.Blocks(title="PDF Analyzer with Gemini API") as app:
|
|
| 289 |
refresh_btn.click(view_log, inputs=None, outputs=[sys_log])
|
| 290 |
|
| 291 |
if __name__ == "__main__":
|
| 292 |
-
# On Hugging Face Spaces, share=True isn't needed; server_name="0.0.0.0" ensures external access
|
| 293 |
app.launch(server_name="0.0.0.0")
|
|
|
|
| 22 |
)
|
| 23 |
logger = logging.getLogger("pdf_processor")
|
| 24 |
|
| 25 |
+
# Optional dependency: Unstructured.io's PDF partitioner. The flag below
# records whether the import succeeded so callers can skip that method.
try:
    from unstructured.partition.pdf import partition_pdf
except ImportError:
    UNSTRUCTURED_AVAILABLE = False
    logger.warning("unstructured.partition.pdf not available; skipping that method")
else:
    UNSTRUCTURED_AVAILABLE = True
|
| 32 |
|
| 33 |
+
# The Gemini API key comes from the environment (set it in your Space
# Secrets). When it is missing, the client is left unconfigured and a
# warning is logged instead.
API_KEY = os.environ.get("GOOGLE_API_KEY")
if not API_KEY:
    logger.warning("GOOGLE_API_KEY not set in environment.")
else:
    genai.configure(api_key=API_KEY)
|
| 39 |
|
|
|
|
| 40 |
EXTRACTED_TEXT = ""
|
| 41 |
PDF_SECTIONS = []
|
| 42 |
EXTRACTION_METHOD = ""
|
|
|
|
| 44 |
|
| 45 |
# --- Extraction Functions ---
|
| 46 |
# Matches a leading section number such as "3 " or "3. ".
_NUMBERED_HEADING_RE = re.compile(r"^[0-9]+\.?\s+")


def _looks_like_heading(text):
    """Return True when a short text element is formatted like a heading.

    A heading is shorter than 80 characters and is either all upper-case,
    ends with a colon, or starts with a number followed by whitespace.
    """
    if len(text) >= 80:
        return False
    return bool(
        text.isupper()
        or text.endswith(":")
        or _NUMBERED_HEADING_RE.match(text)
    )


def extract_text_with_unstructured(pdf_path):
    """Split a PDF into titled sections using ``partition_pdf``.

    Elements are walked in order. An element that looks like a heading
    starts a new section; any other non-empty element is appended to the
    current section followed by a blank line. Text that appears before the
    first heading is filed under "Introduction". Sections that collected
    no content are dropped, so a heading immediately followed by another
    heading does not appear in the result.

    Args:
        pdf_path: Filesystem path of the PDF, passed to ``partition_pdf``
            as ``filename``.

    Returns:
        A list of ``{"title": str, "content": str}`` dicts, possibly empty.

    Raises:
        Exception: Any error from ``partition_pdf`` or element handling is
            logged with its traceback and re-raised so the caller can fall
            back to another extraction method.
    """
    try:
        logger.info("Extracting via Unstructured.io...")
        elements = partition_pdf(filename=pdf_path, extract_images_in_pdf=False)

        sections = []
        title, paragraphs = "Introduction", []
        for element in elements:
            # Tolerate elements with no ``text`` attribute or a None text
            # instead of letting one odd element abort the whole document.
            text = (getattr(element, "text", None) or "").strip()
            if not text:
                continue
            if _looks_like_heading(text):
                if paragraphs:
                    sections.append({"title": title, "content": "".join(paragraphs)})
                title, paragraphs = text, []
            else:
                # Collect pieces and join once per section rather than
                # growing a string with += on every element.
                paragraphs.append(text + "\n\n")

        if paragraphs:
            sections.append({"title": title, "content": "".join(paragraphs)})
        return sections
    except Exception as e:
        # Bubble up so process_pdf can catch & log
        logger.exception("Unstructured extraction error: %s", e)
        raise
|
| 66 |
|
| 67 |
|
| 68 |
def extract_text_with_pypdf(pdf_path):
|
|
|
|
| 79 |
{"title": parts[i].strip(), "content": parts[i + 1].strip()}
|
| 80 |
for i in range(1, len(parts), 2)
|
| 81 |
]
|
|
|
|
| 82 |
return [{"title": "Document", "content": full_text}]
|
| 83 |
|
| 84 |
|
|
|
|
| 102 |
return sections
|
| 103 |
|
| 104 |
|
| 105 |
+
# --- Gemini calls ---
|
| 106 |
def generate_greg_brockman_summary(content):
|
| 107 |
model = genai.GenerativeModel("gemini-1.5-pro")
|
| 108 |
prompt = f"""
|
|
|
|
| 112 |
1. GOAL: ...
|
| 113 |
... (rest of template) ...
|
| 114 |
|
| 115 |
+
CONTENT:
|
| 116 |
{content}
|
| 117 |
"""
|
| 118 |
try:
|
| 119 |
resp = model.generate_content(prompt)
|
| 120 |
return resp.text, None
|
| 121 |
except Exception as e:
|
| 122 |
+
logger.error(f"Summary error: {e}")
|
| 123 |
return None, str(e)
|
| 124 |
|
| 125 |
|
|
|
|
| 137 |
resp = model.generate_content(prompt)
|
| 138 |
return resp.text, None
|
| 139 |
except Exception as e:
|
| 140 |
+
logger.error(f"Q&A error: {e}")
|
| 141 |
return None, str(e)
|
| 142 |
|
| 143 |
|
| 144 |
+
# --- Handlers ---
|
| 145 |
def process_pdf(pdf_file, progress=gr.Progress()):
|
| 146 |
global EXTRACTED_TEXT, PDF_SECTIONS, EXTRACTION_METHOD
|
| 147 |
|
|
|
|
| 150 |
if pdf_file is None:
|
| 151 |
return None, None, "❌ No file uploaded.", ""
|
| 152 |
|
| 153 |
+
# Determine path & write bytes if needed
|
| 154 |
tmp_dir = tempfile.gettempdir()
|
| 155 |
+
# Case 1: NamedString (in‐memory) with .name & .data
|
| 156 |
+
if hasattr(pdf_file, "name") and hasattr(pdf_file, "data"):
|
| 157 |
+
path = os.path.join(tmp_dir, pdf_file.name)
|
| 158 |
+
with open(path, "wb") as f:
|
| 159 |
+
f.write(pdf_file.data)
|
| 160 |
+
# Case 2: direct filepath (str)
|
| 161 |
+
elif isinstance(pdf_file, str):
|
| 162 |
+
path = pdf_file
|
| 163 |
+
# Case 3: file‐like with .read()
|
| 164 |
+
elif hasattr(pdf_file, "read"):
|
| 165 |
+
path = os.path.join(tmp_dir, getattr(pdf_file, "name", "uploaded.pdf"))
|
| 166 |
+
with open(path, "wb") as f:
|
| 167 |
+
f.write(pdf_file.read())
|
| 168 |
+
else:
|
| 169 |
+
return None, None, "❌ Unrecognized upload type", ""
|
| 170 |
+
|
| 171 |
+
# Try methods in order
|
| 172 |
methods = []
|
| 173 |
if UNSTRUCTURED_AVAILABLE:
|
| 174 |
methods.append(("unstructured", extract_text_with_unstructured))
|
|
|
|
| 178 |
]
|
| 179 |
|
| 180 |
sections = None
|
| 181 |
+
last_err = ""
|
| 182 |
for name, fn in methods:
|
| 183 |
try:
|
| 184 |
secs = fn(path)
|
|
|
|
| 187 |
EXTRACTION_METHOD = name
|
| 188 |
break
|
| 189 |
except Exception as e:
|
| 190 |
+
last_err = f"{name} failed: {e}"
|
| 191 |
+
logger.warning(last_err)
|
| 192 |
+
|
| 193 |
if not sections:
|
| 194 |
+
return None, None, "❌ Extraction failed", last_err
|
| 195 |
|
| 196 |
+
# Combine & summarize
|
| 197 |
+
combined, structure = "", ""
|
| 198 |
+
for i, sec in enumerate(sections, 1):
|
| 199 |
+
structure += f"{i}. {sec['title']}\n"
|
|
|
|
| 200 |
chunk = f"## {sec['title']}\n{sec['content']}\n\n"
|
| 201 |
+
combined += chunk if len(combined + chunk) < 30000 else f"## {sec['title']}\n[Truncated]\n\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
EXTRACTED_TEXT = combined
|
| 203 |
PDF_SECTIONS = sections
|
| 204 |
|
|
|
|
| 205 |
summary, err = generate_greg_brockman_summary(combined)
|
| 206 |
if err:
|
| 207 |
return None, structure, f"❌ {err}", combined
|
| 208 |
|
| 209 |
+
return summary, structure, "✅ PDF processed", f"Used {EXTRACTION_METHOD}"
|
|
|
|
| 210 |
|
| 211 |
def ask_question(question):
    """Answer a user question against the most recently processed PDF.

    Args:
        question: The question text from the UI. ``None`` is treated the
            same as an empty string.

    Returns:
        The model's answer on success, otherwise a message prefixed with
        "❌" that says what is missing or what went wrong.
    """
    if not API_KEY:
        return "❌ Set GOOGLE_API_KEY in Secrets."
    if not EXTRACTED_TEXT:
        return "❌ Process a PDF first."
    # Guard against None as well as blank input (save_qa already checks
    # ``not question``); calling .strip() on None would raise AttributeError.
    if not question or not question.strip():
        return "❌ Enter a question."

    ans, err = answer_question_about_pdf(EXTRACTED_TEXT, question)
    if err:
        return f"❌ {err}"
    return ans
|
|
|
|
|
|
|
|
|
|
| 221 |
|
| 222 |
def view_log():
|
| 223 |
try:
|
|
|
|
| 225 |
except Exception as e:
|
| 226 |
return f"Error reading log: {e}"
|
| 227 |
|
|
|
|
| 228 |
def save_summary(summary):
|
| 229 |
if not summary:
|
| 230 |
return "❌ No summary to save."
|
|
|
|
| 233 |
f.write(summary)
|
| 234 |
return f"✅ Saved to {fn}"
|
| 235 |
|
|
|
|
| 236 |
def save_qa(question, answer):
|
| 237 |
if not question or not answer:
|
| 238 |
return "❌ Nothing to save."
|
|
|
|
| 248 |
gr.Markdown("Upload a PDF, get a Greg Brockman style summary, and ask questions.")
|
| 249 |
|
| 250 |
with gr.Tab("Setup"):
|
| 251 |
+
gr.Markdown("⚠️ Make sure `GOOGLE_API_KEY` is set in your Space's Secrets.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
|
| 253 |
with gr.Tab("PDF Processing"):
|
| 254 |
with gr.Row():
|
| 255 |
pdf_file = gr.File(label="Upload PDF", file_types=[".pdf"])
|
| 256 |
proc_btn = gr.Button("Process PDF", variant="primary")
|
| 257 |
+
status = gr.Markdown("Awaiting upload…")
|
| 258 |
summary_out = gr.Textbox(label="Summary", lines=15)
|
| 259 |
structure_out = gr.Textbox(label="Structure", lines=8)
|
| 260 |
+
log_info = gr.Textbox(label="Internal Log", lines=5)
|
| 261 |
proc_btn.click(
|
| 262 |
fn=process_pdf,
|
| 263 |
inputs=[pdf_file],
|
|
|
|
| 282 |
refresh_btn.click(view_log, inputs=None, outputs=[sys_log])
|
| 283 |
|
| 284 |
if __name__ == "__main__":
|
|
|
|
| 285 |
app.launch(server_name="0.0.0.0")
|