Wckd314 commited on
Commit
f40b3e2
Β·
verified Β·
1 Parent(s): e2dda9a

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +335 -0
app.py ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pundit Feynman β€” Research Paper to Executable Notebook
3
+ FastAPI backend with 3-stage AI pipeline, arXiv support, and SSE streaming.
4
+ """
5
+
6
+ import os
7
+ import re
8
+ import uuid
9
+ import json
10
+ import time
11
+ import shutil
12
+ import asyncio
13
+ from fastapi import FastAPI, UploadFile, File, BackgroundTasks, HTTPException
14
+ from fastapi.responses import FileResponse, StreamingResponse
15
+ from fastapi.staticfiles import StaticFiles
16
+ from dotenv import load_dotenv
17
+
18
+ from utils.pdf_processor import process_pdf_to_base64
19
+ from utils.llm_client import extract_text_from_images, run_full_pipeline_stream, generate_concept_image
20
+ from utils.notebook_builder import build_notebook_from_cells
21
+
22
+ load_dotenv()
23
+
24
+ app = FastAPI(title="Pundit Feynman API", version="2.0")
25
+ os.makedirs("jobs", exist_ok=True)
26
+
27
+ # ── Concurrency limiter β€” max 3 simultaneous generations ──
28
+ _generation_semaphore = asyncio.Semaphore(3)
29
+
30
+
31
+ def _safe_remove(path, retries=3, delay=0.5):
32
+ """Remove a file with retry for Windows file locking."""
33
+ import time
34
+ for i in range(retries):
35
+ try:
36
+ if os.path.exists(path):
37
+ os.remove(path)
38
+ return
39
+ except PermissionError:
40
+ if i < retries - 1:
41
+ time.sleep(delay)
42
+ else:
43
+ print(f" ⚠ Could not delete {path} (file locked)")
44
+
45
+
46
+ # ── Endpoint 1: Extract methodology from PDF upload ──────────────────────
47
+
48
+ @app.post("/api/extract")
49
+ async def extract(file: UploadFile = File(...)):
50
+ if not file.filename.endswith(".pdf"):
51
+ raise HTTPException(status_code=400, detail="Only PDF files are allowed")
52
+
53
+ job_id = str(uuid.uuid4())
54
+ pdf_path = f"jobs/{job_id}.pdf"
55
+
56
+ # Save uploaded PDF
57
+ with open(pdf_path, "wb") as buf:
58
+ shutil.copyfileobj(file.file, buf)
59
+
60
+ try:
61
+ # Phase 1a: PDF β†’ base64 images
62
+ base64_images = process_pdf_to_base64(pdf_path)
63
+
64
+ # Phase 1b: Vision extraction (batched)
65
+ raw_text = extract_text_from_images(base64_images)
66
+
67
+ # Save extracted text for Phase 2
68
+ txt_path = f"jobs/{job_id}.txt"
69
+ with open(txt_path, "w", encoding="utf-8") as f:
70
+ f.write(raw_text)
71
+
72
+ # Clean up PDF
73
+ _safe_remove(pdf_path)
74
+
75
+ return {"job_id": job_id, "status": "extraction_complete", "pages": len(base64_images)}
76
+
77
+ except Exception as e:
78
+ print(f" \u274c Extract error: {e}")
79
+ import traceback
80
+ traceback.print_exc()
81
+ if os.path.exists(pdf_path):
82
+ _safe_remove(pdf_path)
83
+ raise HTTPException(status_code=500, detail=str(e))
84
+
85
+
86
+ # ── Endpoint 1b: Extract from arXiv URL ──────────────────────────────────
87
+
88
+ @app.post("/api/extract-arxiv")
89
+ async def extract_arxiv(payload: dict):
90
+ """Accept an arXiv URL, download the PDF, and run extraction."""
91
+ import httpx
92
+
93
+ arxiv_url = payload.get("url", "").strip()
94
+ if not arxiv_url:
95
+ raise HTTPException(status_code=400, detail="Missing 'url' field")
96
+
97
+ # Extract paper ID from URL
98
+ match = re.search(r'arxiv\.org/(?:abs|pdf)/([0-9]+\.[0-9]+)', arxiv_url)
99
+ if not match:
100
+ raise HTTPException(
101
+ status_code=400,
102
+ detail="Invalid arXiv URL. Expected format: https://arxiv.org/abs/XXXX.XXXXX"
103
+ )
104
+
105
+ paper_id = match.group(1)
106
+ pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
107
+ job_id = str(uuid.uuid4())
108
+ pdf_path = f"jobs/{job_id}.pdf"
109
+
110
+ try:
111
+ # Download PDF from arXiv
112
+ async with httpx.AsyncClient(follow_redirects=True) as http_client:
113
+ print(f" ⬇ Downloading PDF from arXiv: {pdf_url}")
114
+ response = await http_client.get(pdf_url, timeout=30.0)
115
+
116
+ if response.status_code != 200:
117
+ raise HTTPException(
118
+ status_code=500,
119
+ detail=f"Failed to download PDF from arXiv: HTTP {response.status_code}"
120
+ )
121
+
122
+ # Save to disk
123
+ with open(pdf_path, "wb") as f:
124
+ f.write(response.content)
125
+
126
+ size_mb = len(response.content) / (1024 * 1024)
127
+ print(f" βœ… Downloaded: {size_mb:.1f} MB")
128
+
129
+ # Same pipeline as PDF upload
130
+ base64_images = process_pdf_to_base64(pdf_path)
131
+ raw_text = extract_text_from_images(base64_images)
132
+
133
+ txt_path = f"jobs/{job_id}.txt"
134
+ with open(txt_path, "w", encoding="utf-8") as f:
135
+ f.write(raw_text)
136
+
137
+ _safe_remove(pdf_path)
138
+
139
+ return {
140
+ "job_id": job_id,
141
+ "status": "extraction_complete",
142
+ "pages": len(base64_images),
143
+ "arxiv_id": paper_id,
144
+ }
145
+
146
+ except HTTPException:
147
+ raise
148
+ except Exception as e:
149
+ print(f" \u274c ArXiv extract error: {e}")
150
+ import traceback
151
+ traceback.print_exc()
152
+ if os.path.exists(pdf_path):
153
+ _safe_remove(pdf_path)
154
+ raise HTTPException(status_code=500, detail=str(e))
155
+
156
+
157
+ # ── Endpoint 2: Stream code generation (SSE) β€” 3-stage pipeline ──────────
158
+
159
+ @app.get("/api/generate_stream/{job_id}")
160
+ async def generate_stream(job_id: str):
161
+ txt_path = f"jobs/{job_id}.txt"
162
+ if not os.path.exists(txt_path):
163
+ raise HTTPException(status_code=404, detail="Extraction not found. Run /api/extract first.")
164
+
165
+ with open(txt_path, "r", encoding="utf-8") as f:
166
+ raw_text = f.read()
167
+
168
+ print(f"\n{'='*60}")
169
+ print(f" Starting 3-stage pipeline for job: {job_id}")
170
+ print(f" Text length: {len(raw_text)} chars")
171
+ print(f"{'='*60}\n")
172
+
173
+ def event_generator():
174
+ notebook_path = f"jobs/{job_id}.ipynb"
175
+ final_cells = None
176
+ pipeline_success = False
177
+
178
+ try:
179
+ for event_type, data in run_full_pipeline_stream(raw_text):
180
+ if event_type == "text":
181
+ payload = json.dumps({"text": data})
182
+ yield f"data: {payload}\n\n"
183
+
184
+ elif event_type == "cells":
185
+ final_cells = data
186
+ print(f" βœ… Pipeline produced {len(data)} cells")
187
+
188
+ elif event_type == "analysis":
189
+ # Save analysis to disk for the /api/visualize endpoint
190
+ analysis_path = f"jobs/{job_id}_analysis.json"
191
+ try:
192
+ with open(analysis_path, "w", encoding="utf-8") as af:
193
+ json.dump(data, af)
194
+ except Exception:
195
+ pass
196
+ # Signal frontend that visualization is ready
197
+ yield f"data: {json.dumps({'analysis_done': True})}\n\n"
198
+
199
+ elif event_type == "error":
200
+ err_msg = f"\n❌ Pipeline Error: {data}\n"
201
+ print(f" ❌ Pipeline error: {data}")
202
+ payload = json.dumps({"text": err_msg})
203
+ yield f"data: {payload}\n\n"
204
+
205
+ except Exception as e:
206
+ err_msg = f"\n❌ Unexpected Error: {str(e)}\n"
207
+ print(f" ❌ Unexpected pipeline error: {e}")
208
+ import traceback
209
+ traceback.print_exc()
210
+ err_payload = json.dumps({"text": err_msg})
211
+ yield f"data: {err_payload}\n\n"
212
+
213
+ # Build notebook from cells if we got them
214
+ if final_cells and len(final_cells) > 0:
215
+ try:
216
+ build_notebook_from_cells(final_cells, notebook_path)
217
+ pipeline_success = True
218
+ print(f" πŸ““ Notebook saved: {notebook_path}")
219
+ except Exception as e:
220
+ print(f" ❌ Failed to build notebook: {e}")
221
+ err_payload = json.dumps({"text": f"\n❌ Failed to save notebook: {str(e)}\n"})
222
+ yield f"data: {err_payload}\n\n"
223
+ else:
224
+ no_cells_msg = json.dumps({"text": "\n❌ Pipeline completed but no cells were produced. Check server logs for details.\n"})
225
+ yield f"data: {no_cells_msg}\n\n"
226
+ print(f" ❌ No cells produced β€” notebook not saved")
227
+
228
+ # Always send done event with status
229
+ done_payload = json.dumps({"done": True, "success": pipeline_success})
230
+ yield f"data: {done_payload}\n\n"
231
+
232
+ # Only clean up extraction text on success
233
+ if pipeline_success and os.path.exists(txt_path):
234
+ os.remove(txt_path)
235
+
236
+ return StreamingResponse(
237
+ event_generator(),
238
+ media_type="text/event-stream",
239
+ headers={
240
+ "Cache-Control": "no-cache",
241
+ "Connection": "keep-alive",
242
+ "X-Accel-Buffering": "no",
243
+ }
244
+ )
245
+
246
+
247
+ # ── Endpoint 3: Download notebook ────────────────────────────────────────
248
+
249
+ async def cleanup_job_files(job_id: str):
250
+ """Remove all job artifacts after download with a delay to ensure transfer."""
251
+ await asyncio.sleep(10) # Wait for download to start/finish
252
+ for ext in [".pdf", ".txt", ".ipynb", "_analysis.json"]:
253
+ path = f"jobs/{job_id}{ext}"
254
+ if os.path.exists(path):
255
+ try:
256
+ os.remove(path)
257
+ except Exception:
258
+ pass
259
+
260
+
261
+ @app.get("/api/download/{job_id}")
262
+ async def download_notebook(job_id: str, background_tasks: BackgroundTasks):
263
+ notebook_path = f"jobs/{job_id}.ipynb"
264
+ if not os.path.exists(notebook_path):
265
+ raise HTTPException(status_code=404, detail="Notebook not found")
266
+
267
+ background_tasks.add_task(cleanup_job_files, job_id)
268
+
269
+ return FileResponse(
270
+ notebook_path,
271
+ filename="pundit_feynman_notebook.ipynb",
272
+ media_type="application/octet-stream",
273
+ )
274
+
275
+
276
+ # ── Health check ─────────────────────────────────────────────────────────
277
+
278
+ @app.get("/health")
279
+ async def health():
280
+ return {"status": "ok", "version": "2.0", "pipeline": "3-stage"}
281
+
282
+
283
+ # ── Endpoint 5: Generate visual illustration ─────────────────────────────
284
+
285
+ @app.post("/api/visualize/{job_id}")
286
+ async def visualize_concept(job_id: str):
287
+ """Generate a visual illustration of the paper's core concept."""
288
+ print(f"\n[DEBUG] {time.strftime('%H:%M:%S')} 🎨 ROUTE HIT: /api/visualize/{job_id}")
289
+
290
+ # Verify job id is sane
291
+ if not job_id or job_id == "null" or job_id == "undefined":
292
+ print(f"[DEBUG] ❌ ERROR: Received invalid Job ID: '{job_id}'")
293
+ raise HTTPException(status_code=400, detail="Invalid Job ID received")
294
+
295
+ analysis_path = f"jobs/{job_id}_analysis.json"
296
+ if not os.path.exists(analysis_path):
297
+ print(f"[DEBUG] ❌ ERROR: Analysis file does not exist: {analysis_path}")
298
+ # List files in jobs to help debug
299
+ print(f"[DEBUG] Files in jobs/: {os.listdir('jobs')}")
300
+ raise HTTPException(status_code=404, detail=f"Analysis not found for job {job_id}")
301
+
302
+ print(f"[DEBUG] πŸ“‚ Loading analysis JSON...")
303
+ try:
304
+ with open(analysis_path, "r", encoding="utf-8") as f:
305
+ analysis = json.load(f)
306
+ except Exception as e:
307
+ print(f"[DEBUG] ❌ JSON ERROR: Could not parse {analysis_path}: {e}")
308
+ raise HTTPException(status_code=500, detail="Corrupted analysis file")
309
+
310
+ try:
311
+ print(f"[DEBUG] πŸ–ŒοΈ Dispatching generation to threadpool for Job: {job_id}...")
312
+ loop = asyncio.get_event_loop()
313
+ image_b64 = await loop.run_in_executor(None, generate_concept_image, analysis)
314
+
315
+ print(f"[DEBUG] βœ… SUCCESS: Generation finished for Job: {job_id}")
316
+ return {"image": f"data:image/png;base64,{image_b64}"}
317
+ except Exception as e:
318
+ print(f"[DEBUG] ❌ GENERATION ERROR for Job {job_id}: {e}")
319
+ import traceback
320
+ traceback.print_exc()
321
+ raise HTTPException(status_code=500, detail=str(e))
322
+
323
+ @app.get("/api/ping")
324
+ async def ping():
325
+ print("[DEBUG] πŸ“ Ping received")
326
+ return {"status": "ok", "message": "Pundit Feynman Backend is ALIVE"}
327
+
328
+
329
+ # ── Static files (MUST be last β€” catch-all) ──────────────────────────────
330
+
331
+ app.mount("/", StaticFiles(directory="static", html=True), name="static")
332
+
333
+ if __name__ == "__main__":
334
+ import uvicorn
335
+ uvicorn.run(app, host="0.0.0.0", port=8000)