ginipick committed on
Commit
ce38851
Β·
1 Parent(s): 62cebf7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +305 -237
app.py CHANGED
@@ -44,6 +44,9 @@ pdf_cache: Dict[str, Dict[str, Any]] = {}
44
  cache_locks = {}
45
  pdf_embeddings: Dict[str, Dict[str, Any]] = {}
46
 
 
 
 
47
 
48
  def get_cache_path(pdf_name: str):
49
  return CACHE_DIR / f"{pdf_name}_cache.json"
@@ -102,18 +105,18 @@ def get_pdf_page_as_base64(pdf_path: str, page_num: int, scale: float = 1.0) ->
102
  return None
103
 
104
 
105
- def get_pdf_pages_as_base64(pdf_path: str, max_pages: int = 25, scale: float = 0.8) -> List[Dict[str, Any]]:
106
- """PDF μ—¬λŸ¬ νŽ˜μ΄μ§€λ₯Ό base64 이미지 리슀트둜 λ³€ν™˜ (κΈ°λ³Έ 25νŽ˜μ΄μ§€)"""
107
  try:
108
  doc = fitz.open(pdf_path)
109
  total_pages = doc.page_count
110
- pages_to_process = min(total_pages, max_pages)
111
 
112
  images = []
113
- for page_num in range(pages_to_process):
114
  page = doc[page_num]
115
  pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale))
116
- img_data = pix.tobytes("jpeg", 80)
117
  b64_img = base64.b64encode(img_data).decode('utf-8')
118
  images.append({
119
  "page": page_num + 1,
@@ -121,35 +124,52 @@ def get_pdf_pages_as_base64(pdf_path: str, max_pages: int = 25, scale: float = 0
121
  })
122
 
123
  doc.close()
124
- logger.info(f"PDF {pages_to_process}/{total_pages}νŽ˜μ΄μ§€ 이미지 λ³€ν™˜ μ™„λ£Œ")
125
- return images
126
  except Exception as e:
127
  logger.error(f"PDF νŽ˜μ΄μ§€λ“€ 이미지 λ³€ν™˜ 였λ₯˜: {e}")
128
- return []
129
 
130
 
131
- async def analyze_pdf_with_vlm(pdf_id: str, force_refresh: bool = False) -> Dict[str, Any]:
132
- """VLM으둜 PDF 전체 뢄석 ν›„ μΊμ‹œμ— μ €μž₯ (25νŽ˜μ΄μ§€)"""
 
 
133
 
134
- # μΊμ‹œ 확인
135
- if not force_refresh:
136
- cached = load_analysis_cache(pdf_id)
137
- if cached:
138
- return cached
 
 
 
 
 
139
 
140
- pdf_path = str(PROMPT_PDF_PATH)
141
- if not PROMPT_PDF_PATH.exists():
142
- return {"error": "PDF νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."}
 
 
 
 
143
 
144
- # PDF νŽ˜μ΄μ§€λ“€μ„ μ΄λ―Έμ§€λ‘œ λ³€ν™˜ (μ΅œλŒ€ 25νŽ˜μ΄μ§€)
145
- page_images = get_pdf_pages_as_base64(pdf_path, max_pages=25, scale=0.7)
 
 
 
 
 
 
 
 
146
 
147
  if not page_images:
148
- return {"error": "PDF 이미지 λ³€ν™˜ μ‹€νŒ¨"}
149
 
150
- # VLM으둜 전체 λ‚΄μš© 뢄석
151
  content_parts = []
152
-
153
  for img_data in page_images:
154
  content_parts.append({
155
  "type": "image_url",
@@ -158,85 +178,147 @@ async def analyze_pdf_with_vlm(pdf_id: str, force_refresh: bool = False) -> Dict
158
  }
159
  })
160
 
 
161
  content_parts.append({
162
  "type": "text",
163
- "text": f"""μœ„ 이미지듀은 PDF λ¬Έμ„œμ˜ νŽ˜μ΄μ§€λ“€μž…λ‹ˆλ‹€ (총 {len(page_images)}νŽ˜μ΄μ§€).
164
-
165
- 이 PDF λ¬Έμ„œμ˜ 전체 λ‚΄μš©μ„ μƒμ„Έν•˜κ²Œ λΆ„μ„ν•΄μ£Όμ„Έμš”.
166
-
167
- λ‹€μŒ ν˜•μ‹μœΌλ‘œ μž‘μ„±ν•΄μ£Όμ„Έμš”:
168
-
169
- ## λ¬Έμ„œ κ°œμš”
170
- - λ¬Έμ„œ 제λͺ©/주제:
171
- - λ¬Έμ„œ μœ ν˜•:
172
- - μž‘μ„± λͺ©μ :
173
-
174
- ## νŽ˜μ΄μ§€λ³„ 핡심 λ‚΄μš©
175
- 각 νŽ˜μ΄μ§€μ˜ μ£Όμš” λ‚΄μš©μ„ μš”μ•½ν•΄μ£Όμ„Έμš”.
176
 
177
- ## 전체 μš”μ•½
178
- λ¬Έμ„œμ˜ 핡심 λ‚΄μš©μ„ 500자 λ‚΄μ™Έλ‘œ μš”μ•½ν•΄μ£Όμ„Έμš”.
179
-
180
- ## μ£Όμš” ν‚€μ›Œλ“œ
181
- λ¬Έμ„œμ—μ„œ μ€‘μš”ν•œ οΏ½οΏ½μ›Œλ“œλ‚˜ κ°œλ…λ“€μ„ λ‚˜μ—΄ν•΄μ£Όμ„Έμš”.
182
-
183
- ## 상세 λ‚΄μš©
184
- λ¬Έμ„œμ— ν¬ν•¨λœ λͺ¨λ“  μ€‘μš”ν•œ 정보, 데이터, ν‘œ, κ·Έλž˜ν”„ λ‚΄μš© 등을 μƒμ„Ένžˆ κΈ°μˆ ν•΄μ£Όμ„Έμš”.
185
 
186
  ν•œκ΅­μ–΄λ‘œ μž‘μ„±ν•΄μ£Όμ„Έμš”."""
187
  })
188
 
189
  messages = [{"role": "user", "content": content_parts}]
 
 
190
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  try:
192
- logger.info(f"VLM PDF 뢄석 μ‹œμž‘: {len(page_images)}νŽ˜μ΄μ§€")
193
- analysis_text = call_fireworks_vlm_api(messages, max_tokens=8192, temperature=0.3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
  analysis_data = {
196
  "pdf_id": pdf_id,
197
- "total_pages": len(page_images),
198
- "analysis": analysis_text,
 
 
199
  "created_at": time.time()
200
  }
201
 
202
  # μΊμ‹œμ— μ €μž₯
203
  save_analysis_cache(pdf_id, analysis_data)
204
 
 
 
 
205
  return analysis_data
 
206
  except Exception as e:
207
  logger.error(f"VLM PDF 뢄석 였λ₯˜: {e}")
 
208
  return {"error": str(e)}
209
 
210
 
211
- def call_fireworks_vlm_api(messages: List[Dict], max_tokens: int = 4096, temperature: float = 0.6) -> str:
212
- """Fireworks AI VLM API 호좜 (이미지 뢄석 지원)"""
213
- if not HAS_VALID_API_KEY:
214
- raise Exception("API ν‚€κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€.")
215
-
216
- payload = {
217
- "model": FIREWORKS_VLM_MODEL,
218
- "max_tokens": max_tokens,
219
- "top_p": 1,
220
- "top_k": 40,
221
- "presence_penalty": 0,
222
- "frequency_penalty": 0,
223
- "temperature": temperature,
224
- "messages": messages
225
- }
226
-
227
- headers = {
228
- "Accept": "application/json",
229
- "Content-Type": "application/json",
230
- "Authorization": f"Bearer {FIREWORKS_API_KEY}"
231
- }
232
-
233
- response = requests.post(FIREWORKS_API_URL, headers=headers, data=json.dumps(payload), timeout=120)
234
-
235
- if response.status_code != 200:
236
- raise Exception(f"API 였λ₯˜: {response.status_code} - {response.text}")
237
-
238
- result = response.json()
239
- return result["choices"][0]["message"]["content"]
240
 
241
 
242
  def extract_pdf_text(pdf_path: str) -> List[Dict[str, Any]]:
@@ -245,18 +327,15 @@ def extract_pdf_text(pdf_path: str) -> List[Dict[str, Any]]:
245
  chunks = []
246
  for page_num in range(len(doc)):
247
  page = doc[page_num]
248
- # μ—¬λŸ¬ λ°©λ²•μœΌλ‘œ ν…μŠ€νŠΈ μΆ”μΆœ μ‹œλ„
249
  text = page.get_text("text")
250
 
251
- # ν…μŠ€νŠΈκ°€ μ—†μœΌλ©΄ λ‹€λ₯Έ 방법 μ‹œλ„
252
  if not text.strip():
253
  text = page.get_text("blocks")
254
  if text:
255
  text = "\n".join([block[4] for block in text if len(block) > 4 and isinstance(block[4], str)])
256
 
257
- # μ—¬μ „νžˆ ν…μŠ€νŠΈκ°€ μ—†μœΌλ©΄ νŽ˜μ΄μ§€ μ •λ³΄λ§Œμ΄λΌλ„ μΆ”κ°€
258
  if not text.strip():
259
- text = f"[νŽ˜μ΄μ§€ {page_num + 1} - 이미지 λ˜λŠ” ν…μŠ€νŠΈ μ—†μŒ]"
260
 
261
  chunks.append({
262
  "page": page_num + 1,
@@ -271,54 +350,6 @@ def extract_pdf_text(pdf_path: str) -> List[Dict[str, Any]]:
271
  return []
272
 
273
 
274
- async def get_pdf_embedding(pdf_id: str) -> Dict[str, Any]:
275
- try:
276
- embedding_path = get_embedding_path(pdf_id)
277
- if embedding_path.exists():
278
- try:
279
- with open(embedding_path, "r", encoding="utf-8") as f:
280
- return json.load(f)
281
- except Exception as e:
282
- logger.error(f"μž„λ² λ”© μΊμ‹œ λ‘œλ“œ 였λ₯˜: {e}")
283
-
284
- pdf_path = str(PROMPT_PDF_PATH)
285
- if not PROMPT_PDF_PATH.exists():
286
- raise ValueError(f"PDF νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€: {pdf_path}")
287
-
288
- chunks = extract_pdf_text(pdf_path)
289
-
290
- # ν…μŠ€νŠΈ μΆ”μΆœ μ‹€νŒ¨ν•΄λ„ κΈ°λ³Έ 정보 제곡
291
- if not chunks:
292
- logger.warning(f"PDFμ—μ„œ ν…μŠ€νŠΈλ₯Ό μΆ”μΆœν•  수 μ—†μŠ΅λ‹ˆλ‹€. κΈ°λ³Έ μ •λ³΄λ‘œ λŒ€μ²΄ν•©λ‹ˆλ‹€.")
293
- try:
294
- doc = fitz.open(pdf_path)
295
- total_pages = doc.page_count
296
- doc.close()
297
- chunks = [{"page": i+1, "text": f"[νŽ˜μ΄μ§€ {i+1}]", "chunk_id": f"page_{i+1}"} for i in range(total_pages)]
298
- except:
299
- chunks = [{"page": 1, "text": "[PDF λ‚΄μš©μ„ 읽을 수 μ—†μŠ΅λ‹ˆλ‹€]", "chunk_id": "page_1"}]
300
-
301
- embedding_data = {
302
- "pdf_id": pdf_id,
303
- "pdf_path": pdf_path,
304
- "chunks": chunks,
305
- "created_at": time.time()
306
- }
307
-
308
- with open(embedding_path, "w", encoding="utf-8") as f:
309
- json.dump(embedding_data, f, ensure_ascii=False)
310
-
311
- return embedding_data
312
- except Exception as e:
313
- logger.error(f"PDF μž„λ² λ”© 생성 였λ₯˜: {e}")
314
- return {"error": str(e), "pdf_id": pdf_id, "chunks": []}
315
-
316
-
317
- def call_fireworks_api(messages: List[Dict], max_tokens: int = 4096, temperature: float = 0.6) -> str:
318
- """Fireworks AI VLM API 호좜"""
319
- return call_fireworks_vlm_api(messages, max_tokens, temperature)
320
-
321
-
322
  async def query_pdf(pdf_id: str, query: str) -> Dict[str, Any]:
323
  """μΊμ‹œλœ VLM 뢄석 κ²°κ³ΌοΏ½οΏ½ 기반으둜 μ§ˆμ˜μ‘λ‹΅"""
324
  try:
@@ -329,10 +360,18 @@ async def query_pdf(pdf_id: str, query: str) -> Dict[str, Any]:
329
  }
330
 
331
  # μΊμ‹œλœ 뢄석 κ²°κ³Ό 확인
332
- analysis_data = await analyze_pdf_with_vlm(pdf_id)
333
 
334
- if "error" in analysis_data:
335
- return {"error": analysis_data["error"], "answer": "PDF 뢄석에 μ‹€νŒ¨ν–ˆμŠ΅λ‹ˆλ‹€."}
 
 
 
 
 
 
 
 
336
 
337
  analysis_text = analysis_data.get("analysis", "")
338
  total_pages = analysis_data.get("total_pages", 0)
@@ -351,7 +390,7 @@ async def query_pdf(pdf_id: str, query: str) -> Dict[str, Any]:
351
  뢄석 λ‚΄μš©μ— μ—†λŠ” μ •λ³΄λŠ” "ν•΄λ‹Ή 정보λ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€"라고 μ†”μ§νžˆ λ‹΅ν•΄μ£Όμ„Έμš”.
352
 
353
  === PDF 뢄석 κ²°κ³Ό ===
354
- {analysis_text}
355
  =================="""
356
  },
357
  {
@@ -361,32 +400,17 @@ async def query_pdf(pdf_id: str, query: str) -> Dict[str, Any]:
361
  ]
362
 
363
  try:
364
- for attempt in range(3):
365
- try:
366
- answer = call_fireworks_vlm_api(messages, max_tokens=4096, temperature=0.6)
367
- return {
368
- "answer": answer,
369
- "pdf_id": pdf_id,
370
- "query": query
371
- }
372
- except Exception as api_error:
373
- logger.error(f"Fireworks API 호좜 였λ₯˜ (μ‹œλ„ {attempt+1}/3): {api_error}")
374
- if attempt == 2:
375
- raise api_error
376
- await asyncio.sleep(2 * (attempt + 1))
377
-
378
- raise Exception("API 호좜 μž¬μ‹œλ„ λͺ¨λ‘ μ‹€νŒ¨")
379
  except Exception as api_error:
380
- logger.error(f"Fireworks API 호좜 μ΅œμ’… 였λ₯˜: {api_error}")
381
  error_message = str(api_error)
382
- if "Connection" in error_message:
383
- return {"error": "AI μ„œλ²„μ™€ μ—°κ²°ν•  수 μ—†μŠ΅λ‹ˆλ‹€.", "answer": "λ„€νŠΈμ›Œν¬ 연결을 ν™•μΈν•΄μ£Όμ„Έμš”."}
384
- elif "401" in error_message or "Unauthorized" in error_message:
385
- return {"error": "API ν‚€κ°€ μœ νš¨ν•˜μ§€ μ•ŠμŠ΅λ‹ˆλ‹€.", "answer": "API 인증에 μ‹€νŒ¨ν–ˆμŠ΅λ‹ˆλ‹€."}
386
- elif "429" in error_message or "Rate limit" in error_message:
387
- return {"error": "API 호좜 ν•œλ„ 초과", "answer": "μž μ‹œ ν›„ λ‹€μ‹œ μ‹œλ„ν•΄μ£Όμ„Έμš”."}
388
- else:
389
- return {"error": f"AI 였λ₯˜: {error_message}", "answer": "처리 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€."}
390
  except Exception as e:
391
  logger.error(f"μ§ˆμ˜μ‘λ‹΅ 처리 였λ₯˜: {e}")
392
  return {"error": str(e), "answer": "처리 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€."}
@@ -401,51 +425,42 @@ async def summarize_pdf(pdf_id: str) -> Dict[str, Any]:
401
  "summary": "API ν‚€κ°€ μ—†μ–΄ μš”μ•½μ„ 생성할 수 μ—†μŠ΅λ‹ˆλ‹€."
402
  }
403
 
404
- # μΊμ‹œλœ 뢄석 κ²°κ³Ό 확인 (μ—†μœΌλ©΄ μƒˆλ‘œ 뢄석)
405
- analysis_data = await analyze_pdf_with_vlm(pdf_id)
406
 
407
- if "error" in analysis_data:
408
- return {"error": analysis_data["error"], "summary": "PDF 뢄석에 μ‹€νŒ¨ν–ˆμŠ΅λ‹ˆλ‹€."}
 
 
 
 
 
 
 
 
409
 
410
- analysis_text = analysis_data.get("analysis", "")
411
  total_pages = analysis_data.get("total_pages", 0)
 
412
 
413
- if not analysis_text:
414
- return {"error": "뢄석 데이터 μ—†μŒ", "summary": "PDF 뢄석 데이터λ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."}
415
-
416
- # 뢄석 κ²°κ³Όμ—μ„œ μš”μ•½ λΆ€λΆ„ μΆ”μΆœ λ˜λŠ” μƒˆλ‘œ μš”μ•½ 생성
417
- messages = [
418
- {
419
- "role": "system",
420
- "content": """μ•„λž˜λŠ” PDF λ¬Έμ„œλ₯Ό VLM으둜 λΆ„μ„ν•œ κ²°κ³Όμž…λ‹ˆλ‹€.
421
- 이 뢄석 λ‚΄μš©μ„ λ°”νƒ•μœΌλ‘œ μ‚¬μš©μžμ—κ²Œ λ¬Έμ„œλ₯Ό μ†Œκ°œν•˜λŠ” μΉœμ ˆν•œ μš”μ•½μ„ μž‘μ„±ν•΄μ£Όμ„Έμš”.
422
- 500자 μ΄λ‚΄λ‘œ 핡심 λ‚΄μš©μ„ κ°„κ²°ν•˜κ²Œ ν•œκ΅­μ–΄λ‘œ μš”μ•½ν•΄μ£Όμ„Έμš”."""
423
- },
424
- {
425
- "role": "user",
426
- "content": f"λ‹€μŒ PDF 뢄석 κ²°κ³Όλ₯Ό μš”μ•½ν•΄μ£Όμ„Έμš”:\n\n{analysis_text}"
427
- }
428
- ]
429
-
430
- try:
431
- summary = call_fireworks_vlm_api(messages, max_tokens=1024, temperature=0.5)
432
  return {
433
  "summary": summary,
434
  "pdf_id": pdf_id,
 
 
 
 
 
 
 
 
 
 
435
  "total_pages": total_pages
436
  }
437
- except Exception as api_error:
438
- logger.error(f"μš”μ•½ 생성 였λ₯˜: {api_error}")
439
- # μΊμ‹œλœ 뢄석 κ²°κ³Όμ—μ„œ 직접 μš”μ•½ λΆ€λΆ„ μΆ”μΆœ μ‹œλ„
440
- if "## 전체 μš”μ•½" in analysis_text:
441
- try:
442
- start = analysis_text.index("## 전체 μš”μ•½")
443
- end = analysis_text.index("##", start + 10) if "##" in analysis_text[start+10:] else len(analysis_text)
444
- summary_part = analysis_text[start:end].replace("## 전체 μš”μ•½", "").strip()
445
- return {"summary": summary_part, "pdf_id": pdf_id, "total_pages": total_pages}
446
- except:
447
- pass
448
- return {"error": str(api_error), "summary": "μš”μ•½ 생성 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€."}
449
 
450
  except Exception as e:
451
  logger.error(f"PDF μš”μ•½ 생성 였λ₯˜: {e}")
@@ -572,9 +587,10 @@ async def cache_pdf(pdf_path: str):
572
  async def startup_event():
573
  if PROMPT_PDF_PATH.exists():
574
  logger.info(f"prompt.pdf 파일 발견: {PROMPT_PDF_PATH}")
 
575
  asyncio.create_task(cache_pdf(str(PROMPT_PDF_PATH)))
576
- # VLM 뢄석도 λ°±κ·ΈλΌμš΄λ“œμ—μ„œ μ‹œμž‘
577
- asyncio.create_task(analyze_pdf_with_vlm(PROMPT_PDF_ID))
578
  else:
579
  logger.warning(f"prompt.pdf νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€: {PROMPT_PDF_PATH}")
580
 
@@ -602,15 +618,27 @@ async def get_pdf_info():
602
 
603
  @app.get("/api/analysis-status")
604
  async def get_analysis_status():
605
- """VLM 뢄석 μΊμ‹œ μƒνƒœ 확인"""
 
606
  cached = load_analysis_cache(PROMPT_PDF_ID)
607
  if cached:
608
  return {
609
  "status": "completed",
610
  "total_pages": cached.get("total_pages", 0),
 
611
  "created_at": cached.get("created_at", 0)
612
  }
613
- return {"status": "not_analyzed"}
 
 
 
 
 
 
 
 
 
 
614
 
615
 
616
  @app.post("/api/reanalyze-pdf")
@@ -626,8 +654,12 @@ async def reanalyze_pdf():
626
  cache_path.unlink()
627
  logger.info("κΈ°μ‘΄ VLM 뢄석 μΊμ‹œ μ‚­μ œ")
628
 
 
 
 
 
629
  # λ°±κ·ΈλΌμš΄λ“œμ—μ„œ μž¬λΆ„μ„ μ‹œμž‘
630
- asyncio.create_task(analyze_pdf_with_vlm(PROMPT_PDF_ID, force_refresh=True))
631
 
632
  return {"status": "started", "message": "PDF μž¬λΆ„μ„μ„ μ‹œμž‘ν•©λ‹ˆλ‹€."}
633
  except Exception as e:
@@ -679,11 +711,10 @@ async def api_query_pdf(query: Dict[str, str]):
679
  return JSONResponse(content={"error": "PDF νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€"}, status_code=404)
680
 
681
  result = await query_pdf(PROMPT_PDF_ID, user_query)
682
- # μ—λŸ¬κ°€ μžˆμ–΄λ„ answerκ°€ 있으면 정상 μ‘λ‹΅μœΌλ‘œ 처리
683
  if "answer" in result:
684
  return result
685
  if "error" in result:
686
- return JSONResponse(content=result, status_code=200) # μ—λŸ¬ λ©”μ‹œμ§€λ„ 정상 μ‘λ‹΅μœΌλ‘œ
687
  return result
688
  except Exception as e:
689
  logger.error(f"μ§ˆμ˜μ‘λ‹΅ API 였λ₯˜: {e}")
@@ -697,11 +728,10 @@ async def api_summarize_pdf():
697
  return JSONResponse(content={"error": "PDF νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€"}, status_code=404)
698
 
699
  result = await summarize_pdf(PROMPT_PDF_ID)
700
- # μ—λŸ¬κ°€ μžˆμ–΄λ„ summaryκ°€ 있으면 정상 μ‘λ‹΅μœΌλ‘œ 처리
701
  if "summary" in result:
702
  return result
703
  if "error" in result:
704
- return JSONResponse(content=result, status_code=200) # μ—λŸ¬ λ©”μ‹œμ§€λ„ 정상 μ‘λ‹΅μœΌλ‘œ
705
  return result
706
  except Exception as e:
707
  logger.error(f"PDF μš”μ•½ API 였λ₯˜: {e}")
@@ -1325,6 +1355,7 @@ HTML = """
1325
  let isAiChatActive = false;
1326
  let isAiProcessing = false;
1327
  let hasLoadedSummary = false;
 
1328
 
1329
  function $id(id) { return document.getElementById(id); }
1330
 
@@ -1411,6 +1442,17 @@ HTML = """
1411
  }
1412
  }
1413
 
 
 
 
 
 
 
 
 
 
 
 
1414
  async function loadPdfSummary() {
1415
  if (isAiProcessing || hasLoadedSummary) return;
1416
 
@@ -1418,65 +1460,93 @@ HTML = """
1418
  isAiProcessing = true;
1419
  addTypingIndicator();
1420
 
1421
- // λ¨Όμ € 뢄석 μƒνƒœ 확인
1422
- const statusResponse = await fetch('/api/analysis-status');
1423
- const statusData = await statusResponse.json();
1424
 
1425
- if (statusData.status !== 'completed') {
1426
  removeTypingIndicator();
1427
- addChatMessage(`μ•ˆλ…•ν•˜μ„Έμš”! ν˜„μž¬ PDFλ₯Ό AIκ°€ λΆ„μ„ν•˜κ³  μžˆμŠ΅λ‹ˆλ‹€. πŸ“Š<br><br>μž μ‹œλ§Œ κΈ°λ‹€λ €μ£Όμ‹œλ©΄ 뢄석이 μ™„λ£Œλœ ν›„ μ§ˆλ¬Έμ— λ‹΅λ³€ν•  수 μžˆμŠ΅λ‹ˆλ‹€.<br><small style="color:#999;">λΆ„μ„μ—λŠ” μ•½ 1~2λΆ„ 정도 μ†Œμš”λ  수 μžˆμŠ΅λ‹ˆλ‹€.</small>`);
 
1428
  hasLoadedSummary = true;
1429
  isAiProcessing = false;
1430
 
1431
- // 뢄석 μ™„λ£Œλ  λ•ŒκΉŒμ§€ 폴링
1432
- pollAnalysisStatus();
 
 
 
 
 
 
 
 
1433
  return;
1434
  }
1435
 
 
 
 
 
 
 
 
 
 
 
1436
  const response = await fetch('/api/ai/summarize-pdf');
1437
  const data = await response.json();
1438
 
1439
  removeTypingIndicator();
1440
 
1441
  if (data.summary) {
1442
- const pageInfo = data.total_pages ? ` (${data.total_pages}νŽ˜μ΄μ§€ λΆ„μ„μ™„λ£Œ)` : '';
1443
  addChatMessage(`μ•ˆλ…•ν•˜μ„Έμš”! 이 PDF에 λŒ€ν•΄ 무엇이든 μ§ˆλ¬Έν•΄μ£Όμ„Έμš”.${pageInfo}<br><br><strong>πŸ“„ PDF μš”μ•½:</strong><br>${data.summary}`);
1444
- hasLoadedSummary = true;
1445
- } else if (data.error) {
1446
- addChatMessage(`μ•ˆλ…•ν•˜μ„Έμš”! PDF에 λŒ€ν•΄ κΆκΈˆν•œ 것을 μ§ˆλ¬Έν•΄μ£Όμ„Έμš”.<br><br><small style="color:#999;">⚠️ ${data.error}</small>`);
1447
- hasLoadedSummary = true;
1448
  } else {
1449
  addChatMessage("μ•ˆλ…•ν•˜μ„Έμš”! PDF에 λŒ€ν•΄ μ§ˆλ¬Έν•΄μ£Όμ„Έμš”. μ΅œμ„ μ„ λ‹€ν•΄ λ‹΅λ³€ν•˜κ² μŠ΅λ‹ˆλ‹€.");
1450
- hasLoadedSummary = true;
1451
  }
 
 
1452
  } catch (error) {
1453
  console.error("PDF μš”μ•½ λ‘œλ“œ 였λ₯˜:", error);
1454
  removeTypingIndicator();
1455
- addChatMessage("μ•ˆλ…•ν•˜μ„Έμš”! PDF에 λŒ€ν•΄ μ§ˆλ¬Έν•΄μ£Όμ„Έμš”. μ΅œμ„ μ„ λ‹€ν•΄ λ‹΅λ³€ν•˜κ² μŠ΅λ‹ˆλ‹€.");
1456
  hasLoadedSummary = true;
1457
  } finally {
1458
  isAiProcessing = false;
1459
  }
1460
  }
1461
 
1462
- async function pollAnalysisStatus() {
1463
- // 뢄석 μ™„λ£Œλ  λ•ŒκΉŒμ§€ 10μ΄ˆλ§ˆλ‹€ 확인
1464
- const checkInterval = setInterval(async () => {
 
1465
  try {
1466
- const response = await fetch('/api/analysis-status');
1467
- const data = await response.json();
1468
 
1469
  if (data.status === 'completed') {
1470
- clearInterval(checkInterval);
1471
- addChatMessage(`βœ… PDF 뢄석이 μ™„λ£Œλ˜μ—ˆμŠ΅λ‹ˆλ‹€! (${data.total_pages}νŽ˜μ΄μ§€)<br>이제 μ§ˆλ¬Έν•΄μ£Όμ„Έμš”.`);
 
 
 
 
 
 
 
 
1472
  }
1473
  } catch (e) {
1474
- console.error("뢄석 μƒνƒœ 확인 였λ₯˜:", e);
1475
  }
1476
- }, 10000);
1477
 
1478
  // 5λΆ„ ν›„ μžλ™ 쀑지
1479
- setTimeout(() => clearInterval(checkInterval), 300000);
 
 
 
 
 
1480
  }
1481
 
1482
  async function submitQuestion(question) {
@@ -1489,14 +1559,17 @@ HTML = """
1489
  addChatMessage(question, true);
1490
 
1491
  // 뢄석 μƒνƒœ 확인
1492
- const statusResponse = await fetch('/api/analysis-status');
1493
- const statusData = await statusResponse.json();
1494
 
1495
  if (statusData.status !== 'completed') {
1496
- addChatMessage("PDF 뢄석이 아직 μ§„ν–‰ μ€‘μž…λ‹ˆλ‹€. μž μ‹œλ§Œ κΈ°λ‹€λ €μ£Όμ„Έμš”... ⏳");
 
 
 
 
1497
  isAiProcessing = false;
1498
  $id('aiChatSubmit').disabled = false;
1499
- $id('aiChatInput').value = question; // 질문 μœ μ§€
1500
  return;
1501
  }
1502
 
@@ -1506,7 +1579,7 @@ HTML = """
1506
  method: 'POST',
1507
  headers: { 'Content-Type': 'application/json' },
1508
  body: JSON.stringify({ query: question }),
1509
- signal: AbortSignal.timeout(120000) // 2λΆ„ νƒ€μž„μ•„μ›ƒ
1510
  });
1511
 
1512
  const data = await response.json();
@@ -1551,7 +1624,7 @@ HTML = """
1551
  }
1552
 
1553
  function updateLoading(message, progress) {
1554
- const text = $id('loadingText');
1555
  if (text) text.textContent = message;
1556
  const bar = $id('progressBar');
1557
  if (bar && progress !== undefined) bar.style.width = `${progress}%`;
@@ -1681,7 +1754,6 @@ HTML = """
1681
  }
1682
  }
1683
 
1684
- // μΊμ‹œκ°€ μ—†μœΌλ©΄ 캐싱 μ‹œμž‘ν•˜κ³  μ§„ν–‰ 상황 λͺ¨λ‹ˆν„°λ§
1685
  const cacheResponse = await fetch('/api/cached-pdf');
1686
  let cachedData = await cacheResponse.json();
1687
 
@@ -1691,7 +1763,6 @@ HTML = """
1691
  return;
1692
  }
1693
 
1694
- // 캐싱 μ§„ν–‰ 쀑이면 μ™„λ£Œλ  λ•ŒκΉŒμ§€ λŒ€κΈ°
1695
  while (cachedData.status === "processing" || cachedData.status === "started") {
1696
  await new Promise(resolve => setTimeout(resolve, 1000));
1697
 
@@ -1727,11 +1798,9 @@ HTML = """
1727
  document.addEventListener('DOMContentLoaded', function() {
1728
  initializeAudio();
1729
 
1730
- // AI λ²„νŠΌ 이벀트
1731
  $id('aiButton').addEventListener('click', () => toggleAiChat(!isAiChatActive));
1732
  $id('aiChatClose').addEventListener('click', () => toggleAiChat(false));
1733
 
1734
- // μ±„νŒ… 폼 이벀트
1735
  $id('aiChatForm').addEventListener('submit', function(e) {
1736
  e.preventDefault();
1737
  const question = $id('aiChatInput').value.trim();
@@ -1740,7 +1809,6 @@ HTML = """
1740
  }
1741
  });
1742
 
1743
- // PDF μžλ™ λ‘œλ“œ
1744
  loadPDF();
1745
  });
1746
  </script>
 
44
  cache_locks = {}
45
  pdf_embeddings: Dict[str, Dict[str, Any]] = {}
46
 
47
+ # VLM 뢄석 μƒνƒœ 좔적 (λ©”λͺ¨λ¦¬)
48
+ analysis_status: Dict[str, Dict[str, Any]] = {}
49
+
50
 
51
  def get_cache_path(pdf_name: str):
52
  return CACHE_DIR / f"{pdf_name}_cache.json"
 
105
  return None
106
 
107
 
108
+ def get_pdf_pages_as_base64(pdf_path: str, start_page: int = 0, max_pages: int = 10, scale: float = 0.7) -> List[Dict[str, Any]]:
109
+ """PDF μ—¬λŸ¬ νŽ˜μ΄μ§€λ₯Ό base64 이미지 리슀트둜 λ³€ν™˜ (배치 처리용)"""
110
  try:
111
  doc = fitz.open(pdf_path)
112
  total_pages = doc.page_count
113
+ end_page = min(start_page + max_pages, total_pages)
114
 
115
  images = []
116
+ for page_num in range(start_page, end_page):
117
  page = doc[page_num]
118
  pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale))
119
+ img_data = pix.tobytes("jpeg", 75)
120
  b64_img = base64.b64encode(img_data).decode('utf-8')
121
  images.append({
122
  "page": page_num + 1,
 
124
  })
125
 
126
  doc.close()
127
+ logger.info(f"PDF {start_page+1}~{end_page}/{total_pages}νŽ˜μ΄μ§€ 이미지 λ³€ν™˜ μ™„λ£Œ")
128
+ return images, total_pages
129
  except Exception as e:
130
  logger.error(f"PDF νŽ˜μ΄μ§€λ“€ 이미지 λ³€ν™˜ 였λ₯˜: {e}")
131
+ return [], 0
132
 
133
 
134
def call_fireworks_vlm_api(messages: List[Dict], max_tokens: int = 4096, temperature: float = 0.6) -> str:
    """Send a chat request to the Fireworks AI VLM endpoint and return the reply text.

    This function is synchronous: ``requests.post`` blocks the calling thread
    until the server answers or the 180-second timeout expires.

    Args:
        messages: Chat messages, placed unchanged under ``"messages"`` in the
            request payload.
        max_tokens: Value sent as ``"max_tokens"``.
        temperature: Value sent as ``"temperature"``.

    Returns:
        The ``content`` of the first choice's message in the JSON response.

    Raises:
        Exception: If ``HAS_VALID_API_KEY`` is falsy, if the HTTP status is not
            200, or if the response body is not JSON of the expected
            ``choices[0].message.content`` shape.
    """
    if not HAS_VALID_API_KEY:
        raise Exception("API ν‚€κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€.")

    payload = {
        "model": FIREWORKS_VLM_MODEL,
        "max_tokens": max_tokens,
        "top_p": 1,
        "top_k": 40,
        "presence_penalty": 0,
        "frequency_penalty": 0,
        "temperature": temperature,
        "messages": messages,
    }

    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": f"Bearer {FIREWORKS_API_KEY}",
    }

    response = requests.post(FIREWORKS_API_URL, headers=headers, data=json.dumps(payload), timeout=180)

    if response.status_code != 200:
        raise Exception(f"API 였λ₯˜: {response.status_code} - {response.text}")

    # A 200 status does not guarantee a well-formed body; report a malformed
    # one with context instead of leaking a bare KeyError/IndexError.
    try:
        return response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as exc:
        raise Exception(
            f"API 였λ₯˜: unexpected response format - {response.text[:200]}"
        ) from exc
163
+
164
+
165
+ def analyze_batch_pages_sync(pdf_path: str, start_page: int, batch_size: int = 5) -> str:
166
+ """배치 νŽ˜μ΄μ§€ 뢄석 (동기)"""
167
+ page_images, total_pages = get_pdf_pages_as_base64(pdf_path, start_page, batch_size, scale=0.6)
168
 
169
  if not page_images:
170
+ return ""
171
 
 
172
  content_parts = []
 
173
  for img_data in page_images:
174
  content_parts.append({
175
  "type": "image_url",
 
178
  }
179
  })
180
 
181
+ page_range = f"{start_page + 1}~{start_page + len(page_images)}"
182
  content_parts.append({
183
  "type": "text",
184
+ "text": f"""μœ„ 이미지듀은 PDF λ¬Έμ„œμ˜ {page_range}νŽ˜μ΄μ§€μž…λ‹ˆλ‹€.
 
 
 
 
 
 
 
 
 
 
 
 
185
 
186
+ 각 νŽ˜μ΄μ§€μ˜ λ‚΄μš©μ„ μƒμ„Έν•˜κ²Œ λΆ„μ„ν•˜μ—¬ ν…μŠ€νŠΈλ‘œ μΆ”μΆœν•΄μ£Όμ„Έμš”.
187
+ - λͺ¨λ“  ν…μŠ€νŠΈ λ‚΄μš©μ„ 빠짐없이 μΆ”μΆœ
188
+ - ν‘œ, 차트, κ·Έλž˜ν”„κ°€ 있으면 λ‚΄μš© μ„€λͺ…
189
+ - 이미지가 있으면 μ„€λͺ…
190
+ - νŽ˜μ΄μ§€λ³„λ‘œ κ΅¬λΆ„ν•˜μ—¬ μž‘μ„±
 
 
 
191
 
192
  ν•œκ΅­μ–΄λ‘œ μž‘μ„±ν•΄μ£Όμ„Έμš”."""
193
  })
194
 
195
  messages = [{"role": "user", "content": content_parts}]
196
+
197
+ return call_fireworks_vlm_api(messages, max_tokens=4096, temperature=0.3)
198
 
199
+
200
async def analyze_pdf_with_vlm_batched(pdf_id: str, force_refresh: bool = False) -> Dict[str, Any]:
    """Analyze the prompt PDF with the VLM in page batches and cache the result.

    Progress and outcome are tracked in the module-level ``analysis_status``
    dict under ``pdf_id``. At most the first 25 pages are analyzed, 5 pages
    per request. Blocking work (page rendering and HTTP calls) is run through
    the event loop's default executor so the loop itself is not blocked.

    Args:
        pdf_id: Key used for ``analysis_status`` and for the analysis cache.
        force_refresh: When true, skip the cache lookup and re-analyze.

    Returns:
        One of:
        * ``{"status": "analyzing", "progress": ...}`` if an analysis for
          ``pdf_id`` is already running;
        * the cached analysis dict, when present and ``force_refresh`` is false;
        * a fresh analysis dict with keys ``pdf_id``, ``total_pages``,
          ``analyzed_pages``, ``analysis``, ``summary``, ``created_at``;
        * ``{"error": ...}`` on failure.
    """
    # Do not start a second run while one is in progress.
    current = analysis_status.get(pdf_id)
    if current is not None and current.get("status") == "analyzing":
        logger.info(f"PDF {pdf_id} 이미 뢄석 쀑...")
        return {"status": "analyzing", "progress": current.get("progress", 0)}

    # Serve from cache unless a refresh was requested.
    if not force_refresh:
        cached = load_analysis_cache(pdf_id)
        if cached:
            analysis_status[pdf_id] = {"status": "completed", "progress": 100}
            return cached

    pdf_path = str(PROMPT_PDF_PATH)
    if not PROMPT_PDF_PATH.exists():
        analysis_status[pdf_id] = {"status": "error", "error": "PDF 파일 μ—†μŒ"}
        return {"error": "PDF νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."}

    if not HAS_VALID_API_KEY:
        analysis_status[pdf_id] = {"status": "error", "error": "API ν‚€ μ—†μŒ"}
        return {"error": "API ν‚€κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€."}

    analysis_status[pdf_id] = {"status": "analyzing", "progress": 0, "started_at": time.time()}

    try:
        # Read the page count; close the document even if that fails.
        doc = fitz.open(pdf_path)
        try:
            total_pages = doc.page_count
        finally:
            doc.close()

        logger.info(f"PDF 뢄석 μ‹œμž‘: 총 {total_pages}νŽ˜μ΄μ§€")

        batch_size = 5
        page_limit = min(total_pages, 25)  # analyze at most 25 pages
        loop = asyncio.get_running_loop()

        all_analyses = []
        succeeded = 0
        last_error = None

        for start_page in range(0, page_limit, batch_size):
            batch_label = f"### νŽ˜μ΄μ§€ {start_page + 1}~{min(start_page + batch_size, total_pages)}"
            try:
                progress = int((start_page / page_limit) * 100)
                analysis_status[pdf_id]["progress"] = progress
                logger.info(f"배치 뢄석 쀑: {start_page + 1}νŽ˜μ΄μ§€λΆ€ν„° (μ§„ν–‰λ₯ : {progress}%)")

                # The batch helper is synchronous; run it in the executor.
                batch_result = await loop.run_in_executor(
                    None,
                    analyze_batch_pages_sync,
                    pdf_path,
                    start_page,
                    batch_size,
                )

                if batch_result:
                    all_analyses.append(f"{batch_label}\n{batch_result}")
                    succeeded += 1

                # Pause between requests to stay under the API rate limit.
                await asyncio.sleep(2)

            except Exception as batch_error:
                # Best effort: record the failure and continue with the next batch.
                logger.error(f"배치 {start_page} 뢄석 였λ₯˜: {batch_error}")
                last_error = batch_error
                all_analyses.append(f"{batch_label}\n[뢄석 μ‹€νŒ¨: {str(batch_error)}]")

        # If nothing was analyzed, do not cache failure placeholders as a
        # "completed" analysis; report the run as failed instead.
        if succeeded == 0:
            reason = "no pages could be analyzed"
            if last_error is not None:
                reason = f"{reason}: {last_error}"
            logger.error(f"VLM PDF 뢄석 였λ₯˜: {reason}")
            analysis_status[pdf_id] = {"status": "error", "error": reason}
            return {"error": reason}

        combined_analysis = "\n\n".join(all_analyses)

        # Build a short summary; fall back to a truncated excerpt on failure.
        try:
            summary_messages = [
                {
                    "role": "system",
                    "content": "λ‹€μŒ PDF 뢄석 λ‚΄μš©μ„ 500자 μ΄λ‚΄λ‘œ μš”μ•½ν•΄μ£Όμ„Έμš”. 핡심 λ‚΄μš©κ³Ό μ£Όμš” ν‚€μ›Œλ“œλ₯Ό ν¬ν•¨ν•΄μ£Όμ„Έμš”."
                },
                {
                    "role": "user",
                    "content": combined_analysis[:8000]  # cap the prompt size
                }
            ]
            # call_fireworks_vlm_api is blocking; positional args are
            # (messages, max_tokens, temperature).
            summary = await loop.run_in_executor(
                None,
                call_fireworks_vlm_api,
                summary_messages,
                1024,
                0.5,
            )
        except Exception as sum_err:
            logger.error(f"μš”μ•½ 생성 였λ₯˜: {sum_err}")
            summary = combined_analysis[:500] + "..."

        analysis_data = {
            "pdf_id": pdf_id,
            "total_pages": total_pages,
            "analyzed_pages": page_limit,
            "analysis": combined_analysis,
            "summary": summary,
            "created_at": time.time()
        }

        save_analysis_cache(pdf_id, analysis_data)

        analysis_status[pdf_id] = {"status": "completed", "progress": 100}
        logger.info(f"PDF 뢄석 μ™„λ£Œ: {pdf_id}")

        return analysis_data

    except Exception as e:
        logger.error(f"VLM PDF 뢄석 였λ₯˜: {e}")
        analysis_status[pdf_id] = {"status": "error", "error": str(e)}
        return {"error": str(e)}
309
 
310
 
311
async def run_initial_analysis():
    """Run the batched VLM analysis for the prompt PDF and log the outcome.

    Intended to be scheduled as a background task: it never raises, it only
    logs. Three outcomes of ``analyze_pdf_with_vlm_batched`` are told apart:
    an ``"error"`` result, an ``{"status": "analyzing"}`` result (another run
    is already in progress, so nothing was completed here), and a finished
    analysis.
    """
    logger.info("초기 PDF 뢄석 μ‹œμž‘...")
    try:
        result = await analyze_pdf_with_vlm_batched(PROMPT_PDF_ID)
    except Exception as e:
        logger.error(f"초기 뢄석 μ˜ˆμ™Έ: {e}")
        return

    if "error" in result:
        logger.error(f"초기 뢄석 μ‹€νŒ¨: {result['error']}")
    elif result.get("status") == "analyzing":
        # The callee declined to start because a run is already active;
        # do not report completion for work that has not finished.
        logger.info("Initial PDF analysis skipped: another analysis is already in progress")
    else:
        logger.info("초기 PDF 뢄석 μ™„λ£Œ!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
 
323
 
324
  def extract_pdf_text(pdf_path: str) -> List[Dict[str, Any]]:
 
327
  chunks = []
328
  for page_num in range(len(doc)):
329
  page = doc[page_num]
 
330
  text = page.get_text("text")
331
 
 
332
  if not text.strip():
333
  text = page.get_text("blocks")
334
  if text:
335
  text = "\n".join([block[4] for block in text if len(block) > 4 and isinstance(block[4], str)])
336
 
 
337
  if not text.strip():
338
+ text = f"[νŽ˜μ΄μ§€ {page_num + 1} - 이미지 기반 νŽ˜μ΄μ§€]"
339
 
340
  chunks.append({
341
  "page": page_num + 1,
 
350
  return []
351
 
352
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
353
  async def query_pdf(pdf_id: str, query: str) -> Dict[str, Any]:
354
  """μΊμ‹œλœ VLM 뢄석 κ²°κ³ΌοΏ½οΏ½ 기반으둜 μ§ˆμ˜μ‘λ‹΅"""
355
  try:
 
360
  }
361
 
362
  # μΊμ‹œλœ 뢄석 κ²°κ³Ό 확인
363
+ analysis_data = load_analysis_cache(pdf_id)
364
 
365
+ if not analysis_data:
366
+ # 뢄석 μƒνƒœ 확인
367
+ if pdf_id in analysis_status:
368
+ status = analysis_status[pdf_id].get("status")
369
+ if status == "analyzing":
370
+ progress = analysis_status[pdf_id].get("progress", 0)
371
+ return {"error": f"뢄석 μ§„ν–‰ 쀑 ({progress}%)", "answer": f"PDF 뢄석이 μ§„ν–‰ μ€‘μž…λ‹ˆλ‹€ ({progress}%). μž μ‹œλ§Œ κΈ°λ‹€λ €μ£Όμ„Έμš”."}
372
+ elif status == "error":
373
+ return {"error": "뢄석 μ‹€νŒ¨", "answer": f"PDF 뢄석에 μ‹€νŒ¨ν–ˆμŠ΅λ‹ˆλ‹€: {analysis_status[pdf_id].get('error', 'μ•Œ 수 μ—†λŠ” 였λ₯˜')}"}
374
+ return {"error": "뢄석 데이터 μ—†μŒ", "answer": "PDFκ°€ 아직 λΆ„μ„λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€. μž μ‹œ ν›„ λ‹€μ‹œ μ‹œλ„ν•΄μ£Όμ„Έμš”."}
375
 
376
  analysis_text = analysis_data.get("analysis", "")
377
  total_pages = analysis_data.get("total_pages", 0)
 
390
  뢄석 λ‚΄μš©μ— μ—†λŠ” μ •λ³΄λŠ” "ν•΄λ‹Ή 정보λ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€"라고 μ†”μ§νžˆ λ‹΅ν•΄μ£Όμ„Έμš”.
391
 
392
  === PDF 뢄석 κ²°κ³Ό ===
393
+ {analysis_text[:12000]}
394
  =================="""
395
  },
396
  {
 
400
  ]
401
 
402
  try:
403
+ answer = call_fireworks_vlm_api(messages, max_tokens=4096, temperature=0.6)
404
+ return {
405
+ "answer": answer,
406
+ "pdf_id": pdf_id,
407
+ "query": query
408
+ }
 
 
 
 
 
 
 
 
 
409
  except Exception as api_error:
410
+ logger.error(f"Fireworks API 호좜 였λ₯˜: {api_error}")
411
  error_message = str(api_error)
412
+ return {"error": f"AI 였λ₯˜: {error_message}", "answer": "처리 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. μž μ‹œ ν›„ λ‹€μ‹œ μ‹œλ„ν•΄μ£Όμ„Έμš”."}
413
+
 
 
 
 
 
 
414
  except Exception as e:
415
  logger.error(f"μ§ˆμ˜μ‘λ‹΅ 처리 였λ₯˜: {e}")
416
  return {"error": str(e), "answer": "처리 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€."}
 
425
  "summary": "API ν‚€κ°€ μ—†μ–΄ μš”μ•½μ„ 생성할 수 μ—†μŠ΅λ‹ˆλ‹€."
426
  }
427
 
428
+ # μΊμ‹œλœ 뢄석 κ²°κ³Ό 확인
429
+ analysis_data = load_analysis_cache(pdf_id)
430
 
431
+ if not analysis_data:
432
+ # 뢄석 μƒνƒœ 확인
433
+ if pdf_id in analysis_status:
434
+ status = analysis_status[pdf_id].get("status")
435
+ if status == "analyzing":
436
+ progress = analysis_status[pdf_id].get("progress", 0)
437
+ return {"error": f"뢄석 μ§„ν–‰ 쀑", "summary": f"PDF 뢄석이 μ§„ν–‰ μ€‘μž…λ‹ˆλ‹€ ({progress}%). μž μ‹œλ§Œ κΈ°λ‹€λ €μ£Όμ„Έμš”."}
438
+ elif status == "error":
439
+ return {"error": "뢄석 μ‹€νŒ¨", "summary": f"PDF 뢄석에 μ‹€νŒ¨ν–ˆμŠ΅λ‹ˆλ‹€."}
440
+ return {"error": "뢄석 데이터 μ—†μŒ", "summary": "PDFκ°€ 아직 λΆ„μ„λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€."}
441
 
442
+ summary = analysis_data.get("summary", "")
443
  total_pages = analysis_data.get("total_pages", 0)
444
+ analyzed_pages = analysis_data.get("analyzed_pages", total_pages)
445
 
446
+ if summary:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
447
  return {
448
  "summary": summary,
449
  "pdf_id": pdf_id,
450
+ "total_pages": total_pages,
451
+ "analyzed_pages": analyzed_pages
452
+ }
453
+
454
+ # μš”μ•½μ΄ μ—†μœΌλ©΄ 뢄석 λ‚΄μš©μ—μ„œ μΆ”μΆœ
455
+ analysis_text = analysis_data.get("analysis", "")
456
+ if analysis_text:
457
+ return {
458
+ "summary": analysis_text[:500] + "...",
459
+ "pdf_id": pdf_id,
460
  "total_pages": total_pages
461
  }
462
+
463
+ return {"error": "μš”μ•½ μ—†μŒ", "summary": "μš”μ•½μ„ 생성할 수 μ—†μŠ΅λ‹ˆλ‹€."}
 
 
 
 
 
 
 
 
 
 
464
 
465
  except Exception as e:
466
  logger.error(f"PDF μš”μ•½ 생성 였λ₯˜: {e}")
 
587
  async def startup_event():
588
  if PROMPT_PDF_PATH.exists():
589
  logger.info(f"prompt.pdf 파일 발견: {PROMPT_PDF_PATH}")
590
+ # ν”Œλ¦½λΆ 캐싱
591
  asyncio.create_task(cache_pdf(str(PROMPT_PDF_PATH)))
592
+ # VLM 뢄석 - μ—λŸ¬ 핸듀링 포함
593
+ asyncio.create_task(run_initial_analysis())
594
  else:
595
  logger.warning(f"prompt.pdf νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€: {PROMPT_PDF_PATH}")
596
 
 
618
 
619
  @app.get("/api/analysis-status")
620
  async def get_analysis_status():
621
+ """VLM 뢄석 μƒνƒœ 확인"""
622
+ # λ¨Όμ € μΊμ‹œ 파일 확인
623
  cached = load_analysis_cache(PROMPT_PDF_ID)
624
  if cached:
625
  return {
626
  "status": "completed",
627
  "total_pages": cached.get("total_pages", 0),
628
+ "analyzed_pages": cached.get("analyzed_pages", 0),
629
  "created_at": cached.get("created_at", 0)
630
  }
631
+
632
+ # λ©”λͺ¨λ¦¬ μƒνƒœ 확인
633
+ if PROMPT_PDF_ID in analysis_status:
634
+ status_info = analysis_status[PROMPT_PDF_ID]
635
+ return {
636
+ "status": status_info.get("status", "unknown"),
637
+ "progress": status_info.get("progress", 0),
638
+ "error": status_info.get("error")
639
+ }
640
+
641
+ return {"status": "not_started"}
642
 
643
 
644
  @app.post("/api/reanalyze-pdf")
 
654
  cache_path.unlink()
655
  logger.info("κΈ°μ‘΄ VLM 뢄석 μΊμ‹œ μ‚­μ œ")
656
 
657
+ # μƒνƒœ μ΄ˆκΈ°ν™”
658
+ if PROMPT_PDF_ID in analysis_status:
659
+ del analysis_status[PROMPT_PDF_ID]
660
+
661
  # λ°±κ·ΈλΌμš΄λ“œμ—μ„œ μž¬λΆ„μ„ μ‹œμž‘
662
+ asyncio.create_task(run_initial_analysis())
663
 
664
  return {"status": "started", "message": "PDF μž¬λΆ„μ„μ„ μ‹œμž‘ν•©λ‹ˆλ‹€."}
665
  except Exception as e:
 
711
  return JSONResponse(content={"error": "PDF νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€"}, status_code=404)
712
 
713
  result = await query_pdf(PROMPT_PDF_ID, user_query)
 
714
  if "answer" in result:
715
  return result
716
  if "error" in result:
717
+ return JSONResponse(content=result, status_code=200)
718
  return result
719
  except Exception as e:
720
  logger.error(f"μ§ˆμ˜μ‘λ‹΅ API 였λ₯˜: {e}")
 
728
  return JSONResponse(content={"error": "PDF νŒŒμΌμ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€"}, status_code=404)
729
 
730
  result = await summarize_pdf(PROMPT_PDF_ID)
 
731
  if "summary" in result:
732
  return result
733
  if "error" in result:
734
+ return JSONResponse(content=result, status_code=200)
735
  return result
736
  except Exception as e:
737
  logger.error(f"PDF μš”μ•½ API 였λ₯˜: {e}")
 
1355
  let isAiChatActive = false;
1356
  let isAiProcessing = false;
1357
  let hasLoadedSummary = false;
1358
+ let analysisCheckInterval = null;
1359
 
1360
  function $id(id) { return document.getElementById(id); }
1361
 
 
1442
  }
1443
  }
1444
 
1445
async function checkAnalysisStatus() {
    // Fetches the current analysis state from /api/analysis-status.
    // Resolves to the parsed JSON payload on success, or to
    // { status: "error" } when the request fails, the server answers with a
    // non-2xx status, or the body is not valid JSON. Never rejects.
    try {
        const response = await fetch('/api/analysis-status');
        if (!response.ok) {
            // An HTTP error body has no usable `status` field; route it
            // through the same fallback as a network failure.
            throw new Error(`HTTP ${response.status}`);
        }
        return await response.json();
    } catch (e) {
        console.error("뢄석 μƒνƒœ 확인 였λ₯˜:", e);
        return { status: "error" };
    }
}
1455
+
1456
  async function loadPdfSummary() {
1457
  if (isAiProcessing || hasLoadedSummary) return;
1458
 
 
1460
  isAiProcessing = true;
1461
  addTypingIndicator();
1462
 
1463
+ // 뢄석 μƒνƒœ 확인
1464
+ const statusData = await checkAnalysisStatus();
 
1465
 
1466
+ if (statusData.status === 'analyzing') {
1467
  removeTypingIndicator();
1468
+ const progress = statusData.progress || 0;
1469
+ addChatMessage(`μ•ˆλ…•ν•˜μ„Έμš”! ν˜„μž¬ PDFλ₯Ό AIκ°€ λΆ„μ„ν•˜κ³  μžˆμŠ΅λ‹ˆλ‹€. πŸ“Š<br><br>μ§„ν–‰λ₯ : <strong>${progress}%</strong><br><small style="color:#999;">뢄석이 μ™„λ£Œλ˜λ©΄ μžλ™μœΌλ‘œ μ•Œλ €λ“œλ¦¬κ² μŠ΅λ‹ˆλ‹€.</small>`);
1470
  hasLoadedSummary = true;
1471
  isAiProcessing = false;
1472
 
1473
+ // 뢄석 μ™„λ£Œ 폴링
1474
+ startAnalysisPolling();
1475
+ return;
1476
+ }
1477
+
1478
+ if (statusData.status === 'error') {
1479
+ removeTypingIndicator();
1480
+ addChatMessage(`μ•ˆλ…•ν•˜μ„Έμš”! PDF 뢄석 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. ⚠️<br><br><small style="color:#e74c3c;">${statusData.error || 'μ•Œ 수 μ—†λŠ” 였λ₯˜'}</small><br><br>νŽ˜μ΄μ§€λ₯Ό μƒˆλ‘œκ³ μΉ¨ν•˜κ±°λ‚˜ μž μ‹œ ν›„ λ‹€μ‹œ μ‹œλ„ν•΄μ£Όμ„Έμš”.`);
1481
+ hasLoadedSummary = true;
1482
+ isAiProcessing = false;
1483
  return;
1484
  }
1485
 
1486
+ if (statusData.status === 'not_started') {
1487
+ removeTypingIndicator();
1488
+ addChatMessage(`μ•ˆλ…•ν•˜μ„Έμš”! PDF 뢄석이 아직 μ‹œμž‘λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€. πŸ”„<br><br><small style="color:#999;">μž μ‹œλ§Œ κΈ°λ‹€λ €μ£Όμ„Έμš”...</small>`);
1489
+ hasLoadedSummary = true;
1490
+ isAiProcessing = false;
1491
+ startAnalysisPolling();
1492
+ return;
1493
+ }
1494
+
1495
+ // 뢄석 μ™„λ£Œλ¨ - μš”μ•½ κ°€μ Έμ˜€κΈ°
1496
  const response = await fetch('/api/ai/summarize-pdf');
1497
  const data = await response.json();
1498
 
1499
  removeTypingIndicator();
1500
 
1501
  if (data.summary) {
1502
+ const pageInfo = data.analyzed_pages ? ` (${data.analyzed_pages}/${data.total_pages}νŽ˜μ΄μ§€ λΆ„μ„μ™„λ£Œ)` : '';
1503
  addChatMessage(`μ•ˆλ…•ν•˜μ„Έμš”! 이 PDF에 λŒ€ν•΄ 무엇이든 μ§ˆλ¬Έν•΄μ£Όμ„Έμš”.${pageInfo}<br><br><strong>πŸ“„ PDF μš”μ•½:</strong><br>${data.summary}`);
 
 
 
 
1504
  } else {
1505
  addChatMessage("μ•ˆλ…•ν•˜μ„Έμš”! PDF에 λŒ€ν•΄ μ§ˆλ¬Έν•΄μ£Όμ„Έμš”. μ΅œμ„ μ„ λ‹€ν•΄ λ‹΅λ³€ν•˜κ² μŠ΅λ‹ˆλ‹€.");
 
1506
  }
1507
+ hasLoadedSummary = true;
1508
+
1509
  } catch (error) {
1510
  console.error("PDF μš”μ•½ λ‘œλ“œ 였λ₯˜:", error);
1511
  removeTypingIndicator();
1512
+ addChatMessage("μ•ˆλ…•ν•˜μ„Έμš”! PDF에 λŒ€ν•΄ μ§ˆλ¬Έν•΄μ£Όμ„Έμš”.");
1513
  hasLoadedSummary = true;
1514
  } finally {
1515
  isAiProcessing = false;
1516
  }
1517
  }
1518
 
1519
function startAnalysisPolling() {
    // Polls the analysis status every 5 seconds until it is 'completed' or
    // 'error', posting a chat message in either case. Polling gives up
    // silently after 5 minutes. Only one polling loop runs at a time; the
    // active interval handle is stored in analysisCheckInterval.
    if (analysisCheckInterval) return;

    let stopTimer = null;

    // Stops THIS polling run only: clears its interval and its auto-stop
    // timer, and resets the shared handle only if it still belongs to us.
    function stopPolling() {
        clearInterval(intervalId);
        clearTimeout(stopTimer);
        if (analysisCheckInterval === intervalId) {
            analysisCheckInterval = null;
        }
    }

    const intervalId = setInterval(async () => {
        try {
            const data = await checkAnalysisStatus();

            // A slow request may resolve after polling was already stopped
            // (or after an overlapping tick handled the result); ignore it
            // so the completion/failure message is not posted twice.
            if (analysisCheckInterval !== intervalId) return;

            if (data.status === 'completed') {
                stopPolling();
                addChatMessage(`βœ… PDF 뢄석이 μ™„λ£Œλ˜μ—ˆμŠ΅λ‹ˆλ‹€! (${data.analyzed_pages || data.total_pages}νŽ˜μ΄μ§€)<br>이제 자유둭게 μ§ˆλ¬Έν•΄μ£Όμ„Έμš”.`);
            } else if (data.status === 'analyzing') {
                // Progress update (optional): logged only, not shown in chat.
                console.log(`뢄석 μ§„ν–‰ 쀑: ${data.progress}%`);
            } else if (data.status === 'error') {
                stopPolling();
                addChatMessage(`⚠️ PDF 뢄석 μ‹€νŒ¨: ${data.error || 'μ•Œ 수 μ—†λŠ” 였λ₯˜'}`);
            }
        } catch (e) {
            console.error("폴링 였λ₯˜:", e);
        }
    }, 5000); // check every 5 seconds
    analysisCheckInterval = intervalId;

    // Stop automatically after 5 minutes. The timer is cancelled by
    // stopPolling(), so it can never clear a later polling run.
    stopTimer = setTimeout(stopPolling, 300000);
}
1551
 
1552
  async function submitQuestion(question) {
 
1559
  addChatMessage(question, true);
1560
 
1561
  // 뢄석 μƒνƒœ 확인
1562
+ const statusData = await checkAnalysisStatus();
 
1563
 
1564
  if (statusData.status !== 'completed') {
1565
+ if (statusData.status === 'analyzing') {
1566
+ addChatMessage(`PDF 뢄석이 μ§„ν–‰ μ€‘μž…λ‹ˆλ‹€ (${statusData.progress || 0}%). μ™„λ£Œ ν›„ μ§ˆλ¬Έν•΄μ£Όμ„Έμš”. ⏳`);
1567
+ } else {
1568
+ addChatMessage("PDF 뢄석이 아직 μ™„λ£Œλ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€. μž μ‹œλ§Œ κΈ°λ‹€λ €μ£Όμ„Έμš”.");
1569
+ }
1570
  isAiProcessing = false;
1571
  $id('aiChatSubmit').disabled = false;
1572
+ $id('aiChatInput').value = question;
1573
  return;
1574
  }
1575
 
 
1579
  method: 'POST',
1580
  headers: { 'Content-Type': 'application/json' },
1581
  body: JSON.stringify({ query: question }),
1582
+ signal: AbortSignal.timeout(120000)
1583
  });
1584
 
1585
  const data = await response.json();
 
1624
  }
1625
 
1626
  function updateLoading(message, progress) {
1627
+ const text = document.querySelector('.loading-text');
1628
  if (text) text.textContent = message;
1629
  const bar = $id('progressBar');
1630
  if (bar && progress !== undefined) bar.style.width = `${progress}%`;
 
1754
  }
1755
  }
1756
 
 
1757
  const cacheResponse = await fetch('/api/cached-pdf');
1758
  let cachedData = await cacheResponse.json();
1759
 
 
1763
  return;
1764
  }
1765
 
 
1766
  while (cachedData.status === "processing" || cachedData.status === "started") {
1767
  await new Promise(resolve => setTimeout(resolve, 1000));
1768
 
 
1798
  document.addEventListener('DOMContentLoaded', function() {
1799
  initializeAudio();
1800
 
 
1801
  $id('aiButton').addEventListener('click', () => toggleAiChat(!isAiChatActive));
1802
  $id('aiChatClose').addEventListener('click', () => toggleAiChat(false));
1803
 
 
1804
  $id('aiChatForm').addEventListener('submit', function(e) {
1805
  e.preventDefault();
1806
  const question = $id('aiChatInput').value.trim();
 
1809
  }
1810
  });
1811
 
 
1812
  loadPDF();
1813
  });
1814
  </script>