Alstears committed on
Commit
28cfc5d
·
verified ·
1 Parent(s): 5a074c3

Upload server.py

Browse files
Files changed (1) hide show
  1. server.py +120 -80
server.py CHANGED
@@ -55,8 +55,8 @@ else:
55
 
56
  GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY', "AIzaSyAhVnCOblQvDvq9VIG6A4ztOdGh_yqarfk")
57
  GEMINI_BASE_URL = os.environ.get('GEMINI_BASE_URL', "https://generativelanguage.googleapis.com/v1beta")
58
- GEMINI_MODEL = os.environ.get('GEMINI_MODEL', "gemini-2.5-flash")
59
- GEMINI_ANALYSIS_MODEL = os.environ.get('GEMINI_ANALYSIS_MODEL', "gemini-3-flash-preview")
60
 
61
  GROQ_API_KEY = os.environ.get('GROQ_API_KEY', "")
62
  GROQ_MODEL = os.environ.get('GROQ_MODEL', "llama-3.3-70b-versatile")
@@ -604,35 +604,26 @@ async def _analyze_pdf_legacy(file_path: str, user: User) -> dict:
604
 
605
 
606
  async def analyze_pdf(file_path: str, user: User) -> dict:
607
- """Analyze PDF using adaptive chunked strategy for large documents."""
608
- # Kasih jeda awal (permintaan user)
609
  await asyncio.sleep(2)
610
 
611
  reader = PdfReader(file_path)
612
  total_pages = len(reader.pages)
613
 
614
- # Small documents: use legacy single-pass for quality
615
- # Menaikkan threshold ke 5 halaman karena Gemini Pro/Flash sanggup
616
- if total_pages <= 5:
617
  return await _analyze_pdf_legacy(file_path, user)
618
 
619
- # Perbesar chunk_size agar jumlah request lebih sedikit (hindari 429)
620
- if total_pages <= 20:
621
- chunk_size = 7
622
- elif total_pages <= 50:
623
- chunk_size = 12
624
- else:
625
- chunk_size = 15
626
-
627
- overlap = 2 # Overlap sedikit saja untuk menjaga konteks tanpa memboroskan request
628
  step = chunk_size - overlap
629
 
630
- # Calculate exact number of batches needed
631
  remaining = total_pages - chunk_size
632
- if remaining <= 0:
633
- total_batches = 1
634
- else:
635
- total_batches = (remaining + step - 1) // step + 1
636
 
637
  all_concepts = []
638
  all_diagrams = []
@@ -640,10 +631,8 @@ async def analyze_pdf(file_path: str, user: User) -> dict:
640
  summaries = []
641
 
642
  for batch_idx in range(total_batches):
643
- # Berikan jeda lebih lama jika file besar agar tidak menabrak Rate Limit Google
644
  if batch_idx > 0:
645
- delay = 4 if total_pages > 30 else 1
646
- await asyncio.sleep(delay)
647
 
648
  start = 1 + batch_idx * step
649
  end = min(start + chunk_size - 1, total_pages)
@@ -657,16 +646,11 @@ async def analyze_pdf(file_path: str, user: User) -> dict:
657
  all_concepts.extend(batch.get("key_concepts", []))
658
  all_diagrams.extend(batch.get("diagrams", []))
659
  all_objectives.extend(batch.get("learning_objectives", []))
660
- all_objectives.extend(batch.get("learning_objectives", []))
661
 
662
- # Deduplicate and merge
663
- unique_concepts = _deduplicate_concepts(all_concepts)
664
- unique_concepts = unique_concepts[:20]
665
 
666
- merged_diagrams = _merge_diagrams(all_diagrams)
667
- merged_diagrams = merged_diagrams[:10]
668
-
669
- # Deduplicate objectives
670
  seen_obj = set()
671
  unique_objectives = []
672
  for obj in all_objectives:
@@ -674,56 +658,13 @@ async def analyze_pdf(file_path: str, user: User) -> dict:
674
  if norm and norm not in seen_obj:
675
  seen_obj.add(norm)
676
  unique_objectives.append(obj)
677
- unique_objectives = unique_objectives[:10]
678
-
679
- # Synthesize final summary
680
- final_summary = await _synthesize_summary_from_chunks(summaries, user)
681
-
682
- # Fallback: if very few concepts from chunked analysis on large doc, try legacy and merge
683
- title = ""
684
- if len(unique_concepts) < 5 and total_pages > 10:
685
- try:
686
- legacy = await _analyze_pdf_legacy(file_path, user)
687
- # Merge: keep existing concepts, add any new from legacy up to cap
688
- legacy_concepts = legacy.get("key_concepts", [])
689
- for lc in legacy_concepts:
690
- if len(unique_concepts) >= 20:
691
- break
692
- norm = _normalize_concept_name(lc.get("concept", ""))
693
- if norm and norm not in {_normalize_concept_name(c.get("concept", "")) for c in unique_concepts}:
694
- unique_concepts.append(lc)
695
- # Merge diagrams similarly
696
- legacy_diagrams = legacy.get("diagrams", [])
697
- for ld in legacy_diagrams:
698
- if len(merged_diagrams) >= 10:
699
- break
700
- key = (_normalize_concept_name(ld.get("name", "")), ld.get("type", ""))
701
- if key not in {(_normalize_concept_name(d.get("name", "")), d.get("type", "")) for d in merged_diagrams}:
702
- merged_diagrams.append(ld)
703
- # Merge objectives
704
- for lo in legacy.get("learning_objectives", []):
705
- if len(unique_objectives) >= 10:
706
- break
707
- norm = _normalize_concept_name(lo)
708
- if norm and norm not in seen_obj:
709
- seen_obj.add(norm)
710
- unique_objectives.append(lo)
711
- # Use legacy title if missing short summary
712
- if not title and legacy.get("title"):
713
- title = legacy.get("title")
714
- except Exception as e:
715
- logger.warning(f"Legacy fallback failed: {e}")
716
-
717
- # Estimate title from first batch if still empty
718
- if not title:
719
- title = f"Dokumen ({total_pages} halaman)"
720
-
721
  return {
722
- "title": title,
723
- "summary": final_summary,
724
  "key_concepts": unique_concepts,
725
  "diagrams": merged_diagrams,
726
- "learning_objectives": unique_objectives
727
  }
728
 
729
 
@@ -3121,6 +3062,84 @@ async def _sync_local_audios_to_mongodb():
3121
  for file_path_str in wav_files:
3122
  file_path = Path(file_path_str)
3123
  filename = file_path.name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3124
  existing = await db.audio_files.find_one({"filename": filename}, {"_id": 1})
3125
  if not existing:
3126
  logger.info(f"Mengunggah file audio baru ke MongoDB: {filename}")
@@ -3148,4 +3167,25 @@ async def startup():
3148
 
3149
  @fastapi_app.on_event("shutdown")
3150
  async def shutdown_db_client():
3151
- client.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY', "AIzaSyAhVnCOblQvDvq9VIG6A4ztOdGh_yqarfk")
57
  GEMINI_BASE_URL = os.environ.get('GEMINI_BASE_URL', "https://generativelanguage.googleapis.com/v1beta")
58
+ GEMINI_MODEL = os.environ.get('GEMINI_MODEL', "gemini-1.5-flash")
59
+ GEMINI_ANALYSIS_MODEL = os.environ.get('GEMINI_ANALYSIS_MODEL', "gemini-1.5-pro")
60
 
61
  GROQ_API_KEY = os.environ.get('GROQ_API_KEY', "")
62
  GROQ_MODEL = os.environ.get('GROQ_MODEL', "llama-3.3-70b-versatile")
 
604
 
605
 
606
  async def analyze_pdf(file_path: str, user: User) -> dict:
607
+ """Analyze PDF using single-pass strategy for high quality with Gemini 2.5."""
608
+ # Kasih jeda awal sesuai permintaan user
609
  await asyncio.sleep(2)
610
 
611
  reader = PdfReader(file_path)
612
  total_pages = len(reader.pages)
613
 
614
+ # Gemini 2.5 memiliki context window 1jt+ token.
615
+ # Kita gunakan single-pass untuk dokumen sampai 200 halaman agar kualitas Anand Kumar maksimal.
616
+ if total_pages <= 200:
617
  return await _analyze_pdf_legacy(file_path, user)
618
 
619
+ # Chunking hanya untuk file ekstrim (> 200 halaman)
620
+ chunk_size = 50
621
+ overlap = 5
 
 
 
 
 
 
622
  step = chunk_size - overlap
623
 
624
+ # Calculate exact number of batches
625
  remaining = total_pages - chunk_size
626
+ total_batches = (remaining + step - 1) // step + 1 if remaining > 0 else 1
 
 
 
627
 
628
  all_concepts = []
629
  all_diagrams = []
 
631
  summaries = []
632
 
633
  for batch_idx in range(total_batches):
 
634
  if batch_idx > 0:
635
+ await asyncio.sleep(5) # Jeda lebih lama untuk file raksasa
 
636
 
637
  start = 1 + batch_idx * step
638
  end = min(start + chunk_size - 1, total_pages)
 
646
  all_concepts.extend(batch.get("key_concepts", []))
647
  all_diagrams.extend(batch.get("diagrams", []))
648
  all_objectives.extend(batch.get("learning_objectives", []))
 
649
 
650
+ # Final merge logic
651
+ unique_concepts = _deduplicate_concepts(all_concepts)[:25]
652
+ merged_diagrams = _merge_diagrams(all_diagrams)[:12]
653
 
 
 
 
 
654
  seen_obj = set()
655
  unique_objectives = []
656
  for obj in all_objectives:
 
658
  if norm and norm not in seen_obj:
659
  seen_obj.add(norm)
660
  unique_objectives.append(obj)
661
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
662
  return {
663
+ "title": f"Analisis Dokumen ({total_pages} hal)",
664
+ "summary": await _synthesize_summary_from_chunks(summaries, user),
665
  "key_concepts": unique_concepts,
666
  "diagrams": merged_diagrams,
667
+ "learning_objectives": unique_objectives[:12]
668
  }
669
 
670
 
 
3062
  for file_path_str in wav_files:
3063
  file_path = Path(file_path_str)
3064
  filename = file_path.name
3065
+
3066
+ if file_path.stat().st_size > 15 * 1024 * 1024:
3067
+ logger.warning(f"File audio {filename} terlalu besar untuk MongoDB (>15MB), dilewati.")
3068
+ continue
3069
+
3070
+ existing = await db.audio_files.find_one({"filename": filename}, {"_id": 1})
3071
+ if not existing:
3072
+ logger.info(f"Mengunggah file audio baru ke MongoDB: {filename}")
3073
+ with open(file_path_str, "rb") as f:
3074
+ data = f.read()
3075
+ await db.audio_files.update_one(
3076
+ {"filename": filename},
3077
+ {"$set": {
3078
+ "filename": filename,
3079
+ "data": Binary(data),
3080
+ "created_at": datetime.now(timezone.utc).isoformat()
3081
+ }},
3082
+ upsert=True
3083
+ )
3084
+ logger.info("Sinkronisasi audio selesai.")
3085
+ except Exception as e:
3086
+ logger.warning(f"Gagal melakukan sinkronisasi audio ke MongoDB: {e}")
3087
+
3088
+
3089
+ @fastapi_app.on_event("startup")
3090
+ async def startup():
3091
+ await _ensure_pdfs_bucket()
3092
+ await _sync_local_audios_to_mongodb()
3093
+
3094
+
3095
+ @fastapi_app.on_event("shutdown")
3096
+ async def shutdown_db_client():
3097
+ client.close()
3098
+
3099
+
3100
+ # ============== Supabase Storage helpers ==============
3101
+ SUPABASE_STORAGE_URL = f"{SUPABASE_URL}/storage/v1" if SUPABASE_URL else ""
3102
+
3103
+
3104
+ async def _ensure_pdfs_bucket():
3105
+ if not SUPABASE_STORAGE_URL or not SUPABASE_URL:
3106
+ return
3107
+ try:
3108
+ async with httpx.AsyncClient() as hc:
3109
+ await hc.post(
3110
+ f"{SUPABASE_STORAGE_URL}/bucket",
3111
+ headers={
3112
+ "Authorization": f"Bearer {SUPABASE_ANON_KEY}",
3113
+ "Content-Type": "application/json",
3114
+ },
3115
+ json={"id": "pdfs", "name": "pdfs", "public": True},
3116
+ )
3117
+ except Exception:
3118
 pass
 fastapi_app.include_router(api_router)
3119
+ fastapi_app.add_middleware(
3120
+ CORSMiddleware,
3121
+ allow_credentials=True,
3122
+ allow_origins=os.environ.get('CORS_ORIGINS', '*').split(','),
3123
+ allow_methods=["*"],
3124
+ allow_headers=["*"],
3125
+ )
3126
+
3127
+
3128
+ async def _sync_local_audios_to_mongodb():
3129
+ logger.info("Memulai sinkronisasi file audio lokal ke MongoDB...")
3130
+ try:
3131
+ if not AUDIO_DIR.exists():
3132
+ return
3133
+ import glob
3134
+ wav_files = glob.glob(str(AUDIO_DIR / "*.wav"))
3135
+ for file_path_str in wav_files:
3136
+ file_path = Path(file_path_str)
3137
+ filename = file_path.name
3138
+
3139
+ if file_path.stat().st_size > 15 * 1024 * 1024:
3140
+ logger.warning(f"File audio {filename} terlalu besar untuk MongoDB (>15MB), dilewati.")
3141
+ continue
3142
+
3143
  existing = await db.audio_files.find_one({"filename": filename}, {"_id": 1})
3144
  if not existing:
3145
  logger.info(f"Mengunggah file audio baru ke MongoDB: {filename}")
 
3167
 
3168
  @fastapi_app.on_event("shutdown")
3169
  async def shutdown_db_client():
3170
+ client.close()
3171
+
3172
+
3173
+ # ============== Supabase Storage helpers ==============
3174
+ SUPABASE_STORAGE_URL = f"{SUPABASE_URL}/storage/v1" if SUPABASE_URL else ""
3175
+
3176
+
3177
+ async def _ensure_pdfs_bucket():
3178
+ if not SUPABASE_STORAGE_URL or not SUPABASE_URL:
3179
+ return
3180
+ try:
3181
+ async with httpx.AsyncClient() as hc:
3182
+ await hc.post(
3183
+ f"{SUPABASE_STORAGE_URL}/bucket",
3184
+ headers={
3185
+ "Authorization": f"Bearer {SUPABASE_ANON_KEY}",
3186
+ "Content-Type": "application/json",
3187
+ },
3188
+ json={"id": "pdfs", "name": "pdfs", "public": True},
3189
+ )
3190
+ except Exception:
3191
+ pass