Spaces:
Sleeping
Sleeping
Upload server.py
Browse files
server.py
CHANGED
|
@@ -55,8 +55,8 @@ else:
|
|
| 55 |
|
| 56 |
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY', "AIzaSyAhVnCOblQvDvq9VIG6A4ztOdGh_yqarfk")
|
| 57 |
GEMINI_BASE_URL = os.environ.get('GEMINI_BASE_URL', "https://generativelanguage.googleapis.com/v1beta")
|
| 58 |
-
GEMINI_MODEL = os.environ.get('GEMINI_MODEL', "gemini-
|
| 59 |
-
GEMINI_ANALYSIS_MODEL = os.environ.get('GEMINI_ANALYSIS_MODEL', "gemini-
|
| 60 |
|
| 61 |
GROQ_API_KEY = os.environ.get('GROQ_API_KEY', "")
|
| 62 |
GROQ_MODEL = os.environ.get('GROQ_MODEL', "llama-3.3-70b-versatile")
|
|
@@ -604,35 +604,26 @@ async def _analyze_pdf_legacy(file_path: str, user: User) -> dict:
|
|
| 604 |
|
| 605 |
|
| 606 |
async def analyze_pdf(file_path: str, user: User) -> dict:
|
| 607 |
-
"""Analyze PDF using
|
| 608 |
-
# Kasih jeda awal
|
| 609 |
await asyncio.sleep(2)
|
| 610 |
|
| 611 |
reader = PdfReader(file_path)
|
| 612 |
total_pages = len(reader.pages)
|
| 613 |
|
| 614 |
-
#
|
| 615 |
-
#
|
| 616 |
-
if total_pages <=
|
| 617 |
return await _analyze_pdf_legacy(file_path, user)
|
| 618 |
|
| 619 |
-
#
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
elif total_pages <= 50:
|
| 623 |
-
chunk_size = 12
|
| 624 |
-
else:
|
| 625 |
-
chunk_size = 15
|
| 626 |
-
|
| 627 |
-
overlap = 2 # Overlap sedikit saja untuk menjaga konteks tanpa memboroskan request
|
| 628 |
step = chunk_size - overlap
|
| 629 |
|
| 630 |
-
# Calculate exact number of batches
|
| 631 |
remaining = total_pages - chunk_size
|
| 632 |
-
if remaining
|
| 633 |
-
total_batches = 1
|
| 634 |
-
else:
|
| 635 |
-
total_batches = (remaining + step - 1) // step + 1
|
| 636 |
|
| 637 |
all_concepts = []
|
| 638 |
all_diagrams = []
|
|
@@ -640,10 +631,8 @@ async def analyze_pdf(file_path: str, user: User) -> dict:
|
|
| 640 |
summaries = []
|
| 641 |
|
| 642 |
for batch_idx in range(total_batches):
|
| 643 |
-
# Berikan jeda lebih lama jika file besar agar tidak menabrak Rate Limit Google
|
| 644 |
if batch_idx > 0:
|
| 645 |
-
|
| 646 |
-
await asyncio.sleep(delay)
|
| 647 |
|
| 648 |
start = 1 + batch_idx * step
|
| 649 |
end = min(start + chunk_size - 1, total_pages)
|
|
@@ -657,16 +646,11 @@ async def analyze_pdf(file_path: str, user: User) -> dict:
|
|
| 657 |
all_concepts.extend(batch.get("key_concepts", []))
|
| 658 |
all_diagrams.extend(batch.get("diagrams", []))
|
| 659 |
all_objectives.extend(batch.get("learning_objectives", []))
|
| 660 |
-
all_objectives.extend(batch.get("learning_objectives", []))
|
| 661 |
|
| 662 |
-
#
|
| 663 |
-
unique_concepts = _deduplicate_concepts(all_concepts)
|
| 664 |
-
|
| 665 |
|
| 666 |
-
merged_diagrams = _merge_diagrams(all_diagrams)
|
| 667 |
-
merged_diagrams = merged_diagrams[:10]
|
| 668 |
-
|
| 669 |
-
# Deduplicate objectives
|
| 670 |
seen_obj = set()
|
| 671 |
unique_objectives = []
|
| 672 |
for obj in all_objectives:
|
|
@@ -674,56 +658,13 @@ async def analyze_pdf(file_path: str, user: User) -> dict:
|
|
| 674 |
if norm and norm not in seen_obj:
|
| 675 |
seen_obj.add(norm)
|
| 676 |
unique_objectives.append(obj)
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
# Synthesize final summary
|
| 680 |
-
final_summary = await _synthesize_summary_from_chunks(summaries, user)
|
| 681 |
-
|
| 682 |
-
# Fallback: if very few concepts from chunked analysis on large doc, try legacy and merge
|
| 683 |
-
title = ""
|
| 684 |
-
if len(unique_concepts) < 5 and total_pages > 10:
|
| 685 |
-
try:
|
| 686 |
-
legacy = await _analyze_pdf_legacy(file_path, user)
|
| 687 |
-
# Merge: keep existing concepts, add any new from legacy up to cap
|
| 688 |
-
legacy_concepts = legacy.get("key_concepts", [])
|
| 689 |
-
for lc in legacy_concepts:
|
| 690 |
-
if len(unique_concepts) >= 20:
|
| 691 |
-
break
|
| 692 |
-
norm = _normalize_concept_name(lc.get("concept", ""))
|
| 693 |
-
if norm and norm not in {_normalize_concept_name(c.get("concept", "")) for c in unique_concepts}:
|
| 694 |
-
unique_concepts.append(lc)
|
| 695 |
-
# Merge diagrams similarly
|
| 696 |
-
legacy_diagrams = legacy.get("diagrams", [])
|
| 697 |
-
for ld in legacy_diagrams:
|
| 698 |
-
if len(merged_diagrams) >= 10:
|
| 699 |
-
break
|
| 700 |
-
key = (_normalize_concept_name(ld.get("name", "")), ld.get("type", ""))
|
| 701 |
-
if key not in {(_normalize_concept_name(d.get("name", "")), d.get("type", "")) for d in merged_diagrams}:
|
| 702 |
-
merged_diagrams.append(ld)
|
| 703 |
-
# Merge objectives
|
| 704 |
-
for lo in legacy.get("learning_objectives", []):
|
| 705 |
-
if len(unique_objectives) >= 10:
|
| 706 |
-
break
|
| 707 |
-
norm = _normalize_concept_name(lo)
|
| 708 |
-
if norm and norm not in seen_obj:
|
| 709 |
-
seen_obj.add(norm)
|
| 710 |
-
unique_objectives.append(lo)
|
| 711 |
-
# Use legacy title if missing short summary
|
| 712 |
-
if not title and legacy.get("title"):
|
| 713 |
-
title = legacy.get("title")
|
| 714 |
-
except Exception as e:
|
| 715 |
-
logger.warning(f"Legacy fallback failed: {e}")
|
| 716 |
-
|
| 717 |
-
# Estimate title from first batch if still empty
|
| 718 |
-
if not title:
|
| 719 |
-
title = f"Dokumen ({total_pages} halaman)"
|
| 720 |
-
|
| 721 |
return {
|
| 722 |
-
"title":
|
| 723 |
-
"summary":
|
| 724 |
"key_concepts": unique_concepts,
|
| 725 |
"diagrams": merged_diagrams,
|
| 726 |
-
"learning_objectives": unique_objectives
|
| 727 |
}
|
| 728 |
|
| 729 |
|
|
@@ -3121,6 +3062,84 @@ async def _sync_local_audios_to_mongodb():
|
|
| 3121 |
for file_path_str in wav_files:
|
| 3122 |
file_path = Path(file_path_str)
|
| 3123 |
filename = file_path.name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3124 |
existing = await db.audio_files.find_one({"filename": filename}, {"_id": 1})
|
| 3125 |
if not existing:
|
| 3126 |
logger.info(f"Mengunggah file audio baru ke MongoDB: {filename}")
|
|
@@ -3148,4 +3167,25 @@ async def startup():
|
|
| 3148 |
|
| 3149 |
@fastapi_app.on_event("shutdown")
|
| 3150 |
async def shutdown_db_client():
|
| 3151 |
-
client.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY', "AIzaSyAhVnCOblQvDvq9VIG6A4ztOdGh_yqarfk")
|
| 57 |
GEMINI_BASE_URL = os.environ.get('GEMINI_BASE_URL', "https://generativelanguage.googleapis.com/v1beta")
|
| 58 |
+
GEMINI_MODEL = os.environ.get('GEMINI_MODEL', "gemini-1.5-flash")
|
| 59 |
+
GEMINI_ANALYSIS_MODEL = os.environ.get('GEMINI_ANALYSIS_MODEL', "gemini-1.5-pro")
|
| 60 |
|
| 61 |
GROQ_API_KEY = os.environ.get('GROQ_API_KEY', "")
|
| 62 |
GROQ_MODEL = os.environ.get('GROQ_MODEL', "llama-3.3-70b-versatile")
|
|
|
|
| 604 |
|
| 605 |
|
| 606 |
async def analyze_pdf(file_path: str, user: User) -> dict:
|
| 607 |
+
"""Analyze PDF using single-pass strategy for high quality with Gemini 2.5."""
|
| 608 |
+
# Kasih jeda awal sesuai permintaan user
|
| 609 |
await asyncio.sleep(2)
|
| 610 |
|
| 611 |
reader = PdfReader(file_path)
|
| 612 |
total_pages = len(reader.pages)
|
| 613 |
|
| 614 |
+
# Gemini 2.5 memiliki context window 1jt+ token.
|
| 615 |
+
# Kita gunakan single-pass untuk dokumen sampai 200 halaman agar kualitas Anand Kumar maksimal.
|
| 616 |
+
if total_pages <= 200:
|
| 617 |
return await _analyze_pdf_legacy(file_path, user)
|
| 618 |
|
| 619 |
+
# Chunking hanya untuk file ekstrim (> 200 halaman)
|
| 620 |
+
chunk_size = 50
|
| 621 |
+
overlap = 5
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 622 |
step = chunk_size - overlap
|
| 623 |
|
| 624 |
+
# Calculate exact number of batches
|
| 625 |
remaining = total_pages - chunk_size
|
| 626 |
+
total_batches = (remaining + step - 1) // step + 1 if remaining > 0 else 1
|
|
|
|
|
|
|
|
|
|
| 627 |
|
| 628 |
all_concepts = []
|
| 629 |
all_diagrams = []
|
|
|
|
| 631 |
summaries = []
|
| 632 |
|
| 633 |
for batch_idx in range(total_batches):
|
|
|
|
| 634 |
if batch_idx > 0:
|
| 635 |
+
await asyncio.sleep(5) # Jeda lebih lama untuk file raksasa
|
|
|
|
| 636 |
|
| 637 |
start = 1 + batch_idx * step
|
| 638 |
end = min(start + chunk_size - 1, total_pages)
|
|
|
|
| 646 |
all_concepts.extend(batch.get("key_concepts", []))
|
| 647 |
all_diagrams.extend(batch.get("diagrams", []))
|
| 648 |
all_objectives.extend(batch.get("learning_objectives", []))
|
|
|
|
| 649 |
|
| 650 |
+
# Final merge logic
|
| 651 |
+
unique_concepts = _deduplicate_concepts(all_concepts)[:25]
|
| 652 |
+
merged_diagrams = _merge_diagrams(all_diagrams)[:12]
|
| 653 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 654 |
seen_obj = set()
|
| 655 |
unique_objectives = []
|
| 656 |
for obj in all_objectives:
|
|
|
|
| 658 |
if norm and norm not in seen_obj:
|
| 659 |
seen_obj.add(norm)
|
| 660 |
unique_objectives.append(obj)
|
| 661 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 662 |
return {
|
| 663 |
+
"title": f"Analisis Dokumen ({total_pages} hal)",
|
| 664 |
+
"summary": await _synthesize_summary_from_chunks(summaries, user),
|
| 665 |
"key_concepts": unique_concepts,
|
| 666 |
"diagrams": merged_diagrams,
|
| 667 |
+
"learning_objectives": unique_objectives[:12]
|
| 668 |
}
|
| 669 |
|
| 670 |
|
|
|
|
| 3062 |
for file_path_str in wav_files:
|
| 3063 |
file_path = Path(file_path_str)
|
| 3064 |
filename = file_path.name
|
| 3065 |
+
|
| 3066 |
+
if file_path.stat().st_size > 15 * 1024 * 1024:
|
| 3067 |
+
logger.warning(f"File audio {filename} terlalu besar untuk MongoDB (>15MB), dilewati.")
|
| 3068 |
+
continue
|
| 3069 |
+
|
| 3070 |
+
existing = await db.audio_files.find_one({"filename": filename}, {"_id": 1})
|
| 3071 |
+
if not existing:
|
| 3072 |
+
logger.info(f"Mengunggah file audio baru ke MongoDB: {filename}")
|
| 3073 |
+
with open(file_path_str, "rb") as f:
|
| 3074 |
+
data = f.read()
|
| 3075 |
+
await db.audio_files.update_one(
|
| 3076 |
+
{"filename": filename},
|
| 3077 |
+
{"$set": {
|
| 3078 |
+
"filename": filename,
|
| 3079 |
+
"data": Binary(data),
|
| 3080 |
+
"created_at": datetime.now(timezone.utc).isoformat()
|
| 3081 |
+
}},
|
| 3082 |
+
upsert=True
|
| 3083 |
+
)
|
| 3084 |
+
logger.info("Sinkronisasi audio selesai.")
|
| 3085 |
+
except Exception as e:
|
| 3086 |
+
logger.warning(f"Gagal melakukan sinkronisasi audio ke MongoDB: {e}")
|
| 3087 |
+
|
| 3088 |
+
|
| 3089 |
+
@fastapi_app.on_event("startup")
|
| 3090 |
+
async def startup():
|
| 3091 |
+
await _ensure_pdfs_bucket()
|
| 3092 |
+
await _sync_local_audios_to_mongodb()
|
| 3093 |
+
|
| 3094 |
+
|
| 3095 |
+
@fastapi_app.on_event("shutdown")
|
| 3096 |
+
async def shutdown_db_client():
|
| 3097 |
+
client.close()
|
| 3098 |
+
|
| 3099 |
+
|
| 3100 |
+
# ============== Supabase Storage helpers ==============
|
| 3101 |
+
SUPABASE_STORAGE_URL = f"{SUPABASE_URL}/storage/v1" if SUPABASE_URL else ""
|
| 3102 |
+
|
| 3103 |
+
|
| 3104 |
+
async def _ensure_pdfs_bucket():
|
| 3105 |
+
if not SUPABASE_STORAGE_URL or not SUPABASE_URL:
|
| 3106 |
+
return
|
| 3107 |
+
try:
|
| 3108 |
+
async with httpx.AsyncClient() as hc:
|
| 3109 |
+
await hc.post(
|
| 3110 |
+
f"{SUPABASE_STORAGE_URL}/bucket",
|
| 3111 |
+
headers={
|
| 3112 |
+
"Authorization": f"Bearer {SUPABASE_ANON_KEY}",
|
| 3113 |
+
"Content-Type": "application/json",
|
| 3114 |
+
},
|
| 3115 |
+
json={"id": "pdfs", "name": "pdfs", "public": True},
|
| 3116 |
+
)
|
| 3117 |
+
except Exception:
|
| 3118 |
+
passi_app.include_router(api_router)
|
| 3119 |
+
fastapi_app.add_middleware(
|
| 3120 |
+
CORSMiddleware,
|
| 3121 |
+
allow_credentials=True,
|
| 3122 |
+
allow_origins=os.environ.get('CORS_ORIGINS', '*').split(','),
|
| 3123 |
+
allow_methods=["*"],
|
| 3124 |
+
allow_headers=["*"],
|
| 3125 |
+
)
|
| 3126 |
+
|
| 3127 |
+
|
| 3128 |
+
async def _sync_local_audios_to_mongodb():
|
| 3129 |
+
logger.info("Memulai sinkronisasi file audio lokal ke MongoDB...")
|
| 3130 |
+
try:
|
| 3131 |
+
if not AUDIO_DIR.exists():
|
| 3132 |
+
return
|
| 3133 |
+
import glob
|
| 3134 |
+
wav_files = glob.glob(str(AUDIO_DIR / "*.wav"))
|
| 3135 |
+
for file_path_str in wav_files:
|
| 3136 |
+
file_path = Path(file_path_str)
|
| 3137 |
+
filename = file_path.name
|
| 3138 |
+
|
| 3139 |
+
if file_path.stat().st_size > 15 * 1024 * 1024:
|
| 3140 |
+
logger.warning(f"File audio {filename} terlalu besar untuk MongoDB (>15MB), dilewati.")
|
| 3141 |
+
continue
|
| 3142 |
+
|
| 3143 |
existing = await db.audio_files.find_one({"filename": filename}, {"_id": 1})
|
| 3144 |
if not existing:
|
| 3145 |
logger.info(f"Mengunggah file audio baru ke MongoDB: {filename}")
|
|
|
|
| 3167 |
|
| 3168 |
@fastapi_app.on_event("shutdown")
|
| 3169 |
async def shutdown_db_client():
|
| 3170 |
+
client.close()
|
| 3171 |
+
|
| 3172 |
+
|
| 3173 |
+
# ============== Supabase Storage helpers ==============
|
| 3174 |
+
SUPABASE_STORAGE_URL = f"{SUPABASE_URL}/storage/v1" if SUPABASE_URL else ""
|
| 3175 |
+
|
| 3176 |
+
|
| 3177 |
+
async def _ensure_pdfs_bucket():
|
| 3178 |
+
if not SUPABASE_STORAGE_URL or not SUPABASE_URL:
|
| 3179 |
+
return
|
| 3180 |
+
try:
|
| 3181 |
+
async with httpx.AsyncClient() as hc:
|
| 3182 |
+
await hc.post(
|
| 3183 |
+
f"{SUPABASE_STORAGE_URL}/bucket",
|
| 3184 |
+
headers={
|
| 3185 |
+
"Authorization": f"Bearer {SUPABASE_ANON_KEY}",
|
| 3186 |
+
"Content-Type": "application/json",
|
| 3187 |
+
},
|
| 3188 |
+
json={"id": "pdfs", "name": "pdfs", "public": True},
|
| 3189 |
+
)
|
| 3190 |
+
except Exception:
|
| 3191 |
+
pass
|