init
Browse files- .gitignore +2 -1
- app/main.py +75 -28
- scripts/ingest_hackathon_data.py +162 -0
- test_llm.json +0 -1
.gitignore
CHANGED
|
@@ -31,4 +31,5 @@ __pycache__
|
|
| 31 |
*.egg-info
|
| 32 |
venv/
|
| 33 |
env/
|
| 34 |
-
ENV/
|
|
|
|
|
|
| 31 |
*.egg-info
|
| 32 |
venv/
|
| 33 |
env/
|
| 34 |
+
ENV/
|
| 35 |
+
data/hackathon_data
|
app/main.py
CHANGED
|
@@ -143,7 +143,7 @@ def retrieve_documents(query: str, top_k: int = 3) -> List[Dict]:
|
|
| 143 |
documents.append({
|
| 144 |
'pdf_name': match['metadata'].get('pdf_name', 'unknown.pdf'),
|
| 145 |
'page_number': page_num,
|
| 146 |
-
'content': match['metadata'].get('
|
| 147 |
'score': match.get('score', 0.0)
|
| 148 |
})
|
| 149 |
|
|
@@ -250,34 +250,86 @@ async def llm_endpoint(request: Request):
|
|
| 250 |
Accepts two formats:
|
| 251 |
1. QuestionRequest: {"question": "...", "temperature": 0.2, "max_tokens": 1000}
|
| 252 |
2. ChatRequest: {"messages": [{"role": "user", "content": "..."}], ...}
|
|
|
|
|
|
|
| 253 |
"""
|
| 254 |
try:
|
| 255 |
# Parse request body
|
| 256 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
# Determine request format and extract query
|
| 259 |
-
|
| 260 |
# QuestionRequest format
|
| 261 |
query = body.get("question")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
temperature = body.get("temperature", 0.2)
|
| 263 |
max_tokens = body.get("max_tokens", 1000)
|
| 264 |
-
is_simple_format = True
|
| 265 |
elif "messages" in body:
|
| 266 |
# ChatRequest format
|
| 267 |
messages = body.get("messages", [])
|
| 268 |
if not messages:
|
| 269 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
|
| 271 |
user_messages = [msg for msg in messages if msg.get("role") == "user"]
|
| 272 |
if not user_messages:
|
| 273 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
|
| 275 |
query = user_messages[-1].get("content")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
temperature = body.get("temperature", 0.2)
|
| 277 |
max_tokens = body.get("max_tokens", 1000)
|
| 278 |
-
is_simple_format = False
|
| 279 |
else:
|
| 280 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
|
| 282 |
# Retrieve relevant documents
|
| 283 |
documents = retrieve_documents(query, top_k=3)
|
|
@@ -290,35 +342,30 @@ async def llm_endpoint(request: Request):
|
|
| 290 |
max_tokens=max_tokens
|
| 291 |
)
|
| 292 |
|
| 293 |
-
# Format sources for response
|
| 294 |
sources = [
|
| 295 |
{
|
| 296 |
"pdf_name": doc['pdf_name'],
|
| 297 |
-
"page_number": doc['page_number'],
|
| 298 |
-
"
|
| 299 |
}
|
| 300 |
for doc in documents
|
| 301 |
]
|
| 302 |
|
| 303 |
-
#
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
)
|
| 310 |
-
else:
|
| 311 |
-
return ChatResponse(
|
| 312 |
-
response=answer,
|
| 313 |
-
sources=[{k: str(v) for k, v in s.items()} for s in sources],
|
| 314 |
-
response_time=round(response_time, 2),
|
| 315 |
-
model="Llama-4-Maverick-17B-128E-Instruct-FP8"
|
| 316 |
-
)
|
| 317 |
|
| 318 |
-
except HTTPException:
|
| 319 |
-
raise
|
| 320 |
except Exception as e:
|
| 321 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 322 |
|
| 323 |
|
| 324 |
# ============================================================================
|
|
|
|
| 143 |
documents.append({
|
| 144 |
'pdf_name': match['metadata'].get('pdf_name', 'unknown.pdf'),
|
| 145 |
'page_number': page_num,
|
| 146 |
+
'content': match['metadata'].get('content', ''), # Changed from 'text' to 'content'
|
| 147 |
'score': match.get('score', 0.0)
|
| 148 |
})
|
| 149 |
|
|
|
|
| 250 |
Accepts two formats:
|
| 251 |
1. QuestionRequest: {"question": "...", "temperature": 0.2, "max_tokens": 1000}
|
| 252 |
2. ChatRequest: {"messages": [{"role": "user", "content": "..."}], ...}
|
| 253 |
+
|
| 254 |
+
ALWAYS returns: {"answer": str, "sources": List[Dict]}
|
| 255 |
"""
|
| 256 |
try:
|
| 257 |
# Parse request body
|
| 258 |
+
try:
|
| 259 |
+
body = await request.json()
|
| 260 |
+
except:
|
| 261 |
+
# Empty or invalid JSON - return error in expected format
|
| 262 |
+
return AnswerResponse(
|
| 263 |
+
answer="Error: Invalid JSON in request body. Please send valid JSON with 'question' field.",
|
| 264 |
+
sources=[],
|
| 265 |
+
response_time=0.0
|
| 266 |
+
)
|
| 267 |
|
| 268 |
+
# Handle list format (validator sends list directly)
|
| 269 |
+
if isinstance(body, list):
|
| 270 |
+
# Validator format: [{"role": "user", "content": "..."}]
|
| 271 |
+
user_messages = [msg for msg in body if isinstance(msg, dict) and msg.get("role") == "user"]
|
| 272 |
+
if not user_messages:
|
| 273 |
+
return AnswerResponse(
|
| 274 |
+
answer="Error: No user message found in messages array.",
|
| 275 |
+
sources=[],
|
| 276 |
+
response_time=0.0
|
| 277 |
+
)
|
| 278 |
+
query = user_messages[-1].get("content")
|
| 279 |
+
if not query or not query.strip():
|
| 280 |
+
return AnswerResponse(
|
| 281 |
+
answer="Error: Empty message content provided.",
|
| 282 |
+
sources=[],
|
| 283 |
+
response_time=0.0
|
| 284 |
+
)
|
| 285 |
+
temperature = 0.2
|
| 286 |
+
max_tokens = 1000
|
| 287 |
# Determine request format and extract query
|
| 288 |
+
elif "question" in body:
|
| 289 |
# QuestionRequest format
|
| 290 |
query = body.get("question")
|
| 291 |
+
if not query or not query.strip():
|
| 292 |
+
return AnswerResponse(
|
| 293 |
+
answer="Error: Empty question provided. Please provide a valid question.",
|
| 294 |
+
sources=[],
|
| 295 |
+
response_time=0.0
|
| 296 |
+
)
|
| 297 |
temperature = body.get("temperature", 0.2)
|
| 298 |
max_tokens = body.get("max_tokens", 1000)
|
|
|
|
| 299 |
elif "messages" in body:
|
| 300 |
# ChatRequest format
|
| 301 |
messages = body.get("messages", [])
|
| 302 |
if not messages:
|
| 303 |
+
return AnswerResponse(
|
| 304 |
+
answer="Error: No messages provided in request.",
|
| 305 |
+
sources=[],
|
| 306 |
+
response_time=0.0
|
| 307 |
+
)
|
| 308 |
|
| 309 |
user_messages = [msg for msg in messages if msg.get("role") == "user"]
|
| 310 |
if not user_messages:
|
| 311 |
+
return AnswerResponse(
|
| 312 |
+
answer="Error: No user message found in messages array.",
|
| 313 |
+
sources=[],
|
| 314 |
+
response_time=0.0
|
| 315 |
+
)
|
| 316 |
|
| 317 |
query = user_messages[-1].get("content")
|
| 318 |
+
if not query or not query.strip():
|
| 319 |
+
return AnswerResponse(
|
| 320 |
+
answer="Error: Empty message content provided.",
|
| 321 |
+
sources=[],
|
| 322 |
+
response_time=0.0
|
| 323 |
+
)
|
| 324 |
temperature = body.get("temperature", 0.2)
|
| 325 |
max_tokens = body.get("max_tokens", 1000)
|
|
|
|
| 326 |
else:
|
| 327 |
+
# No question or messages field - return error in expected format
|
| 328 |
+
return AnswerResponse(
|
| 329 |
+
answer="Error: Invalid request format. Expected 'question' or 'messages' field in request body.",
|
| 330 |
+
sources=[],
|
| 331 |
+
response_time=0.0
|
| 332 |
+
)
|
| 333 |
|
| 334 |
# Retrieve relevant documents
|
| 335 |
documents = retrieve_documents(query, top_k=3)
|
|
|
|
| 342 |
max_tokens=max_tokens
|
| 343 |
)
|
| 344 |
|
| 345 |
+
# Format sources for response (validator expects pdf_name, page_number, content)
|
| 346 |
sources = [
|
| 347 |
{
|
| 348 |
"pdf_name": doc['pdf_name'],
|
| 349 |
+
"page_number": doc['page_number'], # Already converted to int
|
| 350 |
+
"content": doc['content'] # The actual document text
|
| 351 |
}
|
| 352 |
for doc in documents
|
| 353 |
]
|
| 354 |
|
| 355 |
+
# Always return AnswerResponse format (validator expects 'answer' and 'sources' keys)
|
| 356 |
+
return AnswerResponse(
|
| 357 |
+
answer=answer,
|
| 358 |
+
sources=sources,
|
| 359 |
+
response_time=round(response_time, 2)
|
| 360 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 361 |
|
|
|
|
|
|
|
| 362 |
except Exception as e:
|
| 363 |
+
# Always return expected format, even for errors
|
| 364 |
+
return AnswerResponse(
|
| 365 |
+
answer=f"Error: {str(e)}",
|
| 366 |
+
sources=[],
|
| 367 |
+
response_time=0.0
|
| 368 |
+
)
|
| 369 |
|
| 370 |
|
| 371 |
# ============================================================================
|
scripts/ingest_hackathon_data.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
Ingest ONLY PDFs from hackathon_data folder
Parallel processing with 4 workers
"""

import os
import sys
import time
import json
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed

from dotenv import load_dotenv

# Make sibling modules in this scripts/ directory importable: `ingest_pdfs`
# lives next to this file, so we add THIS file's directory (not the project
# root) to sys.path.  The original comment said "parent directory", which
# was misleading.
sys.path.insert(0, str(Path(__file__).parent))

# Load environment variables (.env) before reading any PINECONE_* settings.
load_dotenv()

# Project layout constants.
PROJECT_ROOT = Path(__file__).parent.parent
PDFS_DIR = PROJECT_ROOT / "data" / "hackathon_data"  # Changed to hackathon_data
OUTPUT_DIR = PROJECT_ROOT / "output" / "ingestion"

# Import the ingestion function (resolved via the sys.path insert above).
import ingest_pdfs
| 28 |
+
def worker_ingest(pdf_path: str):
    """Ingest one PDF inside a worker process.

    Never lets an exception escape: on success this returns whatever
    ingest_pdfs.ingest_pdf produced; on failure it returns a small error
    record so the parent keeps a uniform results list.
    """
    try:
        return ingest_pdfs.ingest_pdf(str(pdf_path))
    except Exception as exc:
        return {
            "pdf_name": Path(pdf_path).name,
            "status": "error",
            "error": str(exc),
        }
|
| 40 |
+
|
| 41 |
+
def _run_parallel(all_pdfs):
    """Fan the PDFs out over 4 worker processes.

    Prints per-PDF progress (with a rough ETA) as futures complete and
    returns (results, elapsed_seconds).  A future that raises is recorded
    as an error entry instead of aborting the run.
    """
    results = []
    completed = 0
    start_time = time.time()

    with ProcessPoolExecutor(max_workers=4) as executor:
        # Submit every PDF up front; as_completed yields them as they finish.
        future_to_pdf = {
            executor.submit(worker_ingest, str(pdf)): pdf
            for pdf in all_pdfs
        }

        for future in as_completed(future_to_pdf):
            pdf = future_to_pdf[future]
            completed += 1
            try:
                result = future.result()
                results.append(result)

                if result.get("status") == "success":
                    elapsed = time.time() - start_time
                    # ETA = remaining PDFs * observed average time per PDF.
                    eta = (len(all_pdfs) - completed) * (elapsed / completed) / 60
                    print(f"✅ [{completed}/{len(all_pdfs)}] {pdf.name}")
                    print(f"   📊 {result['num_vectors']} vectors, {result['time_total']:.1f}s")
                    print(f"   ⏱️  ETA: {eta:.1f} minutes remaining\n")
                else:
                    print(f"❌ [{completed}/{len(all_pdfs)}] {pdf.name} - {result.get('error', 'Unknown error')}\n")
            except Exception as e:
                # worker_ingest already catches; this guards pool-level failures.
                print(f"❌ [{completed}/{len(all_pdfs)}] {pdf.name} - Error: {e}\n")
                results.append({
                    "pdf_name": pdf.name,
                    "status": "error",
                    "error": str(e),
                })

    return results, time.time() - start_time


def _save_results(all_pdfs, results, successful, failed, total_time):
    """Persist a JSON summary of the run under OUTPUT_DIR."""
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    results_file = OUTPUT_DIR / "hackathon_data_ingestion.json"

    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump({
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "source_folder": "hackathon_data",
            "total_pdfs": len(all_pdfs),
            "successful": len(successful),
            "failed": len(failed),
            "total_time_seconds": round(total_time, 2),
            "results": results
        }, f, indent=2, ensure_ascii=False)

    print(f"\n📄 Results saved to: {results_file}")


def _print_pinecone_stats():
    """Best-effort: show final index totals; any failure only prints a note."""
    try:
        from pinecone import Pinecone
        pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
        index = pc.Index(os.getenv("PINECONE_INDEX_NAME", "hackathon"))
        stats = index.describe_index_stats()

        print("\n📊 Final Pinecone Stats:")
        print(f"   Total Vectors: {stats.get('total_vector_count', 0)}")
        print(f"   Dimensions: {stats.get('dimension', 0)}")
    except Exception as e:
        print(f"\nCould not fetch Pinecone stats: {e}")


def main():
    """Main parallel ingestion pipeline.

    Discovers PDFs under PDFS_DIR, ingests them 4 at a time via
    worker_ingest, then prints a summary, saves a JSON report, and shows
    final Pinecone index stats.
    """
    print("\n" + "="*70)
    print("🚀 HACKATHON DATA INGESTION (4x PARALLEL)")
    print("="*70)
    print(f"📂 PDF Directory: {PDFS_DIR}")
    print("⚡ Workers: 4 PDFs at once")
    print(f"🎯 Vector Database: Pinecone ({os.getenv('PINECONE_INDEX_NAME')})")
    print("="*70)

    # Discover the input set; bail out early when there is nothing to do.
    all_pdfs = sorted(PDFS_DIR.glob("*.pdf"))
    print(f"\n📚 Found {len(all_pdfs)} PDFs in hackathon_data folder")

    if not all_pdfs:
        print("\n❌ No PDFs found in hackathon_data folder!")
        return

    for pdf in all_pdfs:
        print(f"   → {pdf.name}")

    print("\n⚡ Starting parallel processing with 4 workers...")
    # Rough estimate: ~80s per PDF, divided across 4 workers.
    print(f"⏱️  Estimated time: ~{len(all_pdfs) * 80 / 4 / 60:.1f} minutes\n")

    results, total_time = _run_parallel(all_pdfs)

    # Summary
    print("\n" + "="*70)
    print("📊 INGESTION COMPLETE")
    print("="*70)

    successful = [r for r in results if r.get("status") == "success"]
    failed = [r for r in results if r.get("status") == "error"]

    print(f"\n✅ Successful: {len(successful)}/{len(all_pdfs)}")
    print(f"❌ Failed: {len(failed)}")
    print(f"⏱️  Total Time: {total_time/60:.1f} minutes")

    if successful:
        total_vectors = sum(r["num_vectors"] for r in successful)
        avg_time = sum(r["time_total"] for r in successful) / len(successful)
        print(f"\n📦 Total Vectors Uploaded: {total_vectors}")
        print(f"⏱️  Average Time per PDF: {avg_time:.1f}s")

    _save_results(all_pdfs, results, successful, failed, total_time)
    _print_pinecone_stats()

    print("\n" + "="*70)
    print("🎉 HACKATHON DATA INGESTION COMPLETE!")
    print("="*70)


if __name__ == "__main__":
    main()
|
test_llm.json
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
{"question":"Neft hasilatı haqqında nə məlumat var?"}
|
|
|
|
|
|