IsmatS committed on
Commit
291c0bb
·
1 Parent(s): 470cbeb
Files changed (4) hide show
  1. .gitignore +2 -1
  2. app/main.py +75 -28
  3. scripts/ingest_hackathon_data.py +162 -0
  4. test_llm.json +0 -1
.gitignore CHANGED
@@ -31,4 +31,5 @@ __pycache__
31
  *.egg-info
32
  venv/
33
  env/
34
- ENV/
 
 
31
  *.egg-info
32
  venv/
33
  env/
34
+ ENV/
35
+ data/hackathon_data
app/main.py CHANGED
@@ -143,7 +143,7 @@ def retrieve_documents(query: str, top_k: int = 3) -> List[Dict]:
143
  documents.append({
144
  'pdf_name': match['metadata'].get('pdf_name', 'unknown.pdf'),
145
  'page_number': page_num,
146
- 'content': match['metadata'].get('text', ''),
147
  'score': match.get('score', 0.0)
148
  })
149
 
@@ -250,34 +250,86 @@ async def llm_endpoint(request: Request):
250
  Accepts two formats:
251
  1. QuestionRequest: {"question": "...", "temperature": 0.2, "max_tokens": 1000}
252
  2. ChatRequest: {"messages": [{"role": "user", "content": "..."}], ...}
 
 
253
  """
254
  try:
255
  # Parse request body
256
- body = await request.json()
 
 
 
 
 
 
 
 
257
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  # Determine request format and extract query
259
- if "question" in body:
260
  # QuestionRequest format
261
  query = body.get("question")
 
 
 
 
 
 
262
  temperature = body.get("temperature", 0.2)
263
  max_tokens = body.get("max_tokens", 1000)
264
- is_simple_format = True
265
  elif "messages" in body:
266
  # ChatRequest format
267
  messages = body.get("messages", [])
268
  if not messages:
269
- raise HTTPException(status_code=400, detail="No messages provided")
 
 
 
 
270
 
271
  user_messages = [msg for msg in messages if msg.get("role") == "user"]
272
  if not user_messages:
273
- raise HTTPException(status_code=400, detail="No user message found")
 
 
 
 
274
 
275
  query = user_messages[-1].get("content")
 
 
 
 
 
 
276
  temperature = body.get("temperature", 0.2)
277
  max_tokens = body.get("max_tokens", 1000)
278
- is_simple_format = False
279
  else:
280
- raise HTTPException(status_code=400, detail="Invalid request format. Expected 'question' or 'messages' field.")
 
 
 
 
 
281
 
282
  # Retrieve relevant documents
283
  documents = retrieve_documents(query, top_k=3)
@@ -290,35 +342,30 @@ async def llm_endpoint(request: Request):
290
  max_tokens=max_tokens
291
  )
292
 
293
- # Format sources for response
294
  sources = [
295
  {
296
  "pdf_name": doc['pdf_name'],
297
- "page_number": doc['page_number'],
298
- "relevance_score": f"{doc['score']:.3f}"
299
  }
300
  for doc in documents
301
  ]
302
 
303
- # Return appropriate response format
304
- if is_simple_format:
305
- return AnswerResponse(
306
- answer=answer,
307
- sources=sources,
308
- response_time=round(response_time, 2)
309
- )
310
- else:
311
- return ChatResponse(
312
- response=answer,
313
- sources=[{k: str(v) for k, v in s.items()} for s in sources],
314
- response_time=round(response_time, 2),
315
- model="Llama-4-Maverick-17B-128E-Instruct-FP8"
316
- )
317
 
318
- except HTTPException:
319
- raise
320
  except Exception as e:
321
- raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
 
 
 
 
 
322
 
323
 
324
  # ============================================================================
 
143
  documents.append({
144
  'pdf_name': match['metadata'].get('pdf_name', 'unknown.pdf'),
145
  'page_number': page_num,
146
+ 'content': match['metadata'].get('content', ''), # Changed from 'text' to 'content'
147
  'score': match.get('score', 0.0)
148
  })
149
 
 
250
  Accepts two formats:
251
  1. QuestionRequest: {"question": "...", "temperature": 0.2, "max_tokens": 1000}
252
  2. ChatRequest: {"messages": [{"role": "user", "content": "..."}], ...}
253
+
254
+ ALWAYS returns: {"answer": str, "sources": List[Dict]}
255
  """
256
  try:
257
  # Parse request body
258
+ try:
259
+ body = await request.json()
260
+ except:
261
+ # Empty or invalid JSON - return error in expected format
262
+ return AnswerResponse(
263
+ answer="Error: Invalid JSON in request body. Please send valid JSON with 'question' field.",
264
+ sources=[],
265
+ response_time=0.0
266
+ )
267
 
268
+ # Handle list format (validator sends list directly)
269
+ if isinstance(body, list):
270
+ # Validator format: [{"role": "user", "content": "..."}]
271
+ user_messages = [msg for msg in body if isinstance(msg, dict) and msg.get("role") == "user"]
272
+ if not user_messages:
273
+ return AnswerResponse(
274
+ answer="Error: No user message found in messages array.",
275
+ sources=[],
276
+ response_time=0.0
277
+ )
278
+ query = user_messages[-1].get("content")
279
+ if not query or not query.strip():
280
+ return AnswerResponse(
281
+ answer="Error: Empty message content provided.",
282
+ sources=[],
283
+ response_time=0.0
284
+ )
285
+ temperature = 0.2
286
+ max_tokens = 1000
287
  # Determine request format and extract query
288
+ elif "question" in body:
289
  # QuestionRequest format
290
  query = body.get("question")
291
+ if not query or not query.strip():
292
+ return AnswerResponse(
293
+ answer="Error: Empty question provided. Please provide a valid question.",
294
+ sources=[],
295
+ response_time=0.0
296
+ )
297
  temperature = body.get("temperature", 0.2)
298
  max_tokens = body.get("max_tokens", 1000)
 
299
  elif "messages" in body:
300
  # ChatRequest format
301
  messages = body.get("messages", [])
302
  if not messages:
303
+ return AnswerResponse(
304
+ answer="Error: No messages provided in request.",
305
+ sources=[],
306
+ response_time=0.0
307
+ )
308
 
309
  user_messages = [msg for msg in messages if msg.get("role") == "user"]
310
  if not user_messages:
311
+ return AnswerResponse(
312
+ answer="Error: No user message found in messages array.",
313
+ sources=[],
314
+ response_time=0.0
315
+ )
316
 
317
  query = user_messages[-1].get("content")
318
+ if not query or not query.strip():
319
+ return AnswerResponse(
320
+ answer="Error: Empty message content provided.",
321
+ sources=[],
322
+ response_time=0.0
323
+ )
324
  temperature = body.get("temperature", 0.2)
325
  max_tokens = body.get("max_tokens", 1000)
 
326
  else:
327
+ # No question or messages field - return error in expected format
328
+ return AnswerResponse(
329
+ answer="Error: Invalid request format. Expected 'question' or 'messages' field in request body.",
330
+ sources=[],
331
+ response_time=0.0
332
+ )
333
 
334
  # Retrieve relevant documents
335
  documents = retrieve_documents(query, top_k=3)
 
342
  max_tokens=max_tokens
343
  )
344
 
345
+ # Format sources for response (validator expects pdf_name, page_number, content)
346
  sources = [
347
  {
348
  "pdf_name": doc['pdf_name'],
349
+ "page_number": doc['page_number'], # Already converted to int
350
+ "content": doc['content'] # The actual document text
351
  }
352
  for doc in documents
353
  ]
354
 
355
+ # Always return AnswerResponse format (validator expects 'answer' and 'sources' keys)
356
+ return AnswerResponse(
357
+ answer=answer,
358
+ sources=sources,
359
+ response_time=round(response_time, 2)
360
+ )
 
 
 
 
 
 
 
 
361
 
 
 
362
  except Exception as e:
363
+ # Always return expected format, even for errors
364
+ return AnswerResponse(
365
+ answer=f"Error: {str(e)}",
366
+ sources=[],
367
+ response_time=0.0
368
+ )
369
 
370
 
371
  # ============================================================================
scripts/ingest_hackathon_data.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Ingest ONLY PDFs from hackathon_data folder
Parallel processing with 4 workers
"""

import os
import sys
import time
import json
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
from dotenv import load_dotenv

# Add parent directory to path
# NOTE: this inserts the *script's own* directory (scripts/) so the sibling
# module `ingest_pdfs` below can be imported when this file is run directly.
sys.path.insert(0, str(Path(__file__).parent))

# Load environment
# Reads a .env file into os.environ — presumably supplies PINECONE_API_KEY /
# PINECONE_INDEX_NAME used later; verify against the .env contents.
load_dotenv()

# Import from the main ingestion script
PROJECT_ROOT = Path(__file__).parent.parent
# Source folder scanned for *.pdf files (see .gitignore: data/hackathon_data is untracked).
PDFS_DIR = PROJECT_ROOT / "data" / "hackathon_data"  # Changed to hackathon_data
# Destination for the JSON summary written by main().
OUTPUT_DIR = PROJECT_ROOT / "output" / "ingestion"

# Import the ingestion function
# Project-local module living next to this script; provides ingest_pdf(path).
import ingest_pdfs
28
def worker_ingest(pdf_path: str):
    """Ingest a single PDF inside a pool worker.

    Delegates to ``ingest_pdfs.ingest_pdf`` and returns its result dict on
    success. Any exception is converted into a small error dict
    (``pdf_name`` / ``status`` / ``error``) so the parent process never has
    to unwind a raised exception from the pool.
    """
    try:
        return ingest_pdfs.ingest_pdf(str(pdf_path))
    except Exception as exc:
        # Report failures as data rather than letting them escape the worker.
        return {
            "pdf_name": Path(pdf_path).name,
            "status": "error",
            "error": str(exc),
        }
39
+
40
+
41
def main():
    """Main parallel ingestion pipeline.

    Scans PDFS_DIR for *.pdf files, ingests them 4 at a time via a
    ProcessPoolExecutor, prints per-file progress with an ETA, writes a JSON
    summary to OUTPUT_DIR, and finally prints Pinecone index stats
    (best-effort). No return value; exits early if no PDFs are found.
    """
    print("\n" + "="*70)
    print("🚀 HACKATHON DATA INGESTION (4x PARALLEL)")
    print("="*70)
    print(f"📂 PDF Directory: {PDFS_DIR}")
    print(f"⚡ Workers: 4 PDFs at once")
    print(f"🎯 Vector Database: Pinecone ({os.getenv('PINECONE_INDEX_NAME')})")
    print("="*70)

    # Get all PDFs (sorted so progress output is deterministic run-to-run)
    all_pdfs = sorted(PDFS_DIR.glob("*.pdf"))
    print(f"\n📚 Found {len(all_pdfs)} PDFs in hackathon_data folder")

    if not all_pdfs:
        print("\n❌ No PDFs found in hackathon_data folder!")
        return

    for pdf in all_pdfs:
        print(f" → {pdf.name}")

    print(f"\n⚡ Starting parallel processing with 4 workers...")
    # ~80s per PDF is an empirical guess baked into the estimate — TODO confirm
    print(f"⏱️ Estimated time: ~{len(all_pdfs) * 80 / 4 / 60:.1f} minutes\n")

    # Process in parallel
    results = []
    completed = 0
    start_time = time.time()

    with ProcessPoolExecutor(max_workers=4) as executor:
        # Submit all jobs; map each future back to its Path for reporting
        future_to_pdf = {
            executor.submit(worker_ingest, str(pdf)): pdf
            for pdf in all_pdfs
        }

        # Collect results as they complete (completion order, not submit order)
        for future in as_completed(future_to_pdf):
            pdf = future_to_pdf[future]
            completed += 1

            try:
                result = future.result()
                results.append(result)

                if result.get("status") == "success":
                    # ETA = average time per finished PDF * PDFs still pending
                    elapsed = time.time() - start_time
                    avg_time = elapsed / completed
                    remaining = len(all_pdfs) - completed
                    eta = remaining * avg_time / 60

                    print(f"✅ [{completed}/{len(all_pdfs)}] {pdf.name}")
                    print(f" 📊 {result['num_vectors']} vectors, {result['time_total']:.1f}s")
                    print(f" ⏱️ ETA: {eta:.1f} minutes remaining\n")
                else:
                    print(f"❌ [{completed}/{len(all_pdfs)}] {pdf.name} - {result.get('error', 'Unknown error')}\n")

            except Exception as e:
                # worker_ingest catches its own errors, so this only fires on
                # pool-level failures (e.g. a crashed worker process).
                print(f"❌ [{completed}/{len(all_pdfs)}] {pdf.name} - Error: {e}\n")
                results.append({
                    "pdf_name": pdf.name,
                    "status": "error",
                    "error": str(e)
                })

    total_time = time.time() - start_time

    # Summary
    print("\n" + "="*70)
    print("📊 INGESTION COMPLETE")
    print("="*70)

    successful = [r for r in results if r.get("status") == "success"]
    failed = [r for r in results if r.get("status") == "error"]

    print(f"\n✅ Successful: {len(successful)}/{len(all_pdfs)}")
    print(f"❌ Failed: {len(failed)}")
    print(f"⏱️ Total Time: {total_time/60:.1f} minutes")

    if successful:
        # num_vectors / time_total are assumed keys of a successful
        # ingest_pdfs.ingest_pdf result — verify against that module.
        total_vectors = sum(r["num_vectors"] for r in successful)
        avg_time = sum(r["time_total"] for r in successful) / len(successful)
        print(f"\n📦 Total Vectors Uploaded: {total_vectors}")
        print(f"⏱️ Average Time per PDF: {avg_time:.1f}s")

    # Save results
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    results_file = OUTPUT_DIR / "hackathon_data_ingestion.json"

    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump({
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "source_folder": "hackathon_data",
            "total_pdfs": len(all_pdfs),
            "successful": len(successful),
            "failed": len(failed),
            "total_time_seconds": round(total_time, 2),
            "results": results
        }, f, indent=2, ensure_ascii=False)

    print(f"\n📄 Results saved to: {results_file}")

    # Final Pinecone stats (best-effort: network/credentials may be absent)
    try:
        from pinecone import Pinecone
        pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
        index = pc.Index(os.getenv("PINECONE_INDEX_NAME", "hackathon"))
        stats = index.describe_index_stats()

        print(f"\n📊 Final Pinecone Stats:")
        print(f" Total Vectors: {stats.get('total_vector_count', 0)}")
        print(f" Dimensions: {stats.get('dimension', 0)}")
    except Exception as e:
        # Stats are informational only; never fail the run over them.
        print(f"\nCould not fetch Pinecone stats: {e}")

    print("\n" + "="*70)
    print("🎉 HACKATHON DATA INGESTION COMPLETE!")
    print("="*70)
159
+
160
+
161
# Run the pipeline only when executed as a script, not when imported
# (required: ProcessPoolExecutor workers re-import this module).
if __name__ == "__main__":
    main()
test_llm.json DELETED
@@ -1 +0,0 @@
1
- {"question":"Neft hasilatı haqqında nə məlumat var?"}