SOCAR_Hackathon / scripts /ingest_parallel.py

init

6ac41d1 4 months ago

6.57 kB

	"""
	Parallel PDF Ingestion - 4x Faster
	Processes 4 PDFs simultaneously without affecting quality
	"""

	import os
	import sys
	import time
	import json
	from pathlib import Path
	from concurrent.futures import ProcessPoolExecutor, as_completed
	from dotenv import load_dotenv

	# Add this script's own directory to sys.path so the sibling ingest_pdfs module can be imported
	sys.path.insert(0, str(Path(__file__).parent))

	# Load environment
	load_dotenv()

	# Project paths (resolved relative to the repository root, one level above this script's directory)
	PROJECT_ROOT = Path(__file__).parent.parent
	PDFS_DIR = PROJECT_ROOT / "data" / "pdfs"
	OUTPUT_DIR = PROJECT_ROOT / "output" / "ingestion"

	# Import the ingestion function
	import ingest_pdfs

	def get_already_processed(dimension: int = 1024, top_k: int = 10000):
	    """Return the set of PDF names that already have vectors in Pinecone.

	    The index is probed with a single all-zero query vector and the
	    ``pdf_name`` metadata field of every returned match is collected.

	    Args:
	        dimension: Length of the all-zero probe vector. Presumably this has
	            to equal the dimension of the target index (1024 was previously
	            hard-coded) -- confirm against the index configuration.
	        top_k: Maximum number of matches requested from the index. Only the
	            returned matches are scanned, so a PDF whose vectors all fall
	            outside them is not reported.

	    Returns:
	        A set of ``pdf_name`` strings. When the lookup fails for any reason
	        a warning is printed and an empty set is returned, so the caller
	        treats every PDF as unprocessed.
	    """
	    try:
	        # Imported lazily so a missing package degrades to the warning below.
	        from pinecone import Pinecone

	        pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
	        index = pc.Index(os.getenv("PINECONE_INDEX_NAME", "hackathon"))

	        # Query to get all unique PDF names
	        results = index.query(
	            vector=[0.0] * dimension,
	            top_k=top_k,
	            include_metadata=True,
	        )

	        processed = set()
	        # ``.get(key, default)`` only covers a missing key. Guard against a
	        # key that is present but None as well, so one sparse record cannot
	        # raise and throw away every name collected so far.
	        for match in results.get('matches') or []:
	            pdf_name = (match.get('metadata') or {}).get('pdf_name')
	            if pdf_name:
	                processed.add(pdf_name)

	        return processed
	    except Exception as e:
	        # Deliberate best-effort: a failed lookup must not block ingestion.
	        print(f"Warning: Could not check existing PDFs: {e}")
	        return set()


	def worker_ingest(pdf_path: str):
	    """Ingest one PDF; intended to run inside a pool worker.

	    Args:
	        pdf_path: Path of the PDF to ingest.

	    Returns:
	        Whatever ``ingest_pdfs.ingest_pdf`` returns. If that call raises,
	        a dict with ``pdf_name``, ``status`` set to ``"error"`` and the
	        exception text under ``error`` is returned instead.
	    """
	    try:
	        return ingest_pdfs.ingest_pdf(str(pdf_path))
	    except Exception as exc:
	        # Report the failure as data rather than letting it propagate.
	        failure = {"pdf_name": Path(pdf_path).name}
	        failure["status"] = "error"
	        failure["error"] = str(exc)
	        return failure


	def _metric(result, key):
	    """Return ``result[key]`` when it is a plain int or float, otherwise 0."""
	    value = result.get(key)
	    if isinstance(value, (int, float)) and not isinstance(value, bool):
	        return value
	    return 0


	def main(max_workers: int = 4):
	    """Run the parallel ingestion pipeline end to end.

	    Steps: list ``*.pdf`` files in ``PDFS_DIR``, drop those whose name is
	    reported by ``get_already_processed()``, ingest the rest through a
	    ``ProcessPoolExecutor`` via ``worker_ingest``, print a summary, write a
	    JSON report to ``OUTPUT_DIR / "parallel_ingestion_results.json"`` and
	    finally print the statistics reported by the Pinecone index.

	    Args:
	        max_workers: Number of worker processes, i.e. PDFs ingested at the
	            same time. Defaults to 4, the value that was hard-coded before.

	    Returns:
	        None. Returns early, before any pool is created, when no PDF is
	        left to process.
	    """
	    # Resolve the index name once so the banner shows the same index the
	    # final stats call uses, instead of "None" when the variable is unset.
	    index_name = os.getenv("PINECONE_INDEX_NAME", "hackathon")

	    print("\n" + "="*70)
	    print("🚀 PARALLEL PDF INGESTION (4x FASTER)")
	    print("="*70)
	    print(f"📂 PDF Directory: {PDFS_DIR}")
	    print(f"⚡ Workers: {max_workers} PDFs at once")
	    print(f"🎯 Vector Database: Pinecone ({index_name})")
	    print("="*70)

	    # Get all PDFs
	    all_pdfs = sorted(PDFS_DIR.glob("*.pdf"))
	    print(f"\n📚 Found {len(all_pdfs)} total PDFs")

	    # Check what's already done
	    print("\n🔍 Checking Pinecone for already processed PDFs...")
	    already_processed = get_already_processed()

	    if already_processed:
	        print(f"✅ Already processed: {len(already_processed)} PDFs")
	        for pdf in sorted(already_processed):
	            print(f" ✓ {pdf}")

	    # Filter to only unprocessed PDFs (matched by file name)
	    pdfs_to_process = [
	        pdf for pdf in all_pdfs
	        if pdf.name not in already_processed
	    ]

	    if not pdfs_to_process:
	        print("\n🎉 All PDFs already processed!")
	        return

	    total = len(pdfs_to_process)
	    print(f"\n⏳ Remaining to process: {total} PDFs")
	    for pdf in pdfs_to_process:
	        print(f" → {pdf.name}")

	    print(f"\n⚡ Starting parallel processing with {max_workers} workers...")
	    # The estimate assumes roughly 80 s per PDF (constant carried over
	    # unchanged), divided across the workers and converted to minutes.
	    print(f"⏱️ Estimated time: ~{total * 80 / max_workers / 60:.1f} minutes\n")

	    # Process in parallel
	    results = []
	    completed = 0
	    start_time = time.time()

	    with ProcessPoolExecutor(max_workers=max_workers) as executor:
	        # Submit all jobs; paths are passed as plain strings
	        future_to_pdf = {
	            executor.submit(worker_ingest, str(pdf)): pdf
	            for pdf in pdfs_to_process
	        }

	        # Collect results as they complete
	        for future in as_completed(future_to_pdf):
	            pdf = future_to_pdf[future]
	            completed += 1
	            progress = f"[{completed}/{total}]"

	            # Only the future itself is inside the try: worker_ingest
	            # already converts ingestion errors into dicts, so an exception
	            # here means the job could not be run or its result returned.
	            try:
	                result = future.result()
	            except Exception as e:
	                print(f"❌ {progress} {pdf.name} - Error: {e}\n")
	                results.append({
	                    "pdf_name": pdf.name,
	                    "status": "error",
	                    "error": str(e)
	                })
	                continue

	            # Record exactly one entry per PDF. A non-dict result is turned
	            # into an error entry so the summary below can rely on .get().
	            if not isinstance(result, dict):
	                result = {
	                    "pdf_name": pdf.name,
	                    "status": "error",
	                    "error": f"Unexpected result type: {type(result).__name__}"
	                }
	            results.append(result)

	            if result.get("status") == "success":
	                elapsed = time.time() - start_time
	                avg_time = elapsed / completed
	                remaining = total - completed
	                eta = remaining * avg_time / 60

	                print(f"✅ {progress} {pdf.name}")
	                print(f" 📊 {_metric(result, 'num_vectors')} vectors, {_metric(result, 'time_total'):.1f}s")
	                print(f" ⏱️ ETA: {eta:.1f} minutes remaining\n")
	            else:
	                print(f"❌ {progress} {pdf.name} - {result.get('error', 'Unknown error')}\n")

	    total_time = time.time() - start_time

	    # Summary
	    print("\n" + "="*70)
	    print("📊 PARALLEL INGESTION COMPLETE")
	    print("="*70)

	    successful = [r for r in results if r.get("status") == "success"]
	    # Everything that is not a success counts as failed, matching how the
	    # progress loop above reports it; successful + failed == total.
	    failed = [r for r in results if r.get("status") != "success"]

	    print(f"\n✅ Successful: {len(successful)}/{total}")
	    print(f"❌ Failed: {len(failed)}")
	    print(f"⏱️ Total Time: {total_time/60:.1f} minutes")

	    if successful:
	        total_vectors = sum(_metric(r, "num_vectors") for r in successful)
	        avg_time = sum(_metric(r, "time_total") for r in successful) / len(successful)
	        print(f"\n📦 Total Vectors Uploaded: {total_vectors}")
	        print(f"⏱️ Average Time per PDF: {avg_time:.1f}s")

	    # Save results
	    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
	    results_file = OUTPUT_DIR / "parallel_ingestion_results.json"

	    with open(results_file, 'w', encoding='utf-8') as f:
	        json.dump({
	            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
	            "total_pdfs": total,
	            "successful": len(successful),
	            "failed": len(failed),
	            "total_time_seconds": round(total_time, 2),
	            "results": results
	        }, f, indent=2, ensure_ascii=False)

	    print(f"\n📄 Results saved to: {results_file}")

	    # Final Pinecone stats (best-effort: a failure here is only reported)
	    try:
	        from pinecone import Pinecone
	        pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
	        index = pc.Index(index_name)
	        stats = index.describe_index_stats()

	        print(f"\n📊 Final Pinecone Stats:")
	        print(f" Total Vectors: {stats.get('total_vector_count', 0)}")
	        print(f" Dimensions: {stats.get('dimension', 0)}")
	    except Exception as e:
	        print(f"\nCould not fetch Pinecone stats: {e}")

	    print("\n" + "="*70)
	    print("🎉 ALL DONE!")
	    print("="*70)


	# Run the pipeline only when this file is executed as a script, not when it
	# is imported. NOTE(review): main() uses ProcessPoolExecutor, whose worker
	# processes may import this module, so the guard presumably also keeps
	# them from re-running main() -- confirm for the start method in use.
	if __name__ == "__main__":
	    main()