IsmatS commited on
Commit
34a60dc
·
1 Parent(s): d52b844
Files changed (2) hide show
  1. scripts/ingest_parallel.py +0 -204
  2. scripts/ingest_pdfs.py +0 -449
scripts/ingest_parallel.py DELETED
@@ -1,204 +0,0 @@
1
- """
2
- Parallel PDF Ingestion - 4x Faster
3
- Processes 4 PDFs simultaneously without affecting quality
4
- """
5
-
6
- import os
7
- import sys
8
- import time
9
- import json
10
- from pathlib import Path
11
- from concurrent.futures import ProcessPoolExecutor, as_completed
12
- from dotenv import load_dotenv
13
-
14
- # Add parent directory to path
15
- sys.path.insert(0, str(Path(__file__).parent))
16
-
17
- # Load environment
18
- load_dotenv()
19
-
20
- # Import from the main ingestion script
21
- PROJECT_ROOT = Path(__file__).parent.parent
22
- PDFS_DIR = PROJECT_ROOT / "data" / "pdfs"
23
- OUTPUT_DIR = PROJECT_ROOT / "output" / "ingestion"
24
-
25
- # Import the ingestion function
26
- import ingest_pdfs
27
-
28
def get_already_processed():
    """Return the set of PDF file names that already have vectors in Pinecone.

    Best-effort: any failure (missing credentials, network, etc.) is reported
    as a warning and an empty set is returned so ingestion can proceed.
    """
    try:
        from pinecone import Pinecone

        client = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
        idx = client.Index(os.getenv("PINECONE_INDEX_NAME", "hackathon"))

        # Broad query against a zero vector purely to pull back metadata
        # for up to 10k stored vectors; we only care about pdf_name values.
        response = idx.query(
            vector=[0.0] * 1024,
            top_k=10000,
            include_metadata=True
        )

        # Collect every non-empty pdf_name seen in the match metadata.
        return {
            name
            for hit in response.get('matches', [])
            if (name := hit.get('metadata', {}).get('pdf_name'))
        }
    except Exception as e:
        print(f"Warning: Could not check existing PDFs: {e}")
        return set()
53
-
54
-
55
def worker_ingest(pdf_path: str):
    """Ingest one PDF inside a worker process.

    Never raises: any exception is converted into an error-status dict so
    the parent process can keep collecting results from other workers.
    """
    try:
        return ingest_pdfs.ingest_pdf(str(pdf_path))
    except Exception as exc:
        return {
            "pdf_name": Path(pdf_path).name,
            "status": "error",
            "error": str(exc),
        }
66
-
67
-
68
def main():
    """Main parallel ingestion pipeline.

    Finds all PDFs under PDFS_DIR, skips those already present in Pinecone
    (by pdf_name metadata), then OCRs/embeds/upserts the remainder with a
    4-process pool, printing progress/ETA and writing a JSON summary.
    """
    print("\n" + "="*70)
    print("🚀 PARALLEL PDF INGESTION (4x FASTER)")
    print("="*70)
    print(f"📂 PDF Directory: {PDFS_DIR}")
    print(f"⚡ Workers: 4 PDFs at once")
    print(f"🎯 Vector Database: Pinecone ({os.getenv('PINECONE_INDEX_NAME')})")
    print("="*70)

    # Get all PDFs
    all_pdfs = sorted(PDFS_DIR.glob("*.pdf"))
    print(f"\n📚 Found {len(all_pdfs)} total PDFs")

    # Check what's already done (best-effort; returns empty set on failure)
    print("\n🔍 Checking Pinecone for already processed PDFs...")
    already_processed = get_already_processed()

    if already_processed:
        print(f"✅ Already processed: {len(already_processed)} PDFs")
        for pdf in sorted(already_processed):
            print(f" ✓ {pdf}")

    # Filter to only unprocessed PDFs
    pdfs_to_process = [
        pdf for pdf in all_pdfs
        if pdf.name not in already_processed
    ]

    if not pdfs_to_process:
        print("\n🎉 All PDFs already processed!")
        return

    print(f"\n⏳ Remaining to process: {len(pdfs_to_process)} PDFs")
    for pdf in pdfs_to_process:
        print(f" → {pdf.name}")

    print(f"\n⚡ Starting parallel processing with 4 workers...")
    # ETA heuristic: ~80s per PDF, divided across 4 workers.
    print(f"⏱️ Estimated time: ~{len(pdfs_to_process) * 80 / 4 / 60:.1f} minutes\n")

    # Process in parallel
    results = []
    completed = 0
    start_time = time.time()

    with ProcessPoolExecutor(max_workers=4) as executor:
        # Submit all jobs; map each future back to its PDF path for reporting.
        future_to_pdf = {
            executor.submit(worker_ingest, str(pdf)): pdf
            for pdf in pdfs_to_process
        }

        # Collect results as they complete
        for future in as_completed(future_to_pdf):
            pdf = future_to_pdf[future]
            completed += 1

            try:
                result = future.result()
                results.append(result)

                if result.get("status") == "success":
                    # ETA from observed average time per completed PDF.
                    elapsed = time.time() - start_time
                    avg_time = elapsed / completed
                    remaining = len(pdfs_to_process) - completed
                    eta = remaining * avg_time / 60

                    print(f"✅ [{completed}/{len(pdfs_to_process)}] {pdf.name}")
                    print(f" 📊 {result['num_vectors']} vectors, {result['time_total']:.1f}s")
                    print(f" ⏱️ ETA: {eta:.1f} minutes remaining\n")
                else:
                    print(f"❌ [{completed}/{len(pdfs_to_process)}] {pdf.name} - {result.get('error', 'Unknown error')}\n")

            except Exception as e:
                # worker_ingest catches its own errors, so this covers
                # failures in the future machinery itself.
                print(f"❌ [{completed}/{len(pdfs_to_process)}] {pdf.name} - Error: {e}\n")
                results.append({
                    "pdf_name": pdf.name,
                    "status": "error",
                    "error": str(e)
                })

    total_time = time.time() - start_time

    # Summary
    print("\n" + "="*70)
    print("📊 PARALLEL INGESTION COMPLETE")
    print("="*70)

    successful = [r for r in results if r.get("status") == "success"]
    failed = [r for r in results if r.get("status") == "error"]

    print(f"\n✅ Successful: {len(successful)}/{len(pdfs_to_process)}")
    print(f"❌ Failed: {len(failed)}")
    print(f"⏱️ Total Time: {total_time/60:.1f} minutes")

    if successful:
        total_vectors = sum(r["num_vectors"] for r in successful)
        avg_time = sum(r["time_total"] for r in successful) / len(successful)
        print(f"\n📦 Total Vectors Uploaded: {total_vectors}")
        print(f"⏱️ Average Time per PDF: {avg_time:.1f}s")

    # Save results
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    results_file = OUTPUT_DIR / "parallel_ingestion_results.json"

    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump({
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "total_pdfs": len(pdfs_to_process),
            "successful": len(successful),
            "failed": len(failed),
            "total_time_seconds": round(total_time, 2),
            "results": results
        }, f, indent=2, ensure_ascii=False)

    print(f"\n📄 Results saved to: {results_file}")

    # Final Pinecone stats (best-effort; failures only produce a warning)
    try:
        from pinecone import Pinecone
        pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
        index = pc.Index(os.getenv("PINECONE_INDEX_NAME", "hackathon"))
        stats = index.describe_index_stats()

        print(f"\n📊 Final Pinecone Stats:")
        print(f" Total Vectors: {stats.get('total_vector_count', 0)}")
        print(f" Dimensions: {stats.get('dimension', 0)}")
    except Exception as e:
        print(f"\nCould not fetch Pinecone stats: {e}")

    print("\n" + "="*70)
    print("🎉 ALL DONE!")
    print("="*70)
201
-
202
-
203
# Script entry point: run the parallel ingestion pipeline.
if __name__ == "__main__":
    main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/ingest_pdfs.py DELETED
@@ -1,449 +0,0 @@
1
- """
2
- PDF Ingestion Script for SOCAR Hackathon
3
- Processes all PDFs with VLM OCR and uploads to Pinecone
4
-
5
- Based on benchmark results:
6
- - OCR: Llama-4-Maverick-17B (87.75% CSR)
7
- - Embedding: BAAI/bge-large-en-v1.5 (1024 dims)
8
- - Chunking: 600 chars with 100 overlap
9
- - Vector DB: Pinecone (cosine similarity)
10
- """
11
-
12
- import os
13
- import re
14
- import time
15
- import base64
16
- from pathlib import Path
17
- from typing import List, Dict
18
- from io import BytesIO
19
-
20
- import fitz # PyMuPDF
21
- from PIL import Image
22
- from dotenv import load_dotenv
23
- from openai import AzureOpenAI
24
- from pinecone import Pinecone
25
- from sentence_transformers import SentenceTransformer
26
- from tqdm import tqdm
27
-
28
- # Load environment
29
- load_dotenv()
30
-
31
- # Project paths
32
- PROJECT_ROOT = Path(__file__).parent.parent
33
- PDFS_DIR = PROJECT_ROOT / "data" / "pdfs"
34
- OUTPUT_DIR = PROJECT_ROOT / "output" / "ingestion"
35
- OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
36
-
37
- # Initialize clients
38
- print("🔄 Initializing clients...")
39
-
40
- azure_client = AzureOpenAI(
41
- api_key=os.getenv("AZURE_OPENAI_API_KEY"),
42
- api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2024-08-01-preview"),
43
- azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
44
- )
45
-
46
- pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
47
- index = pc.Index(os.getenv("PINECONE_INDEX_NAME", "hackathon"))
48
-
49
- # Best performing embedding model from benchmarks
50
- embedding_model = SentenceTransformer("BAAI/bge-large-en-v1.5")
51
-
52
- # Best performing VLM from benchmarks
53
- VLM_MODEL = "Llama-4-Maverick-17B-128E-Instruct-FP8"
54
-
55
- # Optimal chunking parameters from benchmarks
56
- CHUNK_SIZE = 600
57
- CHUNK_OVERLAP = 100
58
-
59
- print("✅ Clients initialized")
60
-
61
-
62
def pdf_to_images(pdf_path: str, dpi: int = 100) -> List[Image.Image]:
    """Convert every page of a PDF to a PIL Image.

    Args:
        pdf_path: Path to the PDF file.
        dpi: Render resolution; PDFs are natively 72 dpi, so the zoom
            factor is dpi / 72.

    Returns:
        One RGB Image per page, in page order.
    """
    doc = fitz.open(pdf_path)
    try:
        # Hoist the loop-invariant transform out of the per-page loop.
        zoom = dpi / 72
        mat = fitz.Matrix(zoom, zoom)

        images = []
        for page in doc:
            pix = page.get_pixmap(matrix=mat)
            images.append(Image.frombytes("RGB", (pix.width, pix.height), pix.samples))
        return images
    finally:
        # Always release the document handle, even if rendering fails
        # (the original leaked it on exception).
        doc.close()
77
-
78
-
79
def image_to_base64(image: Image.Image, format: str = "JPEG", quality: int = 85) -> str:
    """Serialize a PIL Image to a base64 string, compressing on save."""
    buffer = BytesIO()
    # Compress into an in-memory buffer; `optimize` lets PIL spend extra
    # effort shrinking the output.
    image.save(buffer, format=format, quality=quality, optimize=True)
    encoded = base64.b64encode(buffer.getvalue())
    return encoded.decode("utf-8")
84
-
85
-
86
def vlm_extract_text(pdf_path: str) -> str:
    """
    Extract text from PDF using VLM (Llama-4-Maverick).
    Best performer: 87.75% CSR, 75s for 12 pages

    Args:
        pdf_path: Path to the PDF to OCR.

    Returns:
        All page texts joined with blank lines; pages that fail OCR
        contribute an empty string so page count is preserved.
    """
    images = pdf_to_images(pdf_path, dpi=100)

    system_prompt = """You are an expert OCR system for historical oil & gas documents.

Extract ALL text from the image with 100% accuracy. Follow these rules:
1. Preserve EXACT spelling - including Azerbaijani, Russian, and English text
2. Maintain original Cyrillic characters - DO NOT transliterate
3. Keep all numbers, symbols, and special characters exactly as shown
4. Preserve layout structure (paragraphs, line breaks)
5. Include ALL text - headers, body, footnotes, tables, captions

Output ONLY the extracted text. No explanations, no descriptions."""

    all_text = []

    print(f" Extracting text from {len(images)} pages...")
    for page_num, image in enumerate(tqdm(images, desc=" OCR Progress"), 1):
        # Convert to base64
        image_base64 = image_to_base64(image, format="JPEG", quality=85)

        messages = [
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": f"Extract all text from page {page_num}:"},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}}
                ]
            }
        ]

        try:
            response = azure_client.chat.completions.create(
                model=VLM_MODEL,
                messages=messages,
                temperature=0.0,  # Deterministic OCR
                max_tokens=4000
            )

            # FIX: message.content can be None (e.g. filtered responses);
            # coerce to "" so the final "\n\n".join cannot raise TypeError.
            page_text = response.choices[0].message.content or ""
            all_text.append(page_text)

        except Exception as e:
            print(f" ❌ Error on page {page_num}: {e}")
            all_text.append("")  # Add empty page on error

    # Combine all pages
    full_text = "\n\n".join(all_text)
    return full_text
140
-
141
-
142
def clean_text_for_vectordb(text: str) -> str:
    """
    Clean text for vector database storage.
    CRITICAL: Remove image markdown - images are ONLY for /ocr endpoint!

    Args:
        text: Raw OCR/markdown text.

    Returns:
        Text with image-markdown tags removed, runs of blank lines
        collapsed to a single blank line, and outer whitespace stripped.
    """
    # Remove image markdown references. Generalized from the original
    # literal-"Image" pattern so any alt text (e.g. "![Figure 2](...)")
    # is stripped as well.
    clean = re.sub(r'!\[[^\]]*\]\([^)]+\)', '', text)

    # Normalize whitespace: any run of blank/whitespace-only lines
    # becomes exactly one blank line.
    clean = re.sub(r'\n\s*\n+', '\n\n', clean)
    clean = clean.strip()

    return clean
155
-
156
-
157
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> List[str]:
    """
    Chunk text with overlap for better context preservation.
    Optimal config from benchmarks: 600 chars, 100 overlap

    Args:
        text: Text to split.
        chunk_size: Target chunk length in characters.
        overlap: Characters shared between consecutive chunks.

    Returns:
        Non-empty, stripped chunks in document order.

    Raises:
        ValueError: If overlap >= chunk_size — the window would advance by
            a non-positive step and the original code looped forever.
    """
    if not text:
        return []
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    chunks = []
    start = 0
    length = len(text)

    while start < length:
        end = start + chunk_size
        chunk = text[start:end]

        # Try to break at word boundary
        if end < length and not text[end].isspace():
            last_space = chunk.rfind(' ')
            if last_space > chunk_size - 100:  # Keep chunk reasonably sized
                chunk = chunk[:last_space]
                end = start + last_space

        chunk = chunk.strip()
        if chunk:  # Only add non-empty chunks
            chunks.append(chunk)

        # Step back by `overlap` for context, except at the end of the text.
        start = end - overlap if end < length else end

    return chunks
186
-
187
-
188
def ingest_pdf(pdf_path: str) -> Dict:
    """
    Full ingestion pipeline for one PDF:
    1. VLM OCR (Llama-4-Maverick)
    2. Clean text (remove images)
    3. Chunk (600/100)
    4. Embed (bge-large-en)
    5. Upsert to Pinecone

    Returns a summary dict whose "status" is "success" or "skipped"
    (no text extracted); exceptions propagate to the caller.
    """
    pdf_name = Path(pdf_path).name
    start_time = time.time()

    print(f"\n{'='*70}")
    print(f"📄 Processing: {pdf_name}")
    print(f"{'='*70}")

    # Step 1: OCR with VLM
    print(" Step 1/5: Running VLM OCR...")
    ocr_start = time.time()
    raw_text = vlm_extract_text(pdf_path)
    ocr_time = time.time() - ocr_start
    print(f" ✅ OCR complete: {len(raw_text)} characters ({ocr_time:.1f}s)")

    # Step 2: Clean text (remove image markdown)
    print(" Step 2/5: Cleaning text...")
    clean = clean_text_for_vectordb(raw_text)
    print(f" ✅ Cleaned: {len(clean)} characters")

    # Step 3: Chunk text
    print(" Step 3/5: Chunking text...")
    chunks = chunk_text(clean, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP)
    print(f" ✅ Created {len(chunks)} chunks")

    if len(chunks) == 0:
        # Nothing embeddable came out of OCR; report and bail early.
        print(" ⚠️ No chunks created - skipping document")
        return {
            "pdf_name": pdf_name,
            "status": "skipped",
            "reason": "no_text_extracted",
            "time": time.time() - start_time
        }

    # Step 4: Generate embeddings
    print(f" Step 4/5: Generating embeddings...")
    embed_start = time.time()
    embeddings = embedding_model.encode(chunks, show_progress_bar=True)
    embed_time = time.time() - embed_start
    print(f" ✅ Embeddings generated ({embed_time:.1f}s)")

    # Step 5: Prepare vectors for Pinecone
    print(" Step 5/5: Upserting to Pinecone...")
    vectors = []

    # Calculate approximate page numbers
    # (simple heuristic: distribute chunks evenly across document)
    doc = fitz.open(pdf_path)
    num_pages = len(doc)
    doc.close()

    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
        # Estimate page number (chunks distributed across pages)
        estimated_page = int((i / len(chunks)) * num_pages) + 1

        vectors.append({
            "id": f"{pdf_name}_chunk_{i}",
            "values": embedding.tolist(),
            "metadata": {
                "pdf_name": pdf_name,
                "page_number": estimated_page,
                "content": chunk  # Changed from "text" to "content" to match API expectations
            }
        })

    # Upsert in batches (Pinecone upserts are size-limited per request)
    batch_size = 100
    upsert_start = time.time()

    for i in range(0, len(vectors), batch_size):
        batch = vectors[i:i + batch_size]
        index.upsert(vectors=batch)

    upsert_time = time.time() - upsert_start
    total_time = time.time() - start_time

    print(f" ✅ Upserted {len(vectors)} vectors ({upsert_time:.1f}s)")
    print(f"\n 🎉 Complete: {pdf_name}")
    print(f" 📊 Total time: {total_time:.1f}s")
    print(f" 📊 Breakdown: OCR={ocr_time:.1f}s, Embed={embed_time:.1f}s, Upload={upsert_time:.1f}s")

    return {
        "pdf_name": pdf_name,
        "status": "success",
        "num_chunks": len(chunks),
        "num_vectors": len(vectors),
        "text_length": len(clean),
        "time_total": round(total_time, 2),
        "time_ocr": round(ocr_time, 2),
        "time_embedding": round(embed_time, 2),
        "time_upsert": round(upsert_time, 2)
    }
288
-
289
-
290
def ingest_all_pdfs(clear_existing: bool = False):
    """
    Ingest all PDFs from data/pdfs directory.

    Args:
        clear_existing: If True, clear existing index before ingestion
            (asks for interactive confirmation first).
    """
    print("\n" + "="*70)
    print("🚀 SOCAR PDF INGESTION PIPELINE")
    print("="*70)
    print(f"📂 PDF Directory: {PDFS_DIR}")
    print(f"🎯 Vector Database: Pinecone ({os.getenv('PINECONE_INDEX_NAME')})")
    print(f"🤖 OCR Model: {VLM_MODEL}")
    print(f"📊 Embedding Model: BAAI/bge-large-en-v1.5")
    print(f"✂️ Chunking: {CHUNK_SIZE} chars, {CHUNK_OVERLAP} overlap")
    print("="*70)

    # Clear index if requested (destructive; gated on a typed "yes")
    if clear_existing:
        print("\n⚠️ Clearing existing vectors from index...")
        response = input("Are you sure? This will delete ALL vectors. (yes/no): ")
        if response.lower() == "yes":
            index.delete(delete_all=True)
            print("✅ Index cleared")
            time.sleep(2)  # Wait for index to stabilize
        else:
            print("❌ Clearing cancelled")
            return

    # Get all PDFs
    pdf_files = sorted(PDFS_DIR.glob("*.pdf"))

    if not pdf_files:
        print(f"\n❌ No PDF files found in {PDFS_DIR}")
        return

    print(f"\n📚 Found {len(pdf_files)} PDF files")

    # Process each PDF sequentially; one failure does not stop the run.
    results = []
    start_time = time.time()

    for pdf_path in pdf_files:
        try:
            result = ingest_pdf(str(pdf_path))
            results.append(result)
        except Exception as e:
            print(f"\n❌ Error processing {pdf_path.name}: {e}")
            results.append({
                "pdf_name": pdf_path.name,
                "status": "error",
                "error": str(e)
            })

    total_time = time.time() - start_time

    # Summary
    print("\n" + "="*70)
    print("📊 INGESTION SUMMARY")
    print("="*70)

    # Partition results by the per-PDF status set in ingest_pdf.
    successful = [r for r in results if r.get("status") == "success"]
    failed = [r for r in results if r.get("status") == "error"]
    skipped = [r for r in results if r.get("status") == "skipped"]

    print(f"\n✅ Successful: {len(successful)}/{len(pdf_files)}")
    print(f"❌ Failed: {len(failed)}")
    print(f"⏭️ Skipped: {len(skipped)}")
    print(f"\n⏱️ Total Time: {total_time/60:.1f} minutes")

    if successful:
        total_chunks = sum(r["num_chunks"] for r in successful)
        total_vectors = sum(r["num_vectors"] for r in successful)
        avg_time = sum(r["time_total"] for r in successful) / len(successful)

        print(f"\n📦 Total Chunks: {total_chunks}")
        print(f"🔢 Total Vectors: {total_vectors}")
        print(f"⏱️ Average Time per PDF: {avg_time:.1f}s")

    # Check index stats
    stats = index.describe_index_stats()
    print(f"\n📊 Pinecone Index Stats:")
    print(f" Total Vectors: {stats.get('total_vector_count', 0)}")
    print(f" Dimensions: {stats.get('dimension', 0)}")

    # Save detailed results
    import json
    results_file = OUTPUT_DIR / "ingestion_results.json"
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump({
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "total_pdfs": len(pdf_files),
            "successful": len(successful),
            "failed": len(failed),
            "skipped": len(skipped),
            "total_time_seconds": round(total_time, 2),
            "results": results
        }, f, indent=2, ensure_ascii=False)

    print(f"\n📄 Detailed results saved to: {results_file}")
    print("\n" + "="*70)
    print("🎉 INGESTION COMPLETE!")
    print("="*70)
393
-
394
-
395
def test_single_pdf(pdf_name: str = "document_00.pdf"):
    """Test ingestion with a single PDF.

    Args:
        pdf_name: File name looked up under PDFS_DIR.
    """
    # FIX: json was only imported inside the __main__ guard, so calling
    # this function from another module (e.g. ingest_parallel.py imports
    # this one) raised NameError at the dumps() call below.
    import json

    pdf_path = PDFS_DIR / pdf_name

    if not pdf_path.exists():
        print(f"❌ PDF not found: {pdf_path}")
        return

    print(f"\n🧪 Testing with: {pdf_name}")
    result = ingest_pdf(str(pdf_path))

    print("\n📊 Test Result:")
    print(json.dumps(result, indent=2))
408
-
409
-
410
# CLI dispatcher: ingest all (default), "clear", "test [name]", or "stats".
if __name__ == "__main__":
    import sys
    import json

    # Parse command line arguments
    if len(sys.argv) > 1:
        command = sys.argv[1]

        if command == "test":
            # Test with single PDF (optional second arg selects the file)
            pdf_name = sys.argv[2] if len(sys.argv) > 2 else "document_00.pdf"
            test_single_pdf(pdf_name)

        elif command == "clear":
            # Clear index and ingest all
            ingest_all_pdfs(clear_existing=True)

        elif command == "stats":
            # Show current index stats
            stats = index.describe_index_stats()
            print("\n📊 Pinecone Index Stats:")
            if stats:
                print(f" Total Vectors: {stats.get('total_vector_count', 0)}")
                print(f" Dimensions: {stats.get('dimension', 0)}")
                if 'namespaces' in stats:
                    print(f" Namespaces: {stats.get('namespaces', {})}")
            else:
                print(" No stats available")

        else:
            # Unknown command: print usage help
            print("Usage:")
            print(" python ingest_pdfs.py - Ingest all PDFs (append)")
            print(" python ingest_pdfs.py clear - Clear index and ingest all")
            print(" python ingest_pdfs.py test - Test with document_00.pdf")
            print(" python ingest_pdfs.py test document_05.pdf - Test with specific PDF")
            print(" python ingest_pdfs.py stats - Show index statistics")

    else:
        # Default: ingest all PDFs (append mode)
        ingest_all_pdfs(clear_existing=False)