| import faiss |
| from sentence_transformers import SentenceTransformer |
| import fitz |
| from PIL import Image |
| import io |
| import numpy as np |
| import os |
|
|
| from database import get_db_connection, INDEX_FILE |
| from security import encrypt_data |
|
|
# SentenceTransformer model identifier; ingest_pdf loads this one model and
# uses it to embed both page text and page images.
MODEL_NAME = 'clip-ViT-B-32'
|
|
def ingest_pdf(file_path, file_name):
    """Parse a PDF, encrypt its content, and add it to the database and FAISS index.

    For every page, the extracted text (when non-blank) and each embedded
    image are encrypted with ``encrypt_data``, stored as a row in ``chunks``
    with a 1-based page number, and embedded with the ``MODEL_NAME`` model.
    The embeddings are appended to the FAISS index stored at ``INDEX_FILE``;
    a new ``IndexFlatL2`` is created when that file does not exist yet.

    Args:
        file_path: Path of the PDF to open.
        file_name: Name recorded in ``documents``. If inserting it raises an
            integrity error, the id of the existing row with that name is
            reused instead.

    Raises:
        Exception: Any error raised while parsing, encrypting, embedding, or
            writing to the database is re-raised after the transaction has
            been rolled back and the connection closed.
    """
    print(f"Starting ingestion for: {file_name}")
    model = SentenceTransformer(MODEL_NAME)

    # Load the index before opening the DB connection so that a failure here
    # cannot leave a connection behind.
    if os.path.exists(INDEX_FILE):
        index = faiss.read_index(INDEX_FILE)
    else:
        # Probe the model once to learn the embedding width for a new index.
        dimension = model.encode(["test"]).shape[1]
        index = faiss.IndexFlatL2(dimension)

    insert_chunk_sql = (
        "INSERT INTO chunks (doc_id, content_type, encrypted_content, page_num) "
        "VALUES (?, ?, ?, ?)"
    )
    new_embeddings = []

    conn = get_db_connection()
    try:
        cursor = conn.cursor()

        try:
            cursor.execute("INSERT INTO documents (name) VALUES (?)", (file_name,))
            doc_id = cursor.lastrowid
        except conn.IntegrityError:
            print("Document already exists in DB. Skipping doc table insert.")
            doc_id = cursor.execute(
                "SELECT id FROM documents WHERE name=?", (file_name,)
            ).fetchone()['id']

        # The context manager closes the PDF even if a page fails to process.
        with fitz.open(file_path) as doc:
            for page_number, page in enumerate(doc, start=1):
                text = page.get_text()
                if text.strip():
                    encrypted_text = encrypt_data(text.encode('utf-8'))
                    cursor.execute(
                        insert_chunk_sql,
                        (doc_id, 'text', encrypted_text, page_number),
                    )
                    new_embeddings.append(model.encode([text]))

                for img in page.get_images(full=True):
                    xref = img[0]
                    image_bytes = doc.extract_image(xref)["image"]

                    encrypted_image = encrypt_data(image_bytes)
                    cursor.execute(
                        insert_chunk_sql,
                        (doc_id, 'image', encrypted_image, page_number),
                    )
                    with Image.open(io.BytesIO(image_bytes)) as pil_image:
                        image_embedding = model.encode(pil_image)
                    # Image embeddings are reshaped to a single row so they
                    # stack with the (1, dim) text embeddings.
                    new_embeddings.append(image_embedding.reshape(1, -1))

        if new_embeddings:
            # Update the in-memory index before committing: a shape or
            # dimension mismatch then rolls the chunk rows back instead of
            # leaving committed rows that have no vectors.
            embeddings_np = np.vstack(new_embeddings).astype('float32')
            index.add(embeddings_np)

        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()

    if new_embeddings:
        faiss.write_index(index, INDEX_FILE)
        print(f"Successfully ingested {file_name} and added {len(new_embeddings)} new chunks to the knowledge base.")
    else:
        print(f"No new content found to ingest in {file_name}.")
|
|