import logging
import os
import uuid
from io import BytesIO

import PyPDF2
import docx
from django.core.files.storage import default_storage

from .engine import engine
from .models import KnowledgeDocument

logger = logging.getLogger(__name__)

# Name of the Qdrant collection every ingested chunk is written to.
_COLLECTION_NAME = "knowledge_base"
# Vector size used when the collection has to be created ("MiniLM size").
_VECTOR_SIZE = 384


def extract_text_from_file(file_path: str) -> str:
    """Return the plain text found in the file at *file_path*.

    The format is chosen from the (case-insensitive) file extension:

    * ``.pdf``  - text of every page, each followed by a newline.
    * ``.doc`` / ``.docx`` - text of every paragraph, each followed by a
      newline.
    * ``.txt``  - the file content decoded as UTF-8.

    Any other extension yields an empty string.

    Errors raised while opening or parsing the file (for example
    ``OSError`` or ``UnicodeDecodeError``) are not caught here.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == '.pdf':
        with open(file_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            # ``or ""`` guards against a page whose extraction yields None,
            # which would otherwise break the string concatenation.
            return "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )

    if ext in ('.doc', '.docx'):
        # NOTE(review): python-docx is documented for .docx files; a legacy
        # binary .doc will presumably make Document() raise - confirm.
        document = docx.Document(file_path)
        return "".join(para.text + "\n" for para in document.paragraphs)

    if ext == '.txt':
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()

    return ""


def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> list:
    """Split *text* into windows of at most *chunk_size* characters.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so
    each one repeats the last *overlap* characters of its predecessor.
    Splitting stops as soon as a window reaches the end of the text, so the
    last chunk is never just a repeat of the tail of the previous one.

    Args:
        text: The text to split.
        chunk_size: Maximum length of a chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must be
            smaller than *chunk_size*.

    Returns:
        A list of string chunks; empty when *text* is empty.

    Raises:
        ValueError: If *chunk_size* is not positive or *overlap* is not
            smaller than *chunk_size* (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than chunk_size")

    chunks = []
    for start in range(0, len(text), step):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= len(text):
            # This window already covers the tail of the text.
            break
    return chunks


def _ensure_collection() -> None:
    """Create the knowledge-base collection when looking it up fails."""
    try:
        engine.qdrant.get_collection(_COLLECTION_NAME)
    except Exception:
        # Any lookup failure is treated as "collection missing". Catching
        # Exception rather than using a bare except lets KeyboardInterrupt
        # and SystemExit propagate.
        from qdrant_client.models import Distance, VectorParams

        engine.qdrant.create_collection(
            collection_name=_COLLECTION_NAME,
            vectors_config=VectorParams(
                size=_VECTOR_SIZE, distance=Distance.COSINE
            ),
        )


def ingest_document(doc_id) -> None:
    """Extract, chunk, embed and index the document with id *doc_id*.

    The document's ``status`` is set to ``'processing'`` first. On success
    ``chunks_count`` is stored and the status becomes ``'indexed'``; on any
    failure the error is logged and the status becomes ``'error'``.

    Args:
        doc_id: Primary key of the ``KnowledgeDocument`` to ingest.

    Raises:
        KnowledgeDocument.DoesNotExist: If no document has this id (raised
            by the lookup, before any status change).
    """
    doc = KnowledgeDocument.objects.get(id=doc_id)
    doc.status = 'processing'
    doc.save()

    try:
        # Resolving the path happens inside the try block so that a missing
        # file marks the document as failed instead of leaving it stuck in
        # the 'processing' state.
        text = extract_text_from_file(doc.file.path)
        chunks = chunk_text(text)
        if not chunks:
            # Nothing to embed (empty file or unsupported extension).
            raise ValueError("no text could be extracted from the document")

        _ensure_collection()

        from qdrant_client.models import PointStruct

        vectors = engine.model.encode(chunks).tolist()
        # NOTE(review): every run stores points under fresh random UUIDs and
        # nothing here removes earlier points for the same doc_id.
        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=vector,
                payload={"text": chunk, "doc_id": doc_id, "title": doc.title},
            )
            for vector, chunk in zip(vectors, chunks)
        ]
        engine.qdrant.upsert(
            collection_name=_COLLECTION_NAME,
            points=points,
        )

        doc.chunks_count = len(chunks)
        doc.status = 'indexed'
        doc.save()
    except Exception:
        # Top-level boundary for the ingestion job: record the traceback and
        # flag the document rather than propagating.
        logger.exception("Error ingesting doc %s", doc_id)
        doc.status = 'error'
        doc.save()