import os
from io import BytesIO
import PyPDF2
import docx
from django.core.files.storage import default_storage
from .engine import engine
from .models import KnowledgeDocument

# ponytail: simple text extractor for documents
def extract_text_from_file(file_path):
    """Extract plain text from a PDF, Word, or plain-text file.

    The reader is selected from the file extension (case-insensitive).

    Args:
        file_path: Filesystem path of the document to read.

    Returns:
        The extracted text. For PDFs every page's text is followed by a
        newline; for Word documents every paragraph is followed by a
        newline; ``.txt`` files are returned as decoded. Any other
        extension yields an empty string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If a ``.txt`` file is not valid UTF-8.
        Exception: Whatever PyPDF2 / python-docx raise for files they
            cannot parse is propagated unchanged.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == '.pdf':
        with open(file_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            # `or ""` keeps a page with no extractable text from breaking
            # the concatenation; join avoids repeated string copies.
            return "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )

    if ext in ('.doc', '.docx'):
        # NOTE(review): python-docx targets .docx packages; a legacy binary
        # .doc will presumably fail to open here -- confirm and handle upstream.
        doc = docx.Document(file_path)
        return "".join(para.text + "\n" for para in doc.paragraphs)

    if ext == '.txt':
        # utf-8-sig decodes ordinary UTF-8 unchanged but strips a leading
        # byte-order mark, so "\ufeff" never leaks into the indexed text.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            return f.read()

    # Unsupported extension: nothing to extract.
    return ""

def chunk_text(text, chunk_size=1000, overlap=200):
    """Split *text* into fixed-size, overlapping character windows.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    Chunking stops as soon as a window reaches the end of the text, so no
    trailing chunk is emitted that is wholly contained in its predecessor.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be > 0.
        overlap: Number of characters shared between neighbouring chunks;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings in document order; empty for empty text.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. An
            overlap that is not smaller than the chunk size would never
            advance the window.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    total = len(text)
    chunks = []
    for start in range(0, total, step):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= total:
            # This window already covers the tail; another one would only
            # duplicate content that is in this chunk.
            break
    return chunks

_KNOWLEDGE_COLLECTION = "knowledge_base"
_EMBEDDING_SIZE = 384  # MiniLM size


def _ensure_knowledge_collection():
    """Create the knowledge-base Qdrant collection if looking it up fails."""
    try:
        engine.qdrant.get_collection(_KNOWLEDGE_COLLECTION)
    except Exception:
        # Any lookup failure is treated as "collection missing"; if the real
        # cause is different (e.g. connectivity), create_collection raises
        # and the caller records the error.
        from qdrant_client.models import Distance, VectorParams
        engine.qdrant.create_collection(
            collection_name=_KNOWLEDGE_COLLECTION,
            vectors_config=VectorParams(
                size=_EMBEDDING_SIZE, distance=Distance.COSINE
            ),
        )


def ingest_document(doc_id):
    """Extract, chunk, embed and index one ``KnowledgeDocument``.

    The document's ``status`` is set to ``'processing'`` up front, then to
    ``'indexed'`` (with ``chunks_count`` filled in) on success or ``'error'``
    if any step of the pipeline fails. Pipeline failures are logged and
    swallowed rather than re-raised.

    Args:
        doc_id: Primary key of the ``KnowledgeDocument`` to ingest.

    Raises:
        KnowledgeDocument.DoesNotExist: If no document has this id (raised
            by the initial lookup, before any status change).
    """
    import logging

    doc = KnowledgeDocument.objects.get(id=doc_id)
    doc.status = 'processing'
    doc.save()

    try:
        import uuid
        from qdrant_client.models import PointStruct

        # Resolved inside the try so a missing file or a storage backend
        # without local paths marks the document as 'error' instead of
        # leaving it stuck in 'processing'.
        file_path = doc.file.path
        text = extract_text_from_file(file_path)
        chunks = chunk_text(text)

        _ensure_knowledge_collection()

        vectors = engine.model.encode(chunks).tolist()

        # NOTE(review): point ids are random, so ingesting the same document
        # again adds a second set of points rather than replacing the first.
        points = [
            PointStruct(
                id=str(uuid.uuid4()),
                vector=vector,
                payload={"text": chunk, "doc_id": doc_id, "title": doc.title},
            )
            for vector, chunk in zip(vectors, chunks)
        ]

        engine.qdrant.upsert(
            collection_name=_KNOWLEDGE_COLLECTION,
            points=points,
        )

        doc.chunks_count = len(chunks)
        doc.status = 'indexed'
        doc.save()

    except Exception:
        # Top-level boundary for the pipeline: keep the traceback in the
        # logs and surface the failure through the document status.
        logging.getLogger(__name__).exception(
            "Error ingesting doc %s", doc_id
        )
        doc.status = 'error'
        doc.save()