"""Document ingestion and semantic search over a ChromaDB knowledge base.

Reads .docx files, splits their text into word-count-bounded chunks,
embeds each chunk with a SentenceTransformer model, and stores the
vectors in a persistent ChromaDB collection for similarity search.
"""

import os

import chromadb
from docx import Document
from sentence_transformers import SentenceTransformer

# Persistent vector store: data survives process restarts under ./chroma_db.
client = chromadb.PersistentClient(path="chroma_db")
collection = client.get_or_create_collection(name="knowledge_base")

# Small, fast general-purpose sentence-embedding model.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


def extract_text_from_docx(file_path):
    """Return the full text of a .docx file, one paragraph per line."""
    doc = Document(file_path)
    return '\n'.join(para.text for para in doc.paragraphs)


def chunk_text(text, max_words=300):
    """Split *text* into chunks of at most *max_words* whitespace-separated words."""
    words = text.split()
    return [' '.join(words[i:i + max_words])
            for i in range(0, len(words), max_words)]


def ingest_documents(folder_path):
    """Read every .docx in *folder_path*, chunk, embed, and store in ChromaDB.

    Chunk IDs are "<filename>_<index>", so re-ingesting the same file
    replaces its previous entries rather than duplicating them.
    """
    # Guard first: os.listdir would raise on a bad path, while this
    # function otherwise reports problems via prints, not exceptions.
    if not os.path.isdir(folder_path):
        print(f"⚠️ Folder not found: {folder_path}")
        return

    files = [f for f in os.listdir(folder_path) if f.endswith('.docx')]
    if not files:
        print("⚠️ No .docx files found. Please check the folder path.")
        return

    for file in files:
        file_path = os.path.join(folder_path, file)
        chunks = chunk_text(extract_text_from_docx(file_path))
        if not chunks:
            continue  # empty document — nothing to embed

        # Batch-encode all chunks of a file in one call and add them to the
        # collection together: much faster than one encode/add per chunk,
        # and the stored result is identical.
        embeddings = model.encode(chunks).tolist()
        collection.add(
            ids=[f"{file}_{i}" for i in range(len(chunks))],
            embeddings=embeddings,
            documents=chunks,
            metadatas=[{"filename": file} for _ in chunks],
        )

    print(f"✅ Documents ingested. Total entries: {collection.count()}")


def search_knowledge_base(query, top_k=3):
    """Return the *top_k* chunks most semantically similar to *query*.

    The result is ChromaDB's query dict with embeddings, documents,
    metadatas, and distances included.
    """
    query_embedding = model.encode(query).tolist()
    return collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        include=['embeddings', 'documents', 'metadatas', 'distances'],
    )


# Example usage:
#   ingest_documents("path/to/docs")
#   results = search_knowledge_base("inventory mismatch in warehouse")