import json
import logging
import os
import re
import tempfile
import time

import streamlit as st
import whisper
from dotenv import load_dotenv
from goose3 import Goose
from huggingface_hub import InferenceClient
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import Milvus
from moviepy import VideoFileClip
from pptx import Presentation
from pymilvus import Collection, connections, utility
from PyPDF2 import PdfReader
from pytube import YouTube

from prompts import build_evaluation_prompt

# Model id passed to the Hugging Face feature-extraction endpoint.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Model id passed to the Hugging Face chat-completions endpoint.
CHAT_MODEL = "deepseek-ai/DeepSeek-V3.2:novita"
# Connection arguments shared by pymilvus and the LangChain Milvus wrapper.
MILVUS_CONFIG = {"host": "localhost", "port": "19530"}
# Per-source chunk sizes handed to the text splitter (one per ingest path).
DOCUMENT_CHUNK_SIZE = 1000  # plain-text / markdown uploads
PDF_CHUNK_SIZE = 2500
PPTX_CHUNK_SIZE = 1800
CODE_CHUNK_SIZE = 1200
URL_CHUNK_SIZE = 1500
VIDEO_CHUNK_SIZE = 1000  # video transcripts
# Overlap between consecutive chunks, used for every source type.
CHUNK_OVERLAP = 150
# File extensions accepted by the "CODE" uploader.
CODE_FILE_TYPES = [
    "py", "js", "ts", "jsx", "tsx", "java", "c", "cpp", "cs", "go", "rs",
    "php", "rb", "html", "css", "scss", "json", "yaml", "yml", "toml",
    "ini", "sh", "sql", "xml"
]

# Load variables from a .env file (if present) before HF_TOKEN is read below.
load_dotenv()
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s"
)

# Open the "default" Milvus connection as soon as the module is executed.
connections.connect(alias="default", **MILVUS_CONFIG)

# Hugging Face API token; None when the environment variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")


def get_embeddings():
    """Build an embedding adapter backed by the Hugging Face Inference API.

    Returns:
        An object exposing ``embed_documents(texts)`` and
        ``embed_query(text)``, the two methods the Milvus vector store calls.
        Both return plain Python lists of floats.

    Raises:
        ValueError: Raised by either method when the API answers with a dict
            (treated as an error payload) instead of embedding data.
    """
    client = InferenceClient(api_key=HF_TOKEN)

    def _embed(payload):
        """Embed ``payload`` (one string or a list of strings) as float lists."""
        result = client.feature_extraction(payload, model=EMBEDDING_MODEL)
        if isinstance(result, dict):
            raise ValueError(f"Embedding API error: {result}")
        # The client hands back an array-like object; the vector store
        # interface is specified in terms of plain lists, so convert when the
        # result supports it and pass anything else through unchanged.
        return result.tolist() if hasattr(result, "tolist") else result

    class EmbeddingAdapter:
        """Minimal stand-in for a LangChain embeddings object."""

        @staticmethod
        def embed_documents(texts):
            return _embed(texts)

        @staticmethod
        def embed_query(text):
            return _embed(text)

    return EmbeddingAdapter()

def run_llm(prompt):
    """Ask the chat model to answer ``prompt`` and return the reply text.

    A fixed system message restricts the model to the supplied context.

    Args:
        prompt: Full user prompt, including any evidence context.

    Returns:
        The content of the first completion choice.
    """
    system_message = {
        "role": "system",
        "content": "Answer only from the given context. Be concise and accurate."
    }
    user_message = {"role": "user", "content": prompt}

    response = InferenceClient(api_key=HF_TOKEN).chat.completions.create(
        model=CHAT_MODEL,
        messages=[system_message, user_message],
    )
    first_choice = response.choices[0]
    return first_choice.message.content

def login():
    """Render the login form and store the normalized username in the session.

    The username is stripped and lower-cased before validation, so input made
    only of whitespace is rejected instead of producing an empty user id. The
    id becomes part of the per-user Milvus collection name
    (``multigpt_<user_id>``), so only letters, digits and underscores are
    accepted.
    """
    st.title("🔐 Login")

    user = st.text_input("Enter username")

    if not st.button("Login"):
        return

    user_id = user.strip().lower()
    if not user_id:
        st.error("Enter username")
        return

    # Anything outside this character set would yield an invalid collection
    # name and fail later, on the first upload or evaluation.
    if not re.fullmatch(r"[a-z0-9_]+", user_id):
        st.error("Username may only contain letters, numbers, and underscores")
        return

    st.session_state["user_id"] = user_id
    logging.info(f"Logged in as {user_id}")
    st.success(f"Logged in as {user}")
    st.rerun()

def build_chunks(texts, metadatas, chunk_size):
    """Split source texts into chunks, carrying each text's metadata along.

    Args:
        texts: Source strings to split.
        metadatas: One metadata dict per entry in ``texts``.
        chunk_size: Target chunk size passed to the splitter.

    Returns:
        A ``(chunk_texts, chunk_metadatas)`` pair of parallel lists; both are
        empty when ``texts`` is empty.
    """
    if texts:
        splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=chunk_size,
            chunk_overlap=CHUNK_OVERLAP
        )
        chunk_texts = []
        chunk_metadatas = []
        for document in splitter.create_documents(texts, metadatas):
            chunk_texts.append(document.page_content)
            chunk_metadatas.append(document.metadata)
        return chunk_texts, chunk_metadatas

    return [], []

def save_source_texts(user_id, source_type, source_name, texts, locators, chunk_size):
    """Chunk the extracted texts and store them in the user's Milvus collection.

    Progress is reported through the module-level ``process`` placeholder that
    ``main`` creates.

    Args:
        user_id: Owner of the data; selects the ``multigpt_<user_id>`` collection.
        source_type: Kind of source (for example ``"pdf"`` or ``"url"``).
        source_name: File name or link identifying the source.
        texts: Extracted text segments.
        locators: One locator string per entry in ``texts``.
        chunk_size: Chunk size used when splitting.
    """
    source_metadatas = []
    for locator in locators:
        source_metadatas.append(
            {
                "source_type": source_type,
                "source_name": source_name,
                "locator": locator
            }
        )

    chunks, metadatas = build_chunks(texts, source_metadatas, chunk_size)
    chunk_count = len(chunks)

    # Nothing to store: tell the user and stop before touching Milvus.
    if chunk_count == 0:
        st.warning("No readable content was extracted from this source.")
        return

    process.success("Chunking done")
    logging.info(
        f"Chunking complete for {source_type} source '{source_name}' with {chunk_count} chunks"
    )

    collection_name = f"multigpt_{user_id}"
    logging.info(f"Storing {chunk_count} chunks in collection '{collection_name}'")
    store_options = {
        "metadatas": metadatas,
        "embedding": get_embeddings(),
        "collection_name": collection_name,
        "connection_args": MILVUS_CONFIG,
    }
    Milvus.from_texts(chunks, **store_options)

    logging.info("Upload completed successfully")
    process.success("Uploaded")

def ingest_text_document(file):
    """Decode an uploaded text/markdown file and store it as a ``text`` source.

    Bytes that are not valid UTF-8 are dropped. The whole file is stored as a
    single segment with an empty locator.
    """
    user_id = st.session_state["user_id"]
    logging.info(f"Reading text file '{file.name}'")

    raw_bytes = file.read()
    decoded = raw_bytes.decode("utf-8", errors="ignore")
    save_source_texts(
        user_id,
        "text",
        file.name,
        [decoded],
        [""],
        DOCUMENT_CHUNK_SIZE,
    )

def ingest_pdf_document(file):
    """Extract text page by page from an uploaded PDF and store it.

    Pages with no extractable text are skipped. Each kept page is tagged with
    a ``page=<n>`` locator, numbered from 1.
    """
    user_id = st.session_state["user_id"]
    logging.info(f"Reading PDF '{file.name}'")

    pdf = PdfReader(file)
    numbered_pages = (
        (number, page.extract_text() or "")
        for number, page in enumerate(pdf.pages, start=1)
    )
    # Keep only pages that contain something besides whitespace.
    kept_pages = [(number, text) for number, text in numbered_pages if text.strip()]

    page_texts = [text for _, text in kept_pages]
    page_locators = [f"page={number}" for number, _ in kept_pages]

    save_source_texts(user_id, "pdf", file.name, page_texts, page_locators, PDF_CHUNK_SIZE)

def ingest_pptx_document(file):
    """Extract the text of every slide in an uploaded PPTX and store it.

    Text from all shapes on a slide is stripped and joined with newlines.
    Slides without text are skipped. Each kept slide gets a ``slide=<n>``
    locator, numbered from 1.
    """
    user_id = st.session_state["user_id"]
    logging.info(f"Reading PPTX '{file.name}'")

    deck = Presentation(file)
    slide_texts = []
    slide_locators = []

    for slide_number, slide in enumerate(deck.slides, start=1):
        lines = []
        for shape in slide.shapes:
            # Not every shape carries text; ignore those that do not.
            raw_text = getattr(shape, "text", None)
            if not raw_text:
                continue
            stripped = raw_text.strip()
            if stripped:
                lines.append(stripped)

        if not lines:
            continue

        slide_texts.append("\n".join(lines))
        slide_locators.append(f"slide={slide_number}")

    save_source_texts(user_id, "pptx", file.name, slide_texts, slide_locators, PPTX_CHUNK_SIZE)

def ingest_code_files(files):
    """Store each uploaded code file as its own ``code`` source.

    Every file is decoded as UTF-8 (undecodable bytes dropped) and saved with
    its file name as both source name and locator.
    """
    user_id = st.session_state["user_id"]

    for uploaded in files:
        file_name = uploaded.name
        logging.info(f"Reading code file '{file_name}'")
        source_code = uploaded.read().decode("utf-8", errors="ignore")
        save_source_texts(
            user_id,
            "code",
            file_name,
            [source_code],
            [file_name],
            CODE_CHUNK_SIZE,
        )

def ingest_url(url):
    """Fetch a web page, extract its main text and store it as a ``url`` source.

    Args:
        url: Address of the page to ingest; also used as source name and
            locator.
    """
    user_id = st.session_state["user_id"]
    logging.info(f"Fetching URL '{url}'")

    # Use Goose as a context manager so the extractor is closed once the page
    # has been processed instead of being left open for the process lifetime.
    with Goose() as extractor:
        article = extractor.extract(url=url)
        # Guard against a missing body so the splitter always receives a str.
        text = article.cleaned_text or ""

    save_source_texts(user_id, "url", url, [text], [url], URL_CHUNK_SIZE)

def ingest_youtube_video(link):
    """Download a YouTube video, transcribe its audio and store the transcript.

    All intermediate files live in a private temporary directory that is
    removed when processing ends, so concurrent sessions cannot overwrite each
    other's ``video.mp4`` / ``audio.mp3`` and nothing is left on disk.
    Progress is reported through the module-level ``process`` placeholder.

    Args:
        link: URL of the YouTube video; also used as source name and locator.
    """
    user_id = st.session_state["user_id"]
    logging.info(f"Starting video ingestion for '{link}'")

    stream = YouTube(link).streams.get_highest_resolution()
    if stream is None:
        logging.info("No downloadable stream was found for the video")
        st.error("No downloadable video stream was found for this link.")
        return

    with tempfile.TemporaryDirectory() as workdir:
        # Show the status before the blocking download starts.
        process.warning("Downloading video")
        # download() blocks until the file is written and returns its path,
        # so no polling for the file is needed afterwards.
        video_path = stream.download(output_path=workdir, filename="video.mp4")
        logging.info("Video download completed")

        audio_path = os.path.join(workdir, "audio.mp3")

        process.warning("Extracting audio")
        logging.info("Extracting audio from video")
        video = VideoFileClip(video_path)
        try:
            audio = video.audio
            if audio is None:
                logging.info("Video has no audio track; nothing to transcribe")
                st.error("This video has no audio track to transcribe.")
                return
            audio.write_audiofile(audio_path)
        finally:
            # Release the clip's file handles before the directory is removed.
            video.close()

        process.warning("Transcribing")
        logging.info("Running Whisper transcription")
        model = whisper.load_model("base")
        result = model.transcribe(audio_path)

    save_source_texts(user_id, "video", link, [result["text"]], [link], VIDEO_CHUNK_SIZE)

def get_vector_store(collection_name):
    """Return a Milvus vector store bound to ``collection_name``.

    The store uses the Hugging Face embedding adapter and the shared Milvus
    connection settings.
    """
    embeddings = get_embeddings()
    store = Milvus(
        embedding_function=embeddings,
        collection_name=collection_name,
        connection_args=MILVUS_CONFIG,
    )
    return store

def collection_has_data(collection_name):
    """Tell whether the collection exists and holds at least one entity."""
    if utility.has_collection(collection_name):
        entity_count = get_vector_store(collection_name).col.num_entities
        return entity_count > 0

    return False

def get_source_inventory(collection_name):
    """Summarize the stored chunks of a collection, grouped by source.

    Args:
        collection_name: Name of the Milvus collection to inspect.

    Returns:
        A list of dicts with ``source_type``, ``source_name``, ``chunks``
        (number of stored chunks) and ``locators`` (sorted distinct non-empty
        locators), ordered by source type and then source name. The list is
        empty when the collection does not exist.
    """
    if not utility.has_collection(collection_name):
        return []

    milvus_collection = Collection(collection_name)
    milvus_collection.load()
    records = milvus_collection.query(
        expr="pk >= 0",
        output_fields=["source_type", "source_name", "locator"]
    )

    # (source_type, source_name) -> running chunk count and distinct locators
    grouped = {}
    for record in records:
        source_key = (
            record.get("source_type", "unknown"),
            record.get("source_name", "unknown"),
        )
        entry = grouped.setdefault(source_key, {"chunks": 0, "locators": set()})
        entry["chunks"] += 1

        locator = record.get("locator")
        if locator:
            entry["locators"].add(locator)

    inventory = [
        {
            "source_type": source_type,
            "source_name": source_name,
            "chunks": entry["chunks"],
            "locators": sorted(entry["locators"])
        }
        for (source_type, source_name), entry in grouped.items()
    ]
    inventory.sort(key=lambda entry: (entry["source_type"], entry["source_name"]))
    return inventory

def render_evidence_inventory():
    """Show a table of the sources indexed for the logged-in user.

    An informational message is shown instead when the user's collection does
    not exist yet or contains no source records.
    """
    collection_name = f"multigpt_{st.session_state['user_id']}"

    st.subheader("Evidence Inventory")

    if not utility.has_collection(collection_name):
        logging.info(f"No collection found yet for '{collection_name}'")
        st.info("No project data has been uploaded for this user yet.")
        return

    sources = get_source_inventory(collection_name)
    source_count = len(sources)
    chunk_total = 0
    for source in sources:
        chunk_total += source["chunks"]

    logging.info(
        f"Loaded inventory for '{collection_name}' with {source_count} sources and {chunk_total} chunks"
    )
    st.caption(f"{source_count} sources indexed across {chunk_total} chunks")

    if source_count == 0:
        st.info("The collection exists, but no source records were found.")
        return

    # One display row per source; locators are reported as a count only.
    st.table(
        [
            {
                "Type": source["source_type"].upper(),
                "Source": source["source_name"],
                "Chunks": source["chunks"],
                "Locators": len(source["locators"])
            }
            for source in sources
        ]
    )

def format_context(documents):
    """Render retrieved documents as numbered evidence entries for the prompt.

    Each entry has a header line of the form
    ``[Evidence N] source_type=...; source_name=...; locator=...`` followed by
    the document text. Entries are separated by a blank line.

    Args:
        documents: Objects exposing ``metadata`` (dict or ``None``) and
            ``page_content``.

    Returns:
        The formatted context string; empty when ``documents`` is empty.
    """
    entries = []

    for index, doc in enumerate(documents, start=1):
        metadata = doc.metadata or {}
        source_type = metadata.get("source_type", "unknown")
        source_name = metadata.get("source_name", "unknown")
        # The "locator=" label comes from the template below, so the fallback
        # must be the bare value. Empty locators fall back as well instead of
        # rendering a dangling "locator=".
        locator_text = metadata.get("locator") or "unknown"
        entries.append(
            f"[Evidence {index}] source_type={source_type}; "
            f"source_name={source_name}; locator={locator_text}\n"
            f"{doc.page_content}"
        )

    return "\n\n".join(entries)

def get_rubric_criteria():
    """Return the evaluation rubric criteria, in scoring order.

    A new list is built on every call, so callers may modify the result.
    """
    criteria = (
        "Problem Understanding",
        "Technical Approach",
        "Implementation Quality",
        "Innovation / Originality",
        "Communication & Demo Clarity",
        "Claim vs Reality Alignment",
        "Prototype Functionality",
    )
    return list(criteria)

def parse_json_response(raw_response):
    """Parse a model reply as JSON, tolerating text around the JSON object.

    The reply is parsed as-is first. If that fails, the span from the first
    ``{`` to the last ``}`` is parsed instead, which handles replies wrapped
    in prose or Markdown code fences.

    Args:
        raw_response: Text returned by the model, or ``None`` when the model
            produced no content.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If the reply is ``None`` or contains no
            parseable JSON.
    """
    if raw_response is None:
        # json.loads(None) would raise TypeError, which callers that handle
        # JSONDecodeError would not catch; report it as a decode failure.
        raise json.JSONDecodeError("Model returned no content", "", 0)

    try:
        return json.loads(raw_response)
    except json.JSONDecodeError:
        start = raw_response.find("{")
        end = raw_response.rfind("}")
        if start != -1 and end > start:
            return json.loads(raw_response[start:end + 1])
        raise

def _coerce_score(value):
    """Return ``value`` as an integer score clamped to 1..5 (1 when unusable)."""
    try:
        number = round(float(value))
    except (TypeError, ValueError, OverflowError):
        # Missing, non-numeric, NaN or infinite scores fall back to the minimum.
        return 1
    return max(1, min(5, number))


def _coerce_confidence(value):
    """Return ``value`` as a float clamped to 0.0..1.0 (0.0 when unusable)."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return 0.0
    if number != number:
        # NaN compares unequal to itself; it carries no usable confidence.
        return 0.0
    return max(0.0, min(1.0, number))


def normalize_evaluation_response(data):
    """Coerce a parsed model reply into the fixed evaluation schema.

    Unknown top-level keys are dropped and missing or wrongly typed sections
    are replaced by defaults. ``scores`` always contains exactly one entry per
    rubric criterion, in rubric order, including when ``data`` is not a dict.

    Score entries are matched to rubric criteria ignoring case and surrounding
    whitespace. Scores given as integers, floats or numeric strings are
    rounded and clamped to 1..5; confidences are clamped to 0.0..1.0.

    Args:
        data: Decoded JSON value returned by the model.

    Returns:
        A dict with the keys ``project_summary``, ``sources_used``,
        ``claims_detected``, ``capabilities_detected``, ``evidence``,
        ``gaps_or_risks``, ``scores`` and ``overall_assessment``.
    """
    defaults = {
        "project_summary": {
            "purpose": "",
            "high_level_description": ""
        },
        "sources_used": [],
        "claims_detected": [],
        "capabilities_detected": [],
        "evidence": [],
        "gaps_or_risks": [],
        "scores": [],
        "overall_assessment": {
            "verdict": "",
            "confidence": "low",
            "reason": ""
        }
    }

    # A non-dict payload is treated as an empty one so the result has the same
    # shape (including per-criterion scores) as any other reply.
    source = data if isinstance(data, dict) else {}
    normalized = {key: source.get(key, default) for key, default in defaults.items()}

    summary = normalized["project_summary"]
    if isinstance(summary, dict):
        normalized["project_summary"] = {
            "purpose": summary.get("purpose", ""),
            "high_level_description": summary.get("high_level_description", "")
        }
    else:
        normalized["project_summary"] = defaults["project_summary"]

    assessment = normalized["overall_assessment"]
    if isinstance(assessment, dict):
        normalized["overall_assessment"] = {
            "verdict": assessment.get("verdict", ""),
            "confidence": assessment.get("confidence", "low"),
            "reason": assessment.get("reason", "")
        }
    else:
        normalized["overall_assessment"] = defaults["overall_assessment"]

    for key in ["sources_used", "claims_detected", "capabilities_detected", "evidence", "gaps_or_risks", "scores"]:
        if not isinstance(normalized[key], list):
            normalized[key] = []

    rubric = get_rubric_criteria()
    # Case-folded name -> canonical rubric name, for tolerant matching.
    canonical_names = {name.casefold(): name for name in rubric}

    score_lookup = {}
    for item in normalized["scores"]:
        if not isinstance(item, dict):
            continue

        criterion = item.get("criterion")
        if not isinstance(criterion, str):
            continue

        canonical = canonical_names.get(criterion.strip().casefold())
        if canonical is None:
            # Entries outside the rubric never reach the output.
            continue

        citations = item.get("citations", [])
        score_lookup[canonical] = {
            "criterion": canonical,
            "score": _coerce_score(item.get("score")),
            "reasoning": item.get("reasoning", ""),
            "citations": citations if isinstance(citations, list) else [],
            "confidence": _coerce_confidence(item.get("confidence", 0.0))
        }

    normalized["scores"] = [
        score_lookup.get(
            criterion,
            {
                "criterion": criterion,
                "score": 1,
                "reasoning": "",
                "citations": [],
                "confidence": 0.0
            }
        )
        for criterion in rubric
    ]

    return normalized

def run_evaluation():
    """Retrieve the user's evidence, ask the model to evaluate it and show the result.

    The flow stops early with a warning when the user has no stored data or
    nothing can be retrieved. If the model reply is not valid JSON, an error
    and the raw reply are shown instead. Progress is reported through the
    module-level ``process`` placeholder.
    """
    collection_name = f"multigpt_{st.session_state['user_id']}"
    logging.info(f"Starting evaluation for collection '{collection_name}'")

    if not collection_has_data(collection_name):
        logging.info("Evaluation skipped because no uploaded project data was found")
        st.warning("No uploaded project data found for this user yet.")
        return

    process.warning("Retrieving project evidence")
    logging.info("Retrieving project evidence from Milvus")
    retrieval_query = (
        "Evaluate this software project using all available uploaded evidence. "
        "Summarize capabilities, evidence, gaps, and overall assessment."
    )
    vector_store = get_vector_store(collection_name)
    documents = vector_store.similarity_search(retrieval_query, k=16)

    if not documents:
        logging.info("Evaluation stopped because no retrievable evidence was found")
        st.warning("No retrievable evidence was found for evaluation.")
        return

    context = format_context(documents)
    prompt = build_evaluation_prompt(context, get_rubric_criteria())

    process.warning("Running evaluation")
    logging.info(f"Running evaluator on {len(documents)} retrieved evidence chunks")
    raw_response = run_llm(prompt)

    try:
        decoded = parse_json_response(raw_response)
        evaluation = normalize_evaluation_response(decoded)
    except json.JSONDecodeError:
        # Show the raw reply so the user can see what the model produced.
        logging.info("Model response was not valid JSON")
        st.error("The model response was not valid JSON.")
        st.code(raw_response, language="json")
        return

    logging.info("Evaluation completed successfully")
    process.success("Evaluation ready")
    st.json(evaluation)

def add_evidence_page():
    """Render the "Add Evidence" page and ingest whatever the user supplies.

    The sidebar radio selects the evidence type; the matching input widget is
    then shown and its value is passed to the corresponding ingest function.

    NOTE(review): ingestion runs whenever the widget holds a value while the
    script executes, so a rerun with the same value would presumably ingest it
    again. Confirm whether that is intended.
    """
    placeholder.title("Add Evidence")

    choice = st.sidebar.radio("Evidence Type", ['', 'DOCUMENT', 'CODE', 'URL', 'VIDEO'])

    if choice == 'DOCUMENT':
        st.caption("Upload decks, notes, specs, or README-style documents.")
        file = st.file_uploader("Upload document", type=["txt", "md", "pdf", "pptx"])
        if not file:
            return

        # Map each supported extension to the function that ingests it.
        document_handlers = {
            ".txt": ingest_text_document,
            ".md": ingest_text_document,
            ".pdf": ingest_pdf_document,
            ".pptx": ingest_pptx_document,
        }
        extension = os.path.splitext(file.name)[1].lower()
        handler = document_handlers.get(extension)
        if handler is None:
            st.error("Unsupported document type.")
        else:
            handler(file)
        return

    if choice == 'CODE':
        st.caption("Upload source or configuration files that represent the implementation.")
        files = st.file_uploader(
            "Upload code files",
            type=CODE_FILE_TYPES,
            accept_multiple_files=True
        )
        if files:
            ingest_code_files(files)
        return

    if choice == 'URL':
        st.caption("Add a product page, documentation page, or prototype URL.")
        url = st.text_input("Enter URL")
        if url:
            ingest_url(url)
        return

    if choice == 'VIDEO':
        st.caption("Add a YouTube demo or walkthrough link.")
        link = st.text_input("YouTube link")
        if link:
            ingest_youtube_video(link)

def evaluate_page():
    """Render the "Run Evaluation" page: inventory first, then the run button."""
    placeholder.title("Run Evaluation")
    st.write("Generate a structured evaluation using all uploaded evidence for this submission.")
    render_evidence_inventory()

    run_clicked = st.button("Run Evaluation")
    if run_clicked:
        run_evaluation()

def main():
    """Entry point: show the login form or route to the selected page.

    Creates the module-level ``placeholder`` and ``process`` slots that the
    page and ingest functions write titles and status messages into.
    """
    global placeholder, process

    placeholder, process = st.empty(), st.empty()

    # Nothing else is reachable until a user id is stored in the session.
    if "user_id" not in st.session_state:
        login()
        return

    current_user = st.session_state["user_id"]
    st.sidebar.write(f"👤 {current_user}")

    page = st.sidebar.radio("Navigate", ['Add Evidence', 'Evaluate', 'Logout'])

    if page == "Logout":
        logging.info("Logging out and clearing session")
        st.session_state.clear()
        st.rerun()
    elif page == "Evaluate":
        evaluate_page()
    elif page == "Add Evidence":
        add_evidence_page()

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()