from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import List, Optional
import os
import threading

from database import search_papers
from nlp_utils import get_keywords_from_abstract
from fastapi.staticfiles import StaticFiles
import data_pipeline

# Application instance; middleware, routes and the static mount attach to it.
app = FastAPI()

# Directory served as the frontend, resolved relative to this file's location.
frontend_dir = os.path.join(os.path.dirname(__file__), "../frontend")

# CORS: accept any origin, method and header, with credentials allowed.
# NOTE(review): a wildcard origin combined with allow_credentials=True is very
# permissive — confirm this is intended outside local development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Kick off the data pipeline in a background thread when the app starts
@app.on_event("startup")
def startup_event():
    """Start ``data_pipeline.populate_db`` on a separate thread at startup.

    The work runs off the calling thread, so this handler returns as soon as
    the thread has been started.
    """
    print("Running data pipeline check on startup...")
    worker = threading.Thread(target=data_pipeline.populate_db)
    worker.start()

# Request body accepted by POST /api/search.
class SearchRequest(BaseModel):
    # Raw user input; interpreted according to query_type.
    query: str
    # "keywords" or "abstract". Only "abstract" is checked explicitly by the
    # search handler; any other value is handled as a keyword list.
    query_type: str # "keywords" or "abstract"
    # Forwarded unchanged to search_papers(); defaults to "relevance".
    sort_by: Optional[str] = "relevance"

# API routes (defined before static mount)
@app.post("/api/search")
def search(request: SearchRequest):
    """Search papers by explicit keywords or by keywords taken from an abstract.

    When ``request.query_type`` is ``"abstract"``, keywords are obtained from
    ``get_keywords_from_abstract`` (up to 8). Otherwise the query is split on
    commas if it contains one, else on whitespace.

    Returns:
        A dict with ``"results"`` (whatever ``search_papers`` returns) and
        ``"extracted_keywords"`` (the abstract-derived keywords, or an empty
        list for keyword queries).

    Raises:
        HTTPException: 400 when the query is blank or yields no keywords.
    """
    text = request.query.strip()
    if not text:
        raise HTTPException(status_code=400, detail="Query cannot be empty")

    is_abstract = request.query_type == "abstract"
    extracted_keywords = []

    if is_abstract:
        # Extract keywords using KeyBERT
        extracted_keywords = get_keywords_from_abstract(text, top_n=8)
        terms = extracted_keywords
    else:
        # A comma anywhere switches to comma-separated parsing; otherwise
        # every whitespace-delimited token is its own keyword.
        pieces = text.split(",") if "," in text else text.split()
        terms = [piece.strip() for piece in pieces if piece.strip()]

    if not terms:
        raise HTTPException(status_code=400, detail="No valid keywords found")

    return {
        "results": search_papers(terms, sort_by=request.sort_by),
        "extracted_keywords": extracted_keywords if is_abstract else [],
    }

# NOTE: this route must be registered BEFORE the catch-all static mount below.
# Routes are matched in registration order and a mount at "/" matches every
# path, so anything added after it would never be reached.
# NOTE(review): this endpoint returns raw file content without any access
# control — confirm it should be exposed outside development.
@app.get("/debug")
def debug_clembench():
    """Show how the field-splitting regex parses the clembench BibTeX entry.

    Reads ``anthology.bib`` next to this file, finds the first entry that
    contains ``"clembench: Using Game Play"`` and splits it into fields.

    Returns:
        On success, a dict with ``raw_entry`` (the entry text),
        ``parsed_keys`` (lower-cased field names in order) and
        ``abstract_val`` (the text after the first ``abstract`` key, or
        ``"NO ABSTRACT KEY FOUND"``). Otherwise ``{"error": ...}`` when the
        file or the entry is missing.
    """
    import re

    bib_file = os.path.join(os.path.dirname(__file__), "anthology.bib")
    try:
        with open(bib_file, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        return {"error": "file not found"}

    # Compiled once; the capture group makes split() return
    # [preamble, key1, value1, key2, value2, ...].
    field_split_re = re.compile(r'\n\s*([a-zA-Z_]+)\s*=\s*')

    for entry in re.split(r'\n@', content):
        if "clembench: Using Game Play" not in entry:
            continue

        raw = entry.strip()
        if raw.endswith('}'):
            # Drop the brace that closes the whole entry.
            raw = raw[:-1]

        parts = field_split_re.split(raw)
        parsed_keys = [key.lower() for key in parts[1::2]]
        values = parts[2::2]

        if 'abstract' in parsed_keys:
            abstract_val = values[parsed_keys.index('abstract')]
        else:
            abstract_val = "NO ABSTRACT KEY FOUND"

        return {
            "raw_entry": entry,
            "parsed_keys": parsed_keys,
            "abstract_val": abstract_val,
        }

    return {"error": "entry not found"}


# Serve static files from the root last
app.mount("/", StaticFiles(directory=frontend_dir, html=True), name="frontend")