"""Jimma University Plagiarism API.

FastAPI service that scores submitted text (or uploaded PDFs) against a
precomputed reference corpus using SBERT cosine similarity, and asks
Gemini for a short plagiarism report when the similarity is suspicious.
"""

from fastapi import FastAPI, HTTPException, File, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai
import pdfplumber
import pickle
import io
import os
from datetime import datetime

# ================= APP =================
app = FastAPI(title="Jimma University Plagiarism API")

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # NOTE(review): wide-open CORS — restrict origins in production
    allow_methods=["*"],
    allow_headers=["*"],
)

# ================= CONFIG =================
MODEL_PATH = "plagiarism_model"
EMBEDDINGS_FILE = "reference_embeddings.pkl"
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "YOUR_KEY_HERE")
SIMILARITY_THRESHOLD = 30.0  # percent; below this the submission is reported low-risk
MAX_TEXT_CHARS = 8000        # cap on characters fed to the encoder (was inline 8000)

# ================= LOAD SBERT MODEL =================
model = SentenceTransformer(MODEL_PATH)
print("✅ Model loaded:", MODEL_PATH)

# ================= LOAD REFERENCE DATA =================
# NOTE(review): pickle.load is only safe on trusted, locally produced files —
# never point EMBEDDINGS_FILE at user-supplied data.
with open(EMBEDDINGS_FILE, "rb") as f:
    data = pickle.load(f)

ref_embeddings = data["embeddings"]  # precomputed reference embeddings (tensor)
df_ref = data["df_ref"]              # reference metadata frame, row-aligned with embeddings
print("✅ Reference dataset loaded")

# ================= GEMINI =================
genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel("gemini-2.5-flash")


# ================= REQUEST MODEL =================
class PlagiarismRequest(BaseModel):
    text: str
    title: str = "Unknown"
    student_name: str = "Unknown"
    year: str = "2026"


def _best_match(text: str):
    """Return ``(top_score_percent, reference_row)`` for *text*.

    Shared by both endpoints: encodes the text with SBERT, takes cosine
    similarity against the reference embeddings, and picks the best match.
    """
    query_embedding = model.encode(
        text,
        convert_to_tensor=True,
        normalize_embeddings=True,
    )
    # cos_sim yields a 1 x N tensor; scale to percent before argmax.
    scores = (util.cos_sim(query_embedding, ref_embeddings)[0] * 100).cpu().numpy()
    top_idx = int(scores.argmax())
    return float(scores[top_idx]), df_ref.iloc[top_idx]


# ================= HEALTH CHECK =================
@app.get("/")
def home():
    """Liveness probe."""
    return {"message": "Plagiarism API Running 🚀"}


# ================= TEXT CHECK API =================
@app.post("/check_plagiarism")
async def check_plagiarism(req: PlagiarismRequest):
    """Score submitted text against the reference corpus.

    Raises HTTP 400 when the text is under 100 characters; silently
    truncates anything beyond MAX_TEXT_CHARS. Below SIMILARITY_THRESHOLD
    a low-risk summary is returned; otherwise Gemini is asked for a
    short report (best-effort — a Gemini failure is embedded in the
    response rather than failing the request).
    """
    text = req.text.strip()
    if len(text) < 100:
        raise HTTPException(400, "Text too short")
    # Unconditional slice is a no-op for short text — same effect as the
    # previous `if len(text) > 8000` guard.
    text = text[:MAX_TEXT_CHARS]

    # ================= SBERT =================
    top_score, row = _best_match(text)

    # ================= LOW RISK =================
    if top_score < SIMILARITY_THRESHOLD:
        return {
            "status": "low_risk",
            "similarity_percent": round(top_score, 2),
            "rating": 1,
            "most_similar_source": str(row.get("title", "N/A")),
            "message": "No significant plagiarism detected"
        }

    # ================= GEMINI REPORT =================
    prompt = f"""
You are an academic plagiarism expert.
Title: {req.title}
Student: {req.student_name}
Year: {req.year}
Similarity: {top_score:.2f}%
Source: {row.get("title", "N/A")}
Give:
1. Similarity explanation
2. Risk level
3. Recommendation
"""
    try:
        response = gemini_model.generate_content(prompt)
        report = response.text
    except Exception as e:
        # Deliberate best-effort: the similarity verdict is still useful
        # even when the LLM report fails (quota, network, etc.).
        report = f"Gemini error: {str(e)}"

    # ================= RESPONSE =================
    return {
        "status": "suspicious",
        "similarity_percent": round(top_score, 2),
        "rating": 4 if top_score > 70 else 3,
        "stars": "★★★★☆" if top_score > 70 else "★★★☆☆",
        "most_similar_source": str(row.get("title", "N/A")),
        "source_student": str(row.get("student_name", "N/A")),
        "gemini_report": report,
        "timestamp": datetime.now().isoformat()
    }


# ================= PDF UPLOAD API (OPTIONAL) =================
@app.post("/check_pdf")
async def check_pdf(file: UploadFile = File(...)):
    """Extract text from an uploaded PDF and score it against the corpus.

    Returns ``{"error": ...}`` (HTTP 200) when the PDF yields under 100
    characters of text — kept for backward compatibility with existing
    clients, though an HTTPException would be more RESTful.
    """
    content = await file.read()
    text = ""
    with pdfplumber.open(io.BytesIO(content)) as pdf:
        for page in pdf.pages:
            # extract_text() can return None for image-only pages; call it
            # ONCE per page — the original called it twice, doubling the
            # (expensive) extraction work.
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"

    if len(text) < 100:
        return {"error": "PDF too short"}

    # Apply the same MAX_TEXT_CHARS cap as the text endpoint for
    # consistency (the original encoded arbitrarily large PDFs).
    top_score, row = _best_match(text[:MAX_TEXT_CHARS])

    return {
        "status": "done",
        "similarity_percent": round(top_score, 2),
        "best_match": str(row.get("title", "N/A"))
    }