"""Jimma University Plagiarism Detection API.

FastAPI service that scores a submitted document against a pre-embedded
reference corpus using SBERT cosine similarity, then asks Gemini for a
supervisor-style comparison report when the similarity is suspicious.
"""

from datetime import datetime
from typing import Dict
import os
import pickle

from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai

app = FastAPI(title="Jimma University Plagiarism API")

# ====================== SAFE LIMITS ======================
MAX_TEXT_LENGTH = 8000        # submissions longer than this are truncated
MAX_PROMPT_LENGTH = 4000      # hard cap on the Gemini prompt size
SIMILARITY_THRESHOLD = 30.0   # percent; at or below => low risk, no report
# =========================================================


# ====================== RATING FUNCTION ======================
def convert_to_rating(similarity_percent: float) -> int:
    """Map a similarity percentage (0-100) to a 1-5 star rating.

    Bands: >=80 -> 5, >=60 -> 4, >=40 -> 3, >=20 -> 2, else 1.
    """
    if similarity_percent >= 80:
        return 5
    if similarity_percent >= 60:
        return 4
    if similarity_percent >= 40:
        return 3
    if similarity_percent >= 20:
        return 2
    return 1
# ============================================================


# ====================== ROOT ======================
@app.get("/")
def home():
    """Root liveness message."""
    return {"message": "Jimma University Plagiarism API is running 🚀"}


@app.get("/health")
def health():
    """Simple health probe for load balancers / uptime checks."""
    return {"status": "ok"}
# ==================================================


app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # NOTE(review): wide-open CORS — restrict in production
    allow_methods=["*"],
    allow_headers=["*"],
)

# ====================== CONFIG ======================
# SECURITY FIX: the previous revision shipped a hard-coded Gemini API key as
# the os.getenv() fallback. That key is compromised (it was committed to
# source) and must be revoked; the key now comes exclusively from the
# environment. Requests to Gemini will fail gracefully (see the try/except
# around generate_content) if it is missing.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
if not GEMINI_API_KEY:
    print("⚠️ GEMINI_API_KEY is not set — Gemini reports will fail")

MODEL_PATH = "plagiarism_sbert_model"
EMBEDDINGS_FILE = "reference_embeddings.pkl"
# ===================================================

# ====================== LOAD MODEL ======================
if not os.path.exists(MODEL_PATH):
    raise RuntimeError("❌ Model folder not found")

model = SentenceTransformer(MODEL_PATH)
print("✅ SBERT model loaded")

# ====================== LOAD REFERENCE DATASET ======================
if not os.path.exists(EMBEDDINGS_FILE):
    raise RuntimeError("❌ Reference embeddings file not found")

# NOTE: pickle is only acceptable here because this file is produced locally
# by a trusted indexing step — never load untrusted pickles.
with open(EMBEDDINGS_FILE, "rb") as f:
    data = pickle.load(f)

ref_embeddings = data["embeddings"]  # tensor of reference-corpus embeddings
df_ref = data["df_ref"]              # DataFrame with per-reference metadata
print("✅ Reference dataset loaded")
# ================================================================

# ====================== GEMINI ======================
genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel('gemini-2.5-flash')
print("✅ System ready")
# ===================================================


# ====================== REQUEST MODEL ======================
class PlagiarismRequest(BaseModel):
    """Payload for POST /check_plagiarism."""

    text: str
    title: str = "Submitted Document"
    student_name: str = "Unknown Student"
    year: str = "2026"
# ============================================================


# ====================== API ======================
@app.post("/check_plagiarism")
async def check_plagiarism(req: PlagiarismRequest) -> Dict:
    """Check a submitted document for plagiarism against the reference corpus.

    Returns a "low_risk" payload when the best cosine match is at or below
    SIMILARITY_THRESHOLD percent, otherwise a "suspicious" payload that
    includes the closest reference and a Gemini-generated supervisor report.

    Raises:
        HTTPException(400): text shorter than 200 characters.
        HTTPException(500): SBERT encoding failure.
    """
    text = req.text.strip()

    if len(text) < 200:
        raise HTTPException(400, "Text too short (minimum 200 characters)")

    # Truncate (rather than reject) oversize submissions.
    if len(text) > MAX_TEXT_LENGTH:
        text = text[:MAX_TEXT_LENGTH]

    # ================= SBERT ENCODING =================
    try:
        query_embedding = model.encode(
            text,
            convert_to_tensor=True,
            normalize_embeddings=True
        )

        # Cosine similarity stays in the 0-1 range; scale to percent.
        cosine_scores = util.cos_sim(query_embedding, ref_embeddings)[0]
        similarities = (cosine_scores * 100).cpu().numpy()
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Embedding error: {str(e)}")

    # ================= TOP MATCH =================
    top_idx = int(similarities.argmax())
    top_similarity = float(similarities[top_idx])

    rating = convert_to_rating(top_similarity)
    stars = "★" * rating + "☆" * (5 - rating)

    # ================= LOW RISK =================
    if top_similarity <= SIMILARITY_THRESHOLD:
        return {
            "status": "low_risk",
            "similarity_percent": round(top_similarity, 2),
            "rating": rating,
            "stars": stars,
            "message": "No significant plagiarism detected."
        }

    # ================= SOURCE =================
    row = df_ref.iloc[top_idx]
    source_title = str(row.get("title", "Reference Project"))[:150]
    source_student = str(row.get("student_name", "Original Student"))
    source_year = str(row.get("year", "2023"))

    # FIX: the old "LOW" ternary branch was unreachable — we only get here
    # when top_similarity > SIMILARITY_THRESHOLD (30.0), so only MEDIUM/HIGH
    # are possible. Behavior is unchanged.
    category = "MEDIUM" if top_similarity <= 70 else "HIGH"
    emoji = "⚖️" if category == "MEDIUM" else "❌"

    # ================= GEMINI PROMPT =================
    prompt = f"""
You are a strict academic plagiarism supervisor at Jimma University.

{emoji} {category} SIMILARITY CASE

Source Title: {source_title}
Student Name: {source_student}
Year: {source_year}

Suspicious Title: {req.title}
Student Name: {req.student_name}
Year: {req.year}

Similarity Score: {top_similarity:.1f}%

1. Conceptual Similarity:
2. Conceptual Differences:
3. Technology Differences:
4. Supervisor Recommendation:
"""

    try:
        prompt = prompt[:MAX_PROMPT_LENGTH]
        response = gemini_model.generate_content(prompt)
        report = response.text.strip()
    except Exception as e:
        # Best-effort: a Gemini failure degrades the report, never the check.
        report = f"Gemini error: {str(e)}"

    return {
        "status": "suspicious",
        "similarity_percent": round(top_similarity, 2),
        "rating": rating,
        "stars": stars,
        "most_similar_source": source_title,
        "source_student": source_student,
        "gemini_report": report,
        # NOTE(review): naive local timestamp — consider datetime.now(timezone.utc)
        "timestamp": datetime.now().isoformat()
    }