"""Jimma University Plagiarism API.

FastAPI service that scores submitted text (or uploaded PDFs) against a
precomputed reference corpus using SBERT cosine similarity, and asks
Gemini for a short plagiarism report when the similarity is suspicious.
"""

from fastapi import FastAPI, HTTPException, File, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai
import pdfplumber
import pickle
import io
import os
from datetime import datetime

# ================= APP =================
app = FastAPI(title="Jimma University Plagiarism API")

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # NOTE(review): wide-open CORS — restrict origins in production
    allow_methods=["*"],
    allow_headers=["*"],
)

# ================= CONFIG =================
MODEL_PATH = "plagiarism_model"
EMBEDDINGS_FILE = "reference_embeddings.pkl"
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "YOUR_KEY_HERE")
SIMILARITY_THRESHOLD = 30.0  # percent; below this the submission is reported low-risk
MAX_TEXT_CHARS = 8000        # cap on characters fed to the encoder (was inline 8000)

# ================= LOAD SBERT MODEL =================
model = SentenceTransformer(MODEL_PATH)
print("✅ Model loaded:", MODEL_PATH)

# ================= LOAD REFERENCE DATA =================
# NOTE(review): pickle.load is only safe on trusted, locally produced files —
# never point EMBEDDINGS_FILE at user-supplied data.
with open(EMBEDDINGS_FILE, "rb") as f:
    data = pickle.load(f)

ref_embeddings = data["embeddings"]  # precomputed reference embeddings (tensor)
df_ref = data["df_ref"]              # reference metadata frame, row-aligned with embeddings
print("✅ Reference dataset loaded")

# ================= GEMINI =================
genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel("gemini-2.5-flash")


# ================= REQUEST MODEL =================
class PlagiarismRequest(BaseModel):
    text: str
    title: str = "Unknown"
    student_name: str = "Unknown"
    year: str = "2026"


def _best_match(text: str):
    """Return ``(top_score_percent, reference_row)`` for *text*.

    Shared by both endpoints: encodes the text with SBERT, takes cosine
    similarity against the reference embeddings, and picks the best match.
    """
    query_embedding = model.encode(
        text,
        convert_to_tensor=True,
        normalize_embeddings=True,
    )
    # cos_sim yields a 1 x N tensor; scale to percent before argmax.
    scores = (util.cos_sim(query_embedding, ref_embeddings)[0] * 100).cpu().numpy()
    top_idx = int(scores.argmax())
    return float(scores[top_idx]), df_ref.iloc[top_idx]


# ================= HEALTH CHECK =================
@app.get("/")
def home():
    """Liveness probe."""
    return {"message": "Plagiarism API Running 🚀"}


# ================= TEXT CHECK API =================
@app.post("/check_plagiarism")
async def check_plagiarism(req: PlagiarismRequest):
    """Score submitted text against the reference corpus.

    Raises HTTP 400 when the text is under 100 characters; silently
    truncates anything beyond MAX_TEXT_CHARS. Below SIMILARITY_THRESHOLD
    a low-risk summary is returned; otherwise Gemini is asked for a
    short report (best-effort — a Gemini failure is embedded in the
    response rather than failing the request).
    """
    text = req.text.strip()
    if len(text) < 100:
        raise HTTPException(400, "Text too short")
    # Unconditional slice is a no-op for short text — same effect as the
    # previous `if len(text) > 8000` guard.
    text = text[:MAX_TEXT_CHARS]

    # ================= SBERT =================
    top_score, row = _best_match(text)

    # ================= LOW RISK =================
    if top_score < SIMILARITY_THRESHOLD:
        return {
            "status": "low_risk",
            "similarity_percent": round(top_score, 2),
            "rating": 1,
            "most_similar_source": str(row.get("title", "N/A")),
            "message": "No significant plagiarism detected"
        }

    # ================= GEMINI REPORT =================
    prompt = f"""
You are an academic plagiarism expert.
Title: {req.title}
Student: {req.student_name}
Year: {req.year}
Similarity: {top_score:.2f}%
Source: {row.get("title", "N/A")}
Give:
1. Similarity explanation
2. Risk level
3. Recommendation
"""
    try:
        response = gemini_model.generate_content(prompt)
        report = response.text
    except Exception as e:
        # Deliberate best-effort: the similarity verdict is still useful
        # even when the LLM report fails (quota, network, etc.).
        report = f"Gemini error: {str(e)}"

    # ================= RESPONSE =================
    return {
        "status": "suspicious",
        "similarity_percent": round(top_score, 2),
        "rating": 4 if top_score > 70 else 3,
        "stars": "★★★★☆" if top_score > 70 else "★★★☆☆",
        "most_similar_source": str(row.get("title", "N/A")),
        "source_student": str(row.get("student_name", "N/A")),
        "gemini_report": report,
        "timestamp": datetime.now().isoformat()
    }


# ================= PDF UPLOAD API (OPTIONAL) =================
@app.post("/check_pdf")
async def check_pdf(file: UploadFile = File(...)):
    """Extract text from an uploaded PDF and score it against the corpus.

    Returns ``{"error": ...}`` (HTTP 200) when the PDF yields under 100
    characters of text — kept for backward compatibility with existing
    clients, though an HTTPException would be more RESTful.
    """
    content = await file.read()
    text = ""
    with pdfplumber.open(io.BytesIO(content)) as pdf:
        for page in pdf.pages:
            # extract_text() can return None for image-only pages; call it
            # ONCE per page — the original called it twice, doubling the
            # (expensive) extraction work.
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"

    if len(text) < 100:
        return {"error": "PDF too short"}

    # Apply the same MAX_TEXT_CHARS cap as the text endpoint for
    # consistency (the original encoded arbitrarily large PDFs).
    top_score, row = _best_match(text[:MAX_TEXT_CHARS])

    return {
        "status": "done",
        "similarity_percent": round(top_score, 2),
        "best_match": str(row.get("title", "N/A"))
    }