Spaces:
Sleeping
Sleeping
| from fastapi import FastAPI, HTTPException | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from pydantic import BaseModel | |
| from sentence_transformers import SentenceTransformer, util | |
| import pickle | |
| import google.generativeai as genai | |
| from datetime import datetime | |
| from typing import Dict | |
| import os | |
# FastAPI application instance; all routes below register against it.
app = FastAPI(title="Jimma University Plagiarism API")

# ====================== SAFE LIMITS ======================
MAX_TEXT_LENGTH = 8000        # submitted text is truncated to this many characters
MAX_PROMPT_LENGTH = 4000      # Gemini prompt is truncated to this many characters
SIMILARITY_THRESHOLD = 30.0   # percent; at or below this a submission is "low risk"
# =========================================================
| # ====================== RATING FUNCTION ====================== | |
def convert_to_rating(similarity_percent: float) -> int:
    """Map a similarity percentage (0-100) onto a 1-5 star rating.

    80%+ -> 5, 60%+ -> 4, 40%+ -> 3, 20%+ -> 2, anything lower -> 1.
    """
    # Thresholds paired with their ratings, checked from highest down.
    for threshold, rating in ((80, 5), (60, 4), (40, 3), (20, 2)):
        if similarity_percent >= threshold:
            return rating
    return 1
| # ============================================================ | |
| # ====================== ROOT ====================== | |
# FIX: no route decorator was present in the extracted source, so this
# endpoint was never registered with FastAPI. @app.get("/") restores it —
# confirm the original path against the deployed Space.
@app.get("/")
def home():
    """Root endpoint: simple liveness message for the API."""
    return {"message": "Jimma University Plagiarism API is running π"}
# FIX: no route decorator was present in the extracted source, so this
# endpoint was never registered with FastAPI. @app.get("/health") restores
# it — confirm the original path against the deployed Space.
@app.get("/health")
def health():
    """Health-check endpoint for uptime probes."""
    return {"status": "ok"}
| # ================================================== | |
# Allow cross-origin requests from any origin, method, and header.
# NOTE(review): wide-open CORS is fine for a demo Space, but consider
# restricting allow_origins before production use.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# ====================== CONFIG ======================
# SECURITY FIX: the previous code shipped a hard-coded Gemini API key as the
# os.getenv() fallback. Secrets must never live in source control — that key
# is leaked and should be revoked. The key is now read exclusively from the
# environment; if it is unset, genai.configure()/generate_content() will
# fail loudly instead of silently using the committed credential.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")

MODEL_PATH = "plagiarism_sbert_model"          # local fine-tuned SBERT model folder
EMBEDDINGS_FILE = "reference_embeddings.pkl"   # pickled corpus embeddings + metadata
# ===================================================
# ====================== LOAD MODEL (FIXED) ======================
# Fail fast at import time if the fine-tuned SBERT model folder is absent.
# (The "β" glyphs in messages are mojibake of emoji from the original file.)
if not os.path.exists(MODEL_PATH):
    raise RuntimeError("β Model folder not found")
model = SentenceTransformer(MODEL_PATH)
print("β SBERT model loaded")
# ====================== LOAD REFERENCE DATASET ======================
# Fail fast if the precomputed reference embeddings are missing.
if not os.path.exists(EMBEDDINGS_FILE):
    raise RuntimeError("β Reference embeddings file not found")
# NOTE(review): pickle.load is unsafe on untrusted files — acceptable only
# because this file ships with the app; confirm its provenance.
with open(EMBEDDINGS_FILE, "rb") as f:
    data = pickle.load(f)
# Tensor of reference-corpus embeddings and the matching metadata table.
ref_embeddings = data["embeddings"]
df_ref = data["df_ref"]  # presumably a pandas DataFrame with columns title/student_name/year — verify
print("β Reference dataset loaded")
# ================================================================
# ====================== GEMINI ======================
# Configure the Gemini client once at startup; it drafts the supervisor-style
# report for suspicious submissions in check_plagiarism.
genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel('gemini-2.5-flash')
print("β System ready")
# ===================================================
# ====================== REQUEST MODEL ======================
class PlagiarismRequest(BaseModel):
    """Payload for a plagiarism-check request."""

    text: str                               # document body to score (min 200 chars enforced)
    title: str = "Submitted Document"       # title of the suspicious document
    student_name: str = "Unknown Student"   # submitting student's name
    year: str = "2026"                      # submission year (kept as a string)
# ============================================================
# ====================== API ======================
# FIX: no route decorator was present in the extracted source, so this
# endpoint was never registered with FastAPI. @app.post("/check") restores
# it — confirm the original path against the deployed Space.
@app.post("/check")
async def check_plagiarism(req: PlagiarismRequest) -> Dict:
    """Score req.text against the reference corpus and report plagiarism risk.

    Returns a low-risk summary when the best cosine match is at or below
    SIMILARITY_THRESHOLD, otherwise a "suspicious" report augmented with a
    Gemini-generated supervisor comparison.

    Raises:
        HTTPException(400): text shorter than 200 characters.
        HTTPException(500): SBERT encoding/similarity failure.
    """
    # NOTE(review): model.encode and generate_content are blocking calls
    # inside an async def — they stall the event loop. Consider a plain
    # `def` handler (FastAPI threads it) or run_in_executor.
    text = req.text.strip()
    if len(text) < 200:
        raise HTTPException(400, "Text too short (minimum 200 characters)")
    # Silently truncate oversized submissions to bound encoding cost.
    if len(text) > MAX_TEXT_LENGTH:
        text = text[:MAX_TEXT_LENGTH]

    # ================= SBERT ENCODING =================
    try:
        query_embedding = model.encode(
            text,
            convert_to_tensor=True,
            normalize_embeddings=True
        )
        # Cosine similarity stays in the 0-1 range...
        cosine_scores = util.cos_sim(query_embedding, ref_embeddings)[0]
        # ...so scaling by 100 yields a percentage per reference document.
        similarities = (cosine_scores * 100).cpu().numpy()
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Embedding error: {str(e)}")

    # ================= TOP MATCH =================
    top_idx = int(similarities.argmax())
    top_similarity = float(similarities[top_idx])
    rating = convert_to_rating(top_similarity)
    # Star string uses mojibake glyphs carried over from the original file.
    stars = "β " * rating + "β" * (5 - rating)

    # ================= LOW RISK =================
    if top_similarity <= SIMILARITY_THRESHOLD:
        return {
            "status": "low_risk",
            "similarity_percent": round(top_similarity, 2),
            "rating": rating,
            "stars": stars,
            "message": "No significant plagiarism detected."
        }

    # ================= SOURCE =================
    row = df_ref.iloc[top_idx]
    source_title = str(row.get("title", "Reference Project"))[:150]
    source_student = str(row.get("student_name", "Original Student"))
    source_year = str(row.get("year", "2023"))
    # NOTE(review): the 30/70 category cut-offs are hard-coded and only
    # coincidentally match SIMILARITY_THRESHOLD — consider named constants.
    category = "LOW" if top_similarity <= 30 else "MEDIUM" if top_similarity <= 70 else "HIGH"
    emoji = "β " if category == "LOW" else "βοΈ" if category == "MEDIUM" else "β"

    # ================= GEMINI PROMPT =================
    prompt = f"""
You are a strict academic plagiarism supervisor at Jimma University.
{emoji} {category} SIMILARITY CASE
Source Title: {source_title}
Student Name: {source_student}
Year: {source_year}
Suspicious Title: {req.title}
Student Name: {req.student_name}
Year: {req.year}
Similarity Score: {top_similarity:.1f}%
1. Conceptual Similarity:
2. Conceptual Differences:
3. Technology Differences:
4. Supervisor Recommendation:
"""
    try:
        # Bound prompt size, then ask Gemini for the comparison report.
        prompt = prompt[:MAX_PROMPT_LENGTH]
        response = gemini_model.generate_content(prompt)
        report = response.text.strip()
    except Exception as e:
        # Best-effort: a Gemini failure degrades the report, not the endpoint.
        report = f"Gemini error: {str(e)}"

    return {
        "status": "suspicious",
        "similarity_percent": round(top_similarity, 2),
        "rating": rating,
        "stars": stars,
        "most_similar_source": source_title,
        "source_student": source_student,
        "gemini_report": report,
        "timestamp": datetime.now().isoformat()
    }