Spaces:
Sleeping
Sleeping
| from fastapi import FastAPI, HTTPException, File, UploadFile | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from pydantic import BaseModel | |
| from sentence_transformers import SentenceTransformer, util | |
| import google.generativeai as genai | |
| import pdfplumber | |
| import pickle | |
| import io | |
| import os | |
| from datetime import datetime | |
# ================= APP =================
# ASGI application object for the plagiarism API.
app = FastAPI(title="Jimma University Plagiarism API")

# CORS is left fully open: any origin, HTTP method and request header is accepted.
app.add_middleware(
    CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"]
)
# ================= CONFIG =================
# Location handed to SentenceTransformer when the embedding model is loaded.
MODEL_PATH = "plagiarism_model"

# Pickle file holding the pre-computed reference embeddings and their metadata.
EMBEDDINGS_FILE = "reference_embeddings.pkl"

# Read from the environment; falls back to the placeholder "YOUR_KEY_HERE"
# when GEMINI_API_KEY is not set.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "YOUR_KEY_HERE")

# Best-match scores (in percent) below this value are reported as low risk.
SIMILARITY_THRESHOLD = 30.0
# ================= LOAD SBERT MODEL =================
# Loaded once at import time; the request handlers share this module-level model.
# NOTE(review): the leading "β" in the log messages below looks like a
# mis-decoded symbol — verify against the deployed file.
model = SentenceTransformer(MODEL_PATH)
print("β Model loaded:", MODEL_PATH)

# ================= LOAD REFERENCE DATA =================
# The pickle is read as a mapping with an "embeddings" entry (compared against
# query embeddings) and a "df_ref" entry (row metadata looked up via .iloc).
# NOTE(review): pickle.load can execute arbitrary code from the file — only
# ship an EMBEDDINGS_FILE that was generated from a trusted source.
with open(EMBEDDINGS_FILE, "rb") as f:
    data = pickle.load(f)
ref_embeddings, df_ref = data["embeddings"], data["df_ref"]
print("β Reference dataset loaded")

# ================= GEMINI =================
# Configure the Gemini SDK and create the model handle used for written reports.
genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel("gemini-2.5-flash")
# ================= REQUEST MODEL =================
class PlagiarismRequest(BaseModel):
    """Request body for the text plagiarism check.

    Only ``text`` is required. The remaining fields are optional metadata
    that the text-check handler interpolates into the Gemini prompt.
    """

    # Submission text that is embedded and compared with the reference set.
    text: str

    # Submission metadata; each field falls back to a placeholder default.
    title: str = "Unknown"
    student_name: str = "Unknown"
    year: str = "2026"
# ================= HEALTH CHECK =================
# NOTE(review): no route decorator is visible on this handler in this view —
# confirm it is registered on `app`. The trailing "π" in the message looks
# like a mis-decoded emoji; verify against the deployed file.
def home():
    """Return a static payload signalling that the API process is running."""
    payload = {"message": "Plagiarism API Running π"}
    return payload
# ================= TEXT CHECK API =================
# NOTE(review): no route decorator is visible on this handler in this view —
# confirm it is registered on `app` (presumably as a POST route).
async def check_plagiarism(req: PlagiarismRequest):
    """Score submitted text against the reference set and describe the best match.

    The text is stripped, limited to its first 8000 characters, embedded with
    the SBERT model and compared (cosine similarity, scaled to percent) with
    every reference embedding. Only the single highest-scoring reference is
    reported.

    Args:
        req: Request body carrying the text plus optional title, student name
            and year, which are only used inside the Gemini prompt.

    Returns:
        A dict. When the best score is below ``SIMILARITY_THRESHOLD`` it has
        ``status="low_risk"`` with ``similarity_percent``, ``rating``,
        ``most_similar_source`` and ``message``. Otherwise it has
        ``status="suspicious"`` with ``similarity_percent``, ``rating``,
        ``stars``, ``most_similar_source``, ``source_student``,
        ``gemini_report`` and ``timestamp``.

    Raises:
        HTTPException: 400 when the stripped text is shorter than 100
            characters.
    """
    # Local import so this handler does not rely on a file-level import.
    import asyncio

    text = req.text.strip()
    if len(text) < 100:
        raise HTTPException(400, "Text too short")
    # Only the first 8000 characters are embedded; slicing is a no-op for
    # shorter input.
    text = text[:8000]

    # ================= SBERT =================
    # NOTE(review): encoding runs synchronously on the event loop. It is left
    # that way because it is not established here that the shared model can be
    # used from several threads at once.
    query_embedding = model.encode(
        text,
        convert_to_tensor=True,
        normalize_embeddings=True,
    )
    scores = util.cos_sim(query_embedding, ref_embeddings)[0]
    scores = (scores * 100).cpu().numpy()
    top_idx = int(scores.argmax())
    top_score = float(scores[top_idx])
    row = df_ref.iloc[top_idx]

    # Computed once and reused by the prompt and both response shapes.
    similarity_percent = round(top_score, 2)
    source_title = str(row.get("title", "N/A"))

    # ================= LOW RISK =================
    if top_score < SIMILARITY_THRESHOLD:
        return {
            "status": "low_risk",
            "similarity_percent": similarity_percent,
            "rating": 1,
            "most_similar_source": source_title,
            "message": "No significant plagiarism detected",
        }

    # ================= GEMINI REPORT =================
    prompt = (
        "\n"
        "You are an academic plagiarism expert.\n"
        f"Title: {req.title}\n"
        f"Student: {req.student_name}\n"
        f"Year: {req.year}\n"
        f"Similarity: {top_score:.2f}%\n"
        f"Source: {source_title}\n"
        "Give:\n"
        "1. Similarity explanation\n"
        "2. Risk level\n"
        "3. Recommendation\n"
    )
    try:
        # generate_content is a blocking network call; running it in a worker
        # thread keeps the event loop free to serve other requests meanwhile.
        response = await asyncio.to_thread(gemini_model.generate_content, prompt)
        report = response.text
    except Exception as e:
        # Best effort: a failed report must not fail the whole check, so the
        # error text is returned in place of the report.
        report = f"Gemini error: {e}"

    # ================= RESPONSE =================
    high_similarity = top_score > 70
    return {
        "status": "suspicious",
        "similarity_percent": similarity_percent,
        "rating": 4 if high_similarity else 3,
        # Filled stars mirror the numeric rating (4 or 3 out of 5).
        "stars": "★★★★☆" if high_similarity else "★★★☆☆",
        "most_similar_source": source_title,
        "source_student": str(row.get("student_name", "N/A")),
        "gemini_report": report,
        "timestamp": datetime.now().isoformat(),
    }
# ================= PDF UPLOAD API (OPTIONAL) =================
# NOTE(review): no route decorator is visible on this handler in this view —
# confirm it is registered on `app` (presumably as a POST route).
async def check_pdf(file: UploadFile = File(...)):
    """Extract text from an uploaded PDF and report its closest reference match.

    Args:
        file: The uploaded PDF.

    Returns:
        ``{"error": "PDF too short"}`` when fewer than 100 characters of text
        were extracted (returned as a normal response, not an HTTP error);
        otherwise a dict with ``status="done"``, ``similarity_percent`` and
        ``best_match``.

    Raises:
        HTTPException: 400 when the upload cannot be opened or parsed as a PDF.
    """
    content = await file.read()

    try:
        with pdfplumber.open(io.BytesIO(content)) as pdf:
            # Extract each page exactly once instead of once for the
            # emptiness check and again for the concatenation.
            page_texts = [page.extract_text() for page in pdf.pages]
    except Exception as exc:
        # The parser raises several unrelated exception types for corrupt or
        # non-PDF uploads; report them as a client error rather than a 500.
        raise HTTPException(400, "Could not read PDF") from exc

    # Pages without extractable text are skipped; each kept page ends in "\n".
    text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)
    if len(text) < 100:
        return {"error": "PDF too short"}

    query_embedding = model.encode(
        text,
        convert_to_tensor=True,
        normalize_embeddings=True,
    )
    scores = util.cos_sim(query_embedding, ref_embeddings)[0]
    scores = (scores * 100).cpu().numpy()
    top_idx = int(scores.argmax())
    top_score = float(scores[top_idx])
    row = df_ref.iloc[top_idx]

    return {
        "status": "done",
        "similarity_percent": round(top_score, 2),
        "best_match": str(row.get("title", "N/A")),
    }