docker_api / app.py
abdinkoo's picture
Update app.py
0dc67e0 verified
raw
history blame
5.8 kB
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer, util
import pickle
import google.generativeai as genai
from datetime import datetime
from typing import Dict
import os
# FastAPI application instance; routes and middleware below attach to this object.
app = FastAPI(title="Jimma University Plagiarism API")
# ====================== SAFE LIMITS ======================
MAX_TEXT_LENGTH = 8000  # submitted text is truncated beyond this many characters
MAX_PROMPT_LENGTH = 4000  # Gemini prompt is truncated to this length before sending
SIMILARITY_THRESHOLD = 30.0  # percent; at or below this the submission is reported low-risk
# =========================================================
# ====================== RATING FUNCTION ======================
def convert_to_rating(similarity_percent: float) -> int:
    """Map a similarity percentage to a 1-5 star rating.

    Thresholds: >=80 -> 5, >=60 -> 4, >=40 -> 3, >=20 -> 2, else 1.
    """
    # Descending (threshold, rating) pairs; the first match wins.
    for threshold, rating in ((80, 5), (60, 4), (40, 3), (20, 2)):
        if similarity_percent >= threshold:
            return rating
    return 1
# ============================================================
# ====================== ROOT ======================
@app.get("/")
def home():
    """Root endpoint: returns a simple liveness banner."""
    banner = {"message": "Jimma University Plagiarism API is running πŸš€"}
    return banner
@app.get("/health")
def health():
    """Health-check endpoint for orchestrators and uptime monitors."""
    return dict(status="ok")
# ==================================================
# NOTE(review): allow_origins=["*"] opens this API to every origin — confirm
# that is intended for production and restrict the origin list if not.
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_methods=["*"],
allow_headers=["*"],
)
# ====================== CONFIG ======================
# SECURITY FIX: the previous code shipped a hard-coded Gemini API key as the
# os.getenv fallback. A key committed to source control is compromised and
# must be rotated; the key is now read exclusively from the environment.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
if not GEMINI_API_KEY:
    # Warn at startup instead of crashing so non-Gemini endpoints still work.
    print("WARNING: GEMINI_API_KEY is not set; Gemini report generation will fail")

MODEL_PATH = "plagiarism_sbert_model"  # folder containing the fine-tuned SBERT model
EMBEDDINGS_FILE = "reference_embeddings.pkl"  # pickled reference embeddings + metadata
# ===================================================
# ====================== LOAD MODEL (FIXED) ======================
# Fail fast at import time if the model folder is missing, rather than
# erroring on the first request.
if not os.path.exists(MODEL_PATH):
    raise RuntimeError("❌ Model folder not found")

model = SentenceTransformer(MODEL_PATH)
print("βœ… SBERT model loaded")
# ====================== LOAD REFERENCE DATASET ======================
if not os.path.exists(EMBEDDINGS_FILE):
    raise RuntimeError("❌ Reference embeddings file not found")

# NOTE(review): pickle.load executes arbitrary code when unpickling hostile
# data — this file must come from a trusted source.
with open(EMBEDDINGS_FILE, "rb") as fh:
    payload = pickle.load(fh)

ref_embeddings = payload["embeddings"]  # reference vectors (tensor/array)
df_ref = payload["df_ref"]  # per-reference metadata (title, student_name, year)
print("βœ… Reference dataset loaded")
# ================================================================
# ====================== GEMINI ======================
# Configure the google-generativeai client with the key loaded above and
# create the model instance used to draft the plagiarism report.
genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel('gemini-2.5-flash')
print("βœ… System ready")
# ===================================================
# ====================== REQUEST MODEL ======================
class PlagiarismRequest(BaseModel):
    """Payload for POST /check_plagiarism: the document text plus metadata."""
    text: str
    title: str = "Submitted Document"
    student_name: str = "Unknown Student"
    year: str = "2026"
# ============================================================
# ====================== API ======================
@app.post("/check_plagiarism")
async def check_plagiarism(req: PlagiarismRequest) -> Dict:
    """Score the submitted text against the reference corpus.

    Encodes the text with SBERT, finds the closest reference by cosine
    similarity (reported as a percentage), and for suspicious matches asks
    Gemini to draft a supervisor report.

    Raises:
        HTTPException(400): text shorter than 200 characters.
        HTTPException(500): SBERT encoding/similarity failure.
    """
    text = req.text.strip()
    if len(text) < 200:
        raise HTTPException(400, "Text too short (minimum 200 characters)")
    if len(text) > MAX_TEXT_LENGTH:
        # Hard cap keeps encoding time and memory bounded.
        text = text[:MAX_TEXT_LENGTH]

    # ================= SBERT ENCODING =================
    try:
        query_embedding = model.encode(
            text,
            convert_to_tensor=True,
            normalize_embeddings=True
        )
        # Cosine similarity of normalized vectors lies in [-1, 1].
        cosine_scores = util.cos_sim(query_embedding, ref_embeddings)[0]
        similarities = (cosine_scores * 100).cpu().numpy()
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Embedding error: {str(e)}")

    # ================= TOP MATCH =================
    top_idx = int(similarities.argmax())
    # FIX: clamp at 0 so an anti-correlated best match cannot yield a
    # negative "similarity percent" in the response.
    top_similarity = max(0.0, float(similarities[top_idx]))
    rating = convert_to_rating(top_similarity)
    stars = "β˜…" * rating + "β˜†" * (5 - rating)

    # ================= LOW RISK =================
    if top_similarity <= SIMILARITY_THRESHOLD:
        return {
            "status": "low_risk",
            "similarity_percent": round(top_similarity, 2),
            "rating": rating,
            "stars": stars,
            "message": "No significant plagiarism detected."
        }

    # ================= SOURCE =================
    row = df_ref.iloc[top_idx]
    source_title = str(row.get("title", "Reference Project"))[:150]
    source_student = str(row.get("student_name", "Original Student"))
    source_year = str(row.get("year", "2023"))

    # FIX: the original "LOW" branch here was unreachable — this code only
    # runs when top_similarity > SIMILARITY_THRESHOLD (30), so only
    # MEDIUM/HIGH can apply.
    category = "MEDIUM" if top_similarity <= 70 else "HIGH"
    emoji = "βš–οΈ" if category == "MEDIUM" else "❌"

    # ================= GEMINI PROMPT =================
    prompt = f"""
You are a strict academic plagiarism supervisor at Jimma University.
{emoji} {category} SIMILARITY CASE
Source Title: {source_title}
Student Name: {source_student}
Year: {source_year}
Suspicious Title: {req.title}
Student Name: {req.student_name}
Year: {req.year}
Similarity Score: {top_similarity:.1f}%
1. Conceptual Similarity:
2. Conceptual Differences:
3. Technology Differences:
4. Supervisor Recommendation:
"""
    try:
        prompt = prompt[:MAX_PROMPT_LENGTH]
        response = gemini_model.generate_content(prompt)
        report = response.text.strip()
    except Exception as e:
        # Best-effort: surface the Gemini failure in the report field rather
        # than failing the whole request after similarity was computed.
        report = f"Gemini error: {str(e)}"

    return {
        "status": "suspicious",
        "similarity_percent": round(top_similarity, 2),
        "rating": rating,
        "stars": stars,
        "most_similar_source": source_title,
        "source_student": source_student,
        "gemini_report": report,
        # NOTE(review): naive local timestamp — consider timezone-aware UTC,
        # but changing it alters the response format for existing clients.
        "timestamp": datetime.now().isoformat()
    }