docker_api / app.py
abdinkoo's picture
Update app.py
0dc67e0 verified
raw
history blame
5.8 kB
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer, util
import pickle
import google.generativeai as genai
from datetime import datetime
from typing import Dict
import os
# FastAPI application instance; routes and middleware below attach to this object.
app = FastAPI(title="Jimma University Plagiarism API")
# ====================== SAFE LIMITS ======================
MAX_TEXT_LENGTH = 8000  # submitted text is truncated beyond this many characters
MAX_PROMPT_LENGTH = 4000  # Gemini prompt is truncated to this length before sending
SIMILARITY_THRESHOLD = 30.0  # percent; at or below this the submission is reported low-risk
# =========================================================
# ====================== RATING FUNCTION ======================
def convert_to_rating(similarity_percent: float) -> int:
    """Map a similarity percentage to a 1-5 star rating.

    Thresholds: >=80 -> 5, >=60 -> 4, >=40 -> 3, >=20 -> 2, else 1.
    """
    # Descending (threshold, rating) pairs; the first match wins.
    for threshold, rating in ((80, 5), (60, 4), (40, 3), (20, 2)):
        if similarity_percent >= threshold:
            return rating
    return 1
# ============================================================
# ====================== ROOT ======================
@app.get("/")
def home():
    """Root endpoint: returns a simple liveness banner."""
    banner = {"message": "Jimma University Plagiarism API is running πŸš€"}
    return banner
@app.get("/health")
def health():
    """Health-check endpoint for orchestrators and uptime monitors."""
    return dict(status="ok")
# ==================================================
# NOTE(review): allow_origins=["*"] opens this API to every origin — confirm
# that is intended for production and restrict the origin list if not.
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_methods=["*"],
allow_headers=["*"],
)
# ====================== CONFIG ======================
# SECURITY FIX: the previous code shipped a hard-coded Gemini API key as the
# os.getenv fallback. A key committed to source control is compromised and
# must be rotated; the key is now read exclusively from the environment.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
if not GEMINI_API_KEY:
    # Warn at startup instead of crashing so non-Gemini endpoints still work.
    print("WARNING: GEMINI_API_KEY is not set; Gemini report generation will fail")

MODEL_PATH = "plagiarism_sbert_model"  # folder containing the fine-tuned SBERT model
EMBEDDINGS_FILE = "reference_embeddings.pkl"  # pickled reference embeddings + metadata
# ===================================================
# ====================== LOAD MODEL (FIXED) ======================
# Fail fast at import time if the model folder is missing, rather than
# erroring on the first request.
if not os.path.exists(MODEL_PATH):
    raise RuntimeError("❌ Model folder not found")

model = SentenceTransformer(MODEL_PATH)
print("βœ… SBERT model loaded")
# ====================== LOAD REFERENCE DATASET ======================
if not os.path.exists(EMBEDDINGS_FILE):
    raise RuntimeError("❌ Reference embeddings file not found")

# NOTE(review): pickle.load executes arbitrary code when unpickling hostile
# data — this file must come from a trusted source.
with open(EMBEDDINGS_FILE, "rb") as fh:
    payload = pickle.load(fh)

ref_embeddings = payload["embeddings"]  # reference vectors (tensor/array)
df_ref = payload["df_ref"]  # per-reference metadata (title, student_name, year)
print("βœ… Reference dataset loaded")
# ================================================================
# ====================== GEMINI ======================
# Configure the google-generativeai client with the key loaded above and
# create the model instance used to draft the plagiarism report.
genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel('gemini-2.5-flash')
print("βœ… System ready")
# ===================================================
# ====================== REQUEST MODEL ======================
class PlagiarismRequest(BaseModel):
    """Payload for POST /check_plagiarism: the document text plus metadata."""
    text: str
    title: str = "Submitted Document"
    student_name: str = "Unknown Student"
    year: str = "2026"
# ============================================================
# ====================== API ======================
@app.post("/check_plagiarism")
async def check_plagiarism(req: PlagiarismRequest) -> Dict:
    """Score the submitted text against the reference corpus.

    Encodes the text with SBERT, finds the closest reference by cosine
    similarity (reported as a percentage), and for suspicious matches asks
    Gemini to draft a supervisor report.

    Raises:
        HTTPException(400): text shorter than 200 characters.
        HTTPException(500): SBERT encoding/similarity failure.
    """
    text = req.text.strip()
    if len(text) < 200:
        raise HTTPException(400, "Text too short (minimum 200 characters)")
    if len(text) > MAX_TEXT_LENGTH:
        # Hard cap keeps encoding time and memory bounded.
        text = text[:MAX_TEXT_LENGTH]

    # ================= SBERT ENCODING =================
    try:
        query_embedding = model.encode(
            text,
            convert_to_tensor=True,
            normalize_embeddings=True
        )
        # Cosine similarity of normalized vectors lies in [-1, 1].
        cosine_scores = util.cos_sim(query_embedding, ref_embeddings)[0]
        similarities = (cosine_scores * 100).cpu().numpy()
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Embedding error: {str(e)}")

    # ================= TOP MATCH =================
    top_idx = int(similarities.argmax())
    # FIX: clamp at 0 so an anti-correlated best match cannot yield a
    # negative "similarity percent" in the response.
    top_similarity = max(0.0, float(similarities[top_idx]))
    rating = convert_to_rating(top_similarity)
    stars = "β˜…" * rating + "β˜†" * (5 - rating)

    # ================= LOW RISK =================
    if top_similarity <= SIMILARITY_THRESHOLD:
        return {
            "status": "low_risk",
            "similarity_percent": round(top_similarity, 2),
            "rating": rating,
            "stars": stars,
            "message": "No significant plagiarism detected."
        }

    # ================= SOURCE =================
    row = df_ref.iloc[top_idx]
    source_title = str(row.get("title", "Reference Project"))[:150]
    source_student = str(row.get("student_name", "Original Student"))
    source_year = str(row.get("year", "2023"))

    # FIX: the original "LOW" branch here was unreachable — this code only
    # runs when top_similarity > SIMILARITY_THRESHOLD (30), so only
    # MEDIUM/HIGH can apply.
    category = "MEDIUM" if top_similarity <= 70 else "HIGH"
    emoji = "βš–οΈ" if category == "MEDIUM" else "❌"

    # ================= GEMINI PROMPT =================
    prompt = f"""
You are a strict academic plagiarism supervisor at Jimma University.
{emoji} {category} SIMILARITY CASE
Source Title: {source_title}
Student Name: {source_student}
Year: {source_year}
Suspicious Title: {req.title}
Student Name: {req.student_name}
Year: {req.year}
Similarity Score: {top_similarity:.1f}%
1. Conceptual Similarity:
2. Conceptual Differences:
3. Technology Differences:
4. Supervisor Recommendation:
"""
    try:
        prompt = prompt[:MAX_PROMPT_LENGTH]
        response = gemini_model.generate_content(prompt)
        report = response.text.strip()
    except Exception as e:
        # Best-effort: surface the Gemini failure in the report field rather
        # than failing the whole request after similarity was computed.
        report = f"Gemini error: {str(e)}"

    return {
        "status": "suspicious",
        "similarity_percent": round(top_similarity, 2),
        "rating": rating,
        "stars": stars,
        "most_similar_source": source_title,
        "source_student": source_student,
        "gemini_report": report,
        # NOTE(review): naive local timestamp — consider timezone-aware UTC,
        # but changing it alters the response format for existing clients.
        "timestamp": datetime.now().isoformat()
    }