Spaces:

httpsAkayush
/

docker_api

Sleeping

App Files Files Community

docker_api / app.py

abdinkoo

Update app.py

3be0a64 verified 17 days ago

raw

history blame

4.45 kB

	import asyncio
	import io
	import os
	import pickle
	from datetime import datetime

	import google.generativeai as genai
	import pdfplumber
	from fastapi import FastAPI, HTTPException, File, UploadFile
	from fastapi.middleware.cors import CORSMiddleware
	from pydantic import BaseModel
	from sentence_transformers import SentenceTransformer, util

	# ================= APP =================
	# Application object; the route decorators further down register on it.
	app = FastAPI(title="Jimma University Plagiarism API")

	# CORS options: every origin, HTTP method and request header is allowed.
	_CORS_SETTINGS = {
	    "allow_origins": ["*"],
	    "allow_methods": ["*"],
	    "allow_headers": ["*"],
	}
	app.add_middleware(CORSMiddleware, **_CORS_SETTINGS)

	# ================= CONFIG =================
	# Model location handed to SentenceTransformer at startup.
	MODEL_PATH = "plagiarism_model"
	# Pickle file holding the reference embeddings and their metadata table.
	EMBEDDINGS_FILE = "reference_embeddings.pkl"
	# Gemini key is read from the environment; the placeholder is used when it is unset.
	GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "YOUR_KEY_HERE")

	# Best-match scores (cosine similarity x 100) below this value are reported as low risk.
	SIMILARITY_THRESHOLD = 30.0

	# ================= LOAD SBERT MODEL =================
	# Loaded once at import time; both endpoints encode text with this instance.
	model = SentenceTransformer(MODEL_PATH)
	print("✅ Model loaded:", MODEL_PATH)

	# ================= LOAD REFERENCE DATA =================
	# NOTE(review): pickle.load can execute arbitrary code while deserialising, so
	# EMBEDDINGS_FILE must only ever be a file produced by a trusted pipeline.
	with open(EMBEDDINGS_FILE, "rb") as f:
	    data = pickle.load(f)

	# The payload is indexed by key: "embeddings" is compared against query
	# embeddings, "df_ref" is looked up positionally for the matching metadata row.
	ref_embeddings, df_ref = data["embeddings"], data["df_ref"]

	print("✅ Reference dataset loaded")

	# ================= GEMINI =================
	# Client used to write the narrative report for suspicious submissions.
	genai.configure(api_key=GEMINI_API_KEY)
	gemini_model = genai.GenerativeModel("gemini-2.5-flash")

	# ================= REQUEST MODEL =================
	# JSON body accepted by POST /check_plagiarism.
	class PlagiarismRequest(BaseModel):
	    # Submission text to score; the only field without a default.
	    text: str

	    # Optional metadata; these values are interpolated into the Gemini prompt.
	    title: str = "Unknown"
	    student_name: str = "Unknown"
	    year: str = "2026"

	# ================= HEALTH CHECK =================
	@app.get("/")
	def home():
	    # Liveness probe: answers with a fixed message and touches no model state.
	    status_payload = {"message": "Plagiarism API Running 🚀"}
	    return status_payload

	# ================= TEXT CHECK API =================
	def _best_reference_match(text):
	    """Return ``(score, row)`` for the reference entry most similar to *text*.

	    ``score`` is the highest cosine similarity multiplied by 100, as a float, and
	    ``row`` is the ``df_ref`` row at the same position as that best score.
	    This function is synchronous and compute-heavy, so async callers should run
	    it in a worker thread instead of calling it on the event loop.
	    """
	    query_embedding = model.encode(
	        text,
	        convert_to_tensor=True,
	        normalize_embeddings=True,
	    )

	    # Only one query is scored, so the first row holds every reference score.
	    scores = util.cos_sim(query_embedding, ref_embeddings)[0]
	    scores = (scores * 100).cpu().numpy()

	    top_idx = int(scores.argmax())
	    return float(scores[top_idx]), df_ref.iloc[top_idx]


	@app.post("/check_plagiarism")
	async def check_plagiarism(req: PlagiarismRequest):
	    """Score submitted text against the reference set and report the closest match.

	    Args:
	        req: Request body. ``text`` is stripped and cut to its first 8000
	            characters; the other fields are only used in the Gemini prompt.

	    Returns:
	        A ``low_risk`` dict when the best score is below ``SIMILARITY_THRESHOLD``,
	        otherwise a ``suspicious`` dict that also carries a Gemini-written report
	        (or the Gemini error text when that call fails).

	    Raises:
	        HTTPException: 400 when the stripped text has fewer than 100 characters.
	    """
	    text = req.text.strip()

	    if len(text) < 100:
	        raise HTTPException(400, "Text too short")

	    # Slicing is a no-op for shorter strings, so no separate length check is needed.
	    text = text[:8000]

	    # ================= SBERT =================
	    # Encoding is blocking CPU work; a worker thread keeps the event loop free
	    # to serve other requests while it runs.
	    top_score, row = await asyncio.to_thread(_best_reference_match, text)
	    most_similar_source = str(row.get("title", "N/A"))

	    # ================= LOW RISK =================
	    if top_score < SIMILARITY_THRESHOLD:
	        return {
	            "status": "low_risk",
	            "similarity_percent": round(top_score, 2),
	            "rating": 1,
	            "most_similar_source": most_similar_source,
	            "message": "No significant plagiarism detected",
	        }

	    # ================= GEMINI REPORT =================
	    prompt = f"""
	You are an academic plagiarism expert.

	Title: {req.title}
	Student: {req.student_name}
	Year: {req.year}

	Similarity: {top_score:.2f}%

	Source: {row.get("title", "N/A")}

	Give:
	1. Similarity explanation
	2. Risk level
	3. Recommendation
	"""

	    # Best effort: a Gemini failure must not fail the whole check, so the error
	    # text is returned in place of the report. generate_content is a blocking
	    # network call, hence the worker thread.
	    try:
	        response = await asyncio.to_thread(gemini_model.generate_content, prompt)
	        report = response.text
	    except Exception as e:
	        report = f"Gemini error: {str(e)}"

	    # ================= RESPONSE =================
	    is_high = top_score > 70
	    return {
	        "status": "suspicious",
	        "similarity_percent": round(top_score, 2),
	        "rating": 4 if is_high else 3,
	        "stars": "★★★★☆" if is_high else "★★★☆☆",
	        "most_similar_source": most_similar_source,
	        "source_student": str(row.get("student_name", "N/A")),
	        "gemini_report": report,
	        "timestamp": datetime.now().isoformat(),
	    }

	# ================= PDF UPLOAD API (OPTIONAL) =================
	def _extract_pdf_text(content):
	    """Return the extractable text of every page in *content* (raw PDF bytes).

	    Pages that yield no text are skipped; each kept page is followed by a newline.
	    Any error raised while opening or parsing the PDF propagates to the caller.
	    """
	    with pdfplumber.open(io.BytesIO(content)) as pdf:
	        # extract_text() is called once per page and its result reused.
	        page_texts = [page.extract_text() for page in pdf.pages]

	    return "".join(f"{page_text}\n" for page_text in page_texts if page_text)


	def _best_match_for_pdf_text(text):
	    """Return ``(score, row)`` for the reference entry most similar to *text*.

	    ``score`` is the highest cosine similarity multiplied by 100, as a float, and
	    ``row`` is the ``df_ref`` row at the same position as that best score.
	    """
	    query_embedding = model.encode(
	        text,
	        convert_to_tensor=True,
	        normalize_embeddings=True,
	    )

	    # Only one query is scored, so the first row holds every reference score.
	    scores = util.cos_sim(query_embedding, ref_embeddings)[0]
	    scores = (scores * 100).cpu().numpy()

	    top_idx = int(scores.argmax())
	    return float(scores[top_idx]), df_ref.iloc[top_idx]


	@app.post("/check_pdf")
	async def check_pdf(file: UploadFile = File(...)):
	    """Extract the text of an uploaded PDF and report its closest reference match.

	    Args:
	        file: The uploaded PDF; it is read fully into memory before parsing.

	    Returns:
	        ``{"error": "PDF too short"}`` when fewer than 100 characters were
	        extracted, otherwise a ``done`` dict with the best similarity score and
	        the title of the matching reference.

	    Raises:
	        HTTPException: 400 when the upload cannot be opened or parsed as a PDF.
	    """
	    content = await file.read()

	    # PDF parsing is blocking work, so it runs in a worker thread. A file that
	    # cannot be parsed is a client error, not a server fault.
	    try:
	        text = await asyncio.to_thread(_extract_pdf_text, content)
	    except Exception as exc:
	        raise HTTPException(400, "Invalid or unreadable PDF file") from exc

	    # Kept as a normal response body (not an HTTPException) so existing
	    # clients that look for the "error" key keep working.
	    if len(text) < 100:
	        return {"error": "PDF too short"}

	    top_score, row = await asyncio.to_thread(_best_match_for_pdf_text, text)

	    return {
	        "status": "done",
	        "similarity_percent": round(top_score, 2),
	        "best_match": str(row.get("title", "N/A")),
	    }