docker_api / app.py
abdinkoo's picture
Update app.py
3be0a64 verified
raw
history blame
4.45 kB
from fastapi import FastAPI, HTTPException, File, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai
import pdfplumber
import pickle
import io
import os
from datetime import datetime
# ================= APP =================
# ASGI application object; every route below registers itself on it.
app = FastAPI(title="Jimma University Plagiarism API")

# Fully permissive CORS: any origin may call the API with any HTTP method
# and any request headers.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_origins=["*"],
)
# ================= CONFIG =================
# Name or path handed to SentenceTransformer when the SBERT model is loaded.
MODEL_PATH = "plagiarism_model"

# Pickle file holding the precomputed reference embeddings and their metadata.
EMBEDDINGS_FILE = "reference_embeddings.pkl"

# Gemini key comes from the environment; the placeholder is used when the
# variable is not set.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "YOUR_KEY_HERE")

# Best-match similarity (in percent) below which a submission is low risk.
SIMILARITY_THRESHOLD = 30.0
# ================= LOAD SBERT MODEL =================
# Load the SBERT model once at import time so every request reuses it.
model = SentenceTransformer(MODEL_PATH)
# The check-mark emoji was mis-encoded (UTF-8 bytes read with a legacy code
# page); it is restored here.
print("✅ Model loaded:", MODEL_PATH)
# ================= LOAD REFERENCE DATA =================
# SECURITY NOTE: pickle.load() can execute arbitrary code embedded in the
# file. Only ship an EMBEDDINGS_FILE that was produced by a trusted pipeline;
# never point this at user-supplied data.
with open(EMBEDDINGS_FILE, "rb") as f:
    data = pickle.load(f)

# Embeddings that each query embedding is compared against with util.cos_sim.
ref_embeddings = data["embeddings"]
# Metadata rows looked up by position via .iloc (presumably a pandas
# DataFrame aligned row-for-row with ref_embeddings; confirm against the
# script that builds the pickle).
df_ref = data["df_ref"]
# The check-mark emoji was mis-encoded; it is restored here.
print("✅ Reference dataset loaded")
# ================= GEMINI =================
# Register the API key with the SDK, then build the single shared model
# handle that the plagiarism endpoint uses to generate its report.
genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel(model_name="gemini-2.5-flash")
# ================= REQUEST MODEL =================
class PlagiarismRequest(BaseModel):
    """JSON body accepted by the /check_plagiarism endpoint."""

    # Submission text to analyse; the only required field.
    text: str

    # Optional metadata; inserted into the Gemini prompt when a report is
    # generated.
    title: str = "Unknown"
    student_name: str = "Unknown"
    year: str = "2026"
# ================= HEALTH CHECK =================
@app.get("/")
def home():
    """Health check: return a fixed message showing the API is running."""
    # The rocket emoji was mis-encoded (UTF-8 bytes read with a legacy code
    # page); it is restored here.
    return {"message": "Plagiarism API Running 🚀"}
# ================= TEXT CHECK API =================
@app.post("/check_plagiarism")
async def check_plagiarism(req: PlagiarismRequest):
    """Score submitted text against the reference embeddings.

    The text is stripped, rejected when shorter than 100 characters and
    truncated to 8000 characters. It is then embedded with the SBERT model
    and compared with every reference embedding by cosine similarity, which
    is scaled to a percentage.

    Args:
        req: Request body carrying the text plus optional title, student
            name and year.

    Returns:
        A ``"low_risk"`` payload when the best score is below
        ``SIMILARITY_THRESHOLD``; otherwise a ``"suspicious"`` payload that
        also carries a Gemini-generated report (or the Gemini error text if
        the call failed).

    Raises:
        HTTPException: 400 when the stripped text is shorter than 100
            characters.
    """
    # Local import keeps this block self-contained; fastapi is already a
    # dependency of this file.
    from fastapi.concurrency import run_in_threadpool

    text = req.text.strip()
    if len(text) < 100:
        raise HTTPException(400, "Text too short")
    # Slicing is a no-op on shorter strings, so no separate length check.
    text = text[:8000]

    # ================= SBERT =================
    def _best_match():
        """Return (row index, similarity percent) of the closest reference."""
        query_embedding = model.encode(
            text,
            convert_to_tensor=True,
            normalize_embeddings=True,
        )
        scores = util.cos_sim(query_embedding, ref_embeddings)[0]
        scores = (scores * 100).cpu().numpy()
        best = int(scores.argmax())
        return best, float(scores[best])

    # Encoding is CPU-bound and synchronous. Running it directly inside this
    # coroutine would stall the event loop for every other request, so it is
    # pushed to the worker thread pool instead.
    top_idx, top_score = await run_in_threadpool(_best_match)
    row = df_ref.iloc[top_idx]
    similarity = round(top_score, 2)
    source_title = str(row.get("title", "N/A"))

    # ================= LOW RISK =================
    if top_score < SIMILARITY_THRESHOLD:
        return {
            "status": "low_risk",
            "similarity_percent": similarity,
            "rating": 1,
            "most_similar_source": source_title,
            "message": "No significant plagiarism detected",
        }

    # ================= GEMINI REPORT =================
    # The prompt body is kept flush-left so the text sent to Gemini carries
    # no leading indentation.
    prompt = f"""
You are an academic plagiarism expert.
Title: {req.title}
Student: {req.student_name}
Year: {req.year}
Similarity: {top_score:.2f}%
Source: {source_title}
Give:
1. Similarity explanation
2. Risk level
3. Recommendation
"""
    try:
        # generate_content() is a blocking network call; keep it off the
        # event loop for the same reason as the encoding step.
        response = await run_in_threadpool(gemini_model.generate_content, prompt)
        report = response.text
    except Exception as e:
        # Deliberate best-effort: a Gemini failure must not fail the whole
        # check, so the error text is returned in place of the report.
        report = f"Gemini error: {str(e)}"

    # ================= RESPONSE =================
    high_similarity = top_score > 70
    return {
        "status": "suspicious",
        "similarity_percent": similarity,
        "rating": 4 if high_similarity else 3,
        # The star glyphs were mis-encoded in the source; restored here.
        "stars": "★★★★☆" if high_similarity else "★★★☆☆",
        "most_similar_source": source_title,
        "source_student": str(row.get("student_name", "N/A")),
        "gemini_report": report,
        "timestamp": datetime.now().isoformat(),
    }
# ================= PDF UPLOAD API (OPTIONAL) =================
@app.post("/check_pdf")
async def check_pdf(file: UploadFile = File(...)):
    """Extract the text of an uploaded PDF and report its closest reference.

    Args:
        file: The uploaded PDF document.

    Returns:
        ``{"error": "PDF too short"}`` when fewer than 100 characters of text
        were extracted (kept as a normal response so existing clients are not
        broken); otherwise a payload with the best similarity percentage and
        the title of the best-matching reference.

    Raises:
        HTTPException: 400 when the upload cannot be parsed as a PDF.
    """
    # Local import keeps this block self-contained; fastapi is already a
    # dependency of this file.
    from fastapi.concurrency import run_in_threadpool

    content = await file.read()

    def _extract_text():
        """Join the text of every page, adding a newline after each page."""
        with pdfplumber.open(io.BytesIO(content)) as pdf:
            # Call extract_text() once per page; the original called it
            # twice, repeating the extraction work for every page.
            page_texts = [page.extract_text() for page in pdf.pages]
        return "".join(f"{page_text}\n" for page_text in page_texts if page_text)

    try:
        # PDF parsing is synchronous, so it runs in the thread pool rather
        # than on the event loop.
        text = await run_in_threadpool(_extract_text)
    except Exception as exc:
        # The parser's exception types differ between pdfplumber/pdfminer
        # versions, so catch broadly at this request boundary and report a
        # client error instead of an unhandled 500.
        raise HTTPException(400, "Invalid or unreadable PDF") from exc

    if len(text) < 100:
        return {"error": "PDF too short"}

    def _best_match():
        """Return (row index, similarity percent) of the closest reference."""
        query_embedding = model.encode(
            text,
            convert_to_tensor=True,
            normalize_embeddings=True,
        )
        scores = util.cos_sim(query_embedding, ref_embeddings)[0]
        scores = (scores * 100).cpu().numpy()
        best = int(scores.argmax())
        return best, float(scores[best])

    # Encoding is CPU-bound; keep it off the event loop as well.
    top_idx, top_score = await run_in_threadpool(_best_match)
    row = df_ref.iloc[top_idx]
    return {
        "status": "done",
        "similarity_percent": round(top_score, 2),
        "best_match": str(row.get("title", "N/A")),
    }