Spaces:
Sleeping
Sleeping
File size: 4,452 Bytes
3be0a64 db33294 3be0a64 db33294 3be0a64 b5017bd 3be0a64 db33294 3be0a64 db33294 3be0a64 db33294 3be0a64 db33294 3be0a64 db33294 3be0a64 db33294 3be0a64 b5017bd 3be0a64 fdf5004 3be0a64 db33294 3be0a64 db33294 3be0a64 db33294 3be0a64 db33294 3be0a64 f2d04f3 3be0a64 f2d04f3 3be0a64 f2d04f3 3be0a64 f2d04f3 3be0a64 f2d04f3 3be0a64 f2d04f3 3be0a64 f2d04f3 3be0a64 f2d04f3 3be0a64 f2d04f3 3be0a64 f2d04f3 3be0a64 f2d04f3 3be0a64 f2d04f3 3be0a64 f2d04f3 3be0a64 f2d04f3 3be0a64 f2d04f3 3be0a64 f2d04f3 3be0a64 f2d04f3 3be0a64 f2d04f3 3be0a64 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 | from fastapi import FastAPI, HTTPException, File, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai
import pdfplumber
import pickle
import io
import os
from datetime import datetime
# ================= APP =================
# ASGI application object; the route decorators further down register on it.
app = FastAPI(title="Jimma University Plagiarism API")

# CORS is left fully open: every origin, HTTP method and request header is
# accepted by the middleware.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_origins=["*"],
)
# ================= CONFIG =================
# Scores (in percent) strictly below this value are reported as low risk.
SIMILARITY_THRESHOLD = 30.0

# Name or path handed to SentenceTransformer when the model is loaded.
MODEL_PATH = "plagiarism_model"

# Pickle file that provides the "embeddings" and "df_ref" entries.
EMBEDDINGS_FILE = "reference_embeddings.pkl"

# Gemini key taken from the environment; a placeholder string is used when the
# variable is not set.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "YOUR_KEY_HERE")
# ================= LOAD SBERT MODEL =================
# The model is loaded once at import time; the request handlers below all use
# this single module-level instance.
model = SentenceTransformer(MODEL_PATH)
# The check-mark emoji had been mis-decoded, which split this string literal
# across two lines and left it unterminated; restored to one literal.
print("✅ Model loaded:", MODEL_PATH)
# ================= LOAD REFERENCE DATA =================
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file,
# so EMBEDDINGS_FILE must only ever be a trusted, locally produced artifact.
with open(EMBEDDINGS_FILE, "rb") as f:
    data = pickle.load(f)

# "embeddings" is what query embeddings are compared against with util.cos_sim;
# "df_ref" is indexed positionally (.iloc) to look up the best-matching row.
ref_embeddings = data["embeddings"]
df_ref = data["df_ref"]
# The check-mark emoji had been mis-decoded, which split this string literal
# across two lines and left it unterminated; restored to one literal.
print("✅ Reference dataset loaded")
# ================= GEMINI =================
# Register the API key with the SDK, then build the model handle that
# check_plagiarism uses to generate its report.
genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel(model_name="gemini-2.5-flash")
# ================= REQUEST MODEL =================
# JSON body accepted by POST /check_plagiarism.
class PlagiarismRequest(BaseModel):
    # Document text to score against the reference set (required).
    text: str

    # Optional metadata; check_plagiarism only interpolates these into the
    # report prompt.
    title: str = "Unknown"
    student_name: str = "Unknown"
    year: str = "2026"
# ================= HEALTH CHECK =================
@app.get("/")
def home():
    """Report that the service is up.

    Returns:
        A one-key JSON object carrying a fixed status message.
    """
    # The rocket emoji had been mis-decoded to "π" (its UTF-8 bytes read as a
    # Greek single-byte code page); restored to the intended character.
    return {"message": "Plagiarism API Running 🚀"}
# ================= TEXT CHECK API =================
def _build_report_prompt(req, top_score, source_title):
    """Return the Gemini prompt describing the best match found for *req*."""
    return (
        "\n"
        "You are an academic plagiarism expert.\n"
        f"Title: {req.title}\n"
        f"Student: {req.student_name}\n"
        f"Year: {req.year}\n"
        f"Similarity: {top_score:.2f}%\n"
        f"Source: {source_title}\n"
        "Give:\n"
        "1. Similarity explanation\n"
        "2. Risk level\n"
        "3. Recommendation\n"
    )


@app.post("/check_plagiarism")
async def check_plagiarism(req: PlagiarismRequest):
    """Score submitted text against the reference set and report the best match.

    The text is embedded with the SBERT model, compared to every reference
    embedding by cosine similarity, and the highest score (as a percentage)
    decides the outcome.

    Args:
        req: Request body; ``text`` is scored, the remaining fields are only
            interpolated into the Gemini prompt.

    Returns:
        A ``"low_risk"`` payload when the top score is below
        ``SIMILARITY_THRESHOLD``; otherwise a ``"suspicious"`` payload that
        additionally carries a star rating, the source student, the Gemini
        report (or an error string if Gemini failed) and a timestamp.

    Raises:
        HTTPException: 400 when the stripped text has fewer than 100
            characters.
    """
    import asyncio  # local import keeps this block self-contained

    text = req.text.strip()
    if len(text) < 100:
        raise HTTPException(400, "Text too short")
    # Only the first 8000 characters are embedded; longer input is cut
    # without notifying the caller.
    text = text[:8000]

    # --- SBERT similarity ---
    query_embedding = model.encode(
        text,
        convert_to_tensor=True,
        normalize_embeddings=True,
    )
    # Cosine similarity against every reference embedding, scaled to percent.
    scores = util.cos_sim(query_embedding, ref_embeddings)[0]
    scores = (scores * 100).cpu().numpy()
    top_idx = int(scores.argmax())
    top_score = float(scores[top_idx])
    row = df_ref.iloc[top_idx]
    source_title = str(row.get("title", "N/A"))
    similarity_percent = round(top_score, 2)

    # --- Low risk: answer immediately, no Gemini call ---
    if top_score < SIMILARITY_THRESHOLD:
        return {
            "status": "low_risk",
            "similarity_percent": similarity_percent,
            "rating": 1,
            "most_similar_source": source_title,
            "message": "No significant plagiarism detected",
        }

    # --- Gemini report ---
    prompt = _build_report_prompt(req, top_score, source_title)
    try:
        # generate_content is a blocking network call; running it in a worker
        # thread keeps the event loop free to serve other requests meanwhile.
        response = await asyncio.to_thread(gemini_model.generate_content, prompt)
        report = response.text
    except Exception as e:
        # Deliberate best effort: the similarity result is still returned and
        # the failure is surfaced in place of the report.
        report = f"Gemini error: {str(e)}"

    # --- Response ---
    # The star characters had been mis-decoded, which split these literals
    # across several lines; restored to four-of-five and three-of-five stars.
    high_similarity = top_score > 70
    return {
        "status": "suspicious",
        "similarity_percent": similarity_percent,
        "rating": 4 if high_similarity else 3,
        "stars": "★★★★☆" if high_similarity else "★★★☆☆",
        "most_similar_source": source_title,
        "source_student": str(row.get("student_name", "N/A")),
        "gemini_report": report,
        "timestamp": datetime.now().isoformat(),
    }
# ================= PDF UPLOAD API (OPTIONAL) =================
@app.post("/check_pdf")
async def check_pdf(file: UploadFile = File(...)):
    """Extract the text of an uploaded PDF and score it against the reference set.

    Args:
        file: Uploaded PDF document.

    Returns:
        ``{"error": "PDF too short"}`` when fewer than 100 characters of text
        were extracted; otherwise the best similarity percentage and the title
        of the best-matching reference row.

    Raises:
        HTTPException: 400 when the upload cannot be opened or parsed as a PDF.
    """
    content = await file.read()

    try:
        with pdfplumber.open(io.BytesIO(content)) as pdf:
            # Extract each page exactly once; the result is reused for both
            # the emptiness check and the concatenation below.
            page_texts = [page.extract_text() for page in pdf.pages]
    except Exception as exc:
        # A malformed upload is a client error, not a server fault.
        raise HTTPException(400, "Invalid or unreadable PDF") from exc

    # Pages without extractable text are skipped; each kept page is followed
    # by a newline.
    text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)

    if len(text) < 100:
        # Kept as a normal (HTTP 200) payload so existing clients that look
        # for the "error" key continue to work.
        return {"error": "PDF too short"}

    query_embedding = model.encode(
        text,
        convert_to_tensor=True,
        normalize_embeddings=True,
    )
    # Cosine similarity against every reference embedding, scaled to percent.
    scores = util.cos_sim(query_embedding, ref_embeddings)[0]
    scores = (scores * 100).cpu().numpy()
    top_idx = int(scores.argmax())
    top_score = float(scores[top_idx])
    row = df_ref.iloc[top_idx]

    return {
        "status": "done",
        "similarity_percent": round(top_score, 2),
        "best_match": str(row.get("title", "N/A")),
    }