|
|
from fastapi import APIRouter, UploadFile, File, HTTPException, Depends |
|
|
from typing import Optional |
|
|
from datetime import datetime |
|
|
from fastapi.security import OAuth2PasswordBearer |
|
|
from jose import JWTError, jwt |
|
|
from motor.motor_asyncio import AsyncIOMotorClient |
|
|
import os |
|
|
|
|
|
from app.config import MONGODB_URI,ALGORITHM, SECRET_KEY |
|
|
from app.schemas.teacher_schemas import ( |
|
|
LexicalMatch |
|
|
) |
|
|
from app.utils.file_utils import extract_text_from_file, allowed_file |
|
|
from app.utils.lexical_utils import ( |
|
|
get_meaningful_sentences, extract_keywords, |
|
|
find_exact_matches, find_partial_phrase_match, |
|
|
) |
|
|
from app.utils.web_utils import fetch_sources, fetch_sources_multi_query |
|
|
|
|
|
# Router for the student lexical-analysis endpoints; every route registered
# on it is served under the "/student" prefix and tagged "student-lexical".
router = APIRouter(
    prefix="/student",
    tags=["student-lexical"],
)

# Lexical similarity threshold expressed as a fraction.
# NOTE(review): presumably the document-level cut-off (0.85 == 85%) - confirm.
LEXICAL_DOC_THRESHOLD = 0.85

# Extracts the bearer token from the Authorization header; "/token" is the
# URL advertised to clients for obtaining a token.
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/token")
|
|
|
|
|
|
|
|
|
|
|
def verify_token(token: str = Depends(oauth2_scheme)):
    """Decode and validate the bearer JWT supplied with the request.

    Args:
        token: Raw JWT string extracted by ``oauth2_scheme``.

    Returns:
        The decoded claims as returned by ``jwt.decode``.

    Raises:
        HTTPException: 401 when decoding raises ``JWTError``.
    """
    try:
        return jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
    except JWTError as exc:
        # A 401 response should carry a WWW-Authenticate challenge so that
        # clients know which auth scheme to retry with; chaining keeps the
        # underlying JWT failure visible in server-side tracebacks.
        raise HTTPException(
            status_code=401,
            detail="Invalid or expired token",
            headers={"WWW-Authenticate": "Bearer"},
        ) from exc
|
|
|
|
|
async def get_mongo_client():
    """Build a new ``AsyncIOMotorClient`` for ``MONGODB_URI`` and return it.

    A fresh client object is constructed on every call, so the caller owns
    the returned client and is responsible for closing it.
    """
    client = AsyncIOMotorClient(MONGODB_URI)
    return client
|
|
|
|
|
# Minimum per-sentence similarity (in percent) for a match to be reported.
_MIN_MATCH_PERCENT = 50


def _best_match_for_sentence(sentence, externals):
    """Find the strongest match for one sentence across all external sources.

    Args:
        sentence: Sentence taken from the uploaded document.
        externals: List of source dicts; only the ``"text"`` key is read here.

    Returns:
        A ``(score, matched_text, source)`` tuple. ``matched_text`` is the
        sentence itself for an exact match or the phrase returned by
        ``find_partial_phrase_match`` for a partial one. When nothing
        matched, ``(0.0, None, None)`` is returned.
    """
    best_score = 0.0
    best_text = None
    best_source = None

    for ext in externals:
        # NOTE(review): presumably a 0..1 similarity (the caller multiplies
        # it by 100 to obtain a percentage) - confirm against lexical_utils.
        exact_score = find_exact_matches(sentence, ext["text"])
        if exact_score is not None and exact_score > best_score:
            best_score, best_text, best_source = exact_score, sentence, ext
            # An exact hit that improves the running best skips the partial
            # phrase search for this source.
            continue

        partial = find_partial_phrase_match(sentence, ext["text"])
        if partial:
            phrase, partial_score = partial
            if partial_score > best_score:
                best_score, best_text, best_source = partial_score, phrase, ext

    return best_score, best_text, best_source


async def _save_report(report_doc):
    """Insert the report into ``sluethink.reports`` on a best-effort basis.

    Args:
        report_doc: Document to store.

    Returns:
        The inserted id as a string, or ``None`` when saving failed.
    """
    mongo_client = None
    try:
        mongo_client = await get_mongo_client()
        insert_result = await mongo_client.sluethink.reports.insert_one(report_doc)
    except Exception as exc:
        # Persistence is deliberately best-effort: the analysis result is
        # still returned to the caller when the database is unavailable.
        print(f"\nError saving to MongoDB: {str(exc)}")
        return None
    finally:
        # Close the per-request client on the failure path as well, so a
        # failed insert does not leak the connection.
        if mongo_client is not None:
            mongo_client.close()

    print(f"\nReport saved to MongoDB with ID: {insert_result.inserted_id}")
    return str(insert_result.inserted_id)


@router.post("/lexical-analysis")
async def student_lexical_analysis(
    file: UploadFile = File(...),
    current_user=Depends(verify_token),
):
    """Run a lexical plagiarism check on one uploaded file and store a report.

    The file text is extracted and split into sentences, online sources are
    fetched for the text, and every sentence is compared against every source
    that has content. Matches at or above ``_MIN_MATCH_PERCENT`` are reported.

    Args:
        file: The uploaded document.
        current_user: Decoded token claims produced by ``verify_token``.

    Returns:
        A dict with the keys ``id``, ``name``, ``content``, ``matches``,
        ``similarity``, ``flagged``, ``wordCount``, ``processingTime``,
        ``totalMatches``, ``averageSimilarity``, ``sources`` and
        ``uploadDate``. ``id`` is ``"temp_id"`` when the report could not be
        saved.

    Raises:
        HTTPException: 400 when no file/filename was supplied or the file
            type is not allowed; 404 when no online sources were found.
    """
    # Function-scope import so this block does not depend on a change to the
    # module's import section.
    from fastapi.concurrency import run_in_threadpool

    if not file or not file.filename:
        raise HTTPException(status_code=400, detail="No file uploaded")

    # NOTE: datetime.utcnow() is deprecated since Python 3.12; it is kept so
    # that the emitted/stored timestamps stay naive UTC exactly as before.
    started_at = datetime.utcnow()

    print("Starting student lexical analysis for uploaded file...")

    if not allowed_file(file.filename):
        raise HTTPException(status_code=400, detail=f"Invalid file type: {file.filename}")

    raw = await file.read()
    text = extract_text_from_file(raw, file.filename) or ""
    sentences = get_meaningful_sentences(text)
    word_count = len(text.split())

    print(f"\nProcessing file: {file.filename}")
    print(f"   - Extracted {len(sentences)} sentences")
    print(f"   - Approx word count: {word_count}")

    # The fetch helper is a plain synchronous call (presumably blocking web
    # requests); running it in the thread pool keeps the event loop free.
    # ``or []`` guards against a helper that returns None.
    sources = await run_in_threadpool(fetch_sources_multi_query, text, num_results=10) or []
    print(f"   - Found {len(sources)} online sources from diverse queries")

    if not sources:
        raise HTTPException(status_code=404, detail=f"No sources found online for {file.filename}")

    # Only sources that actually returned content can be compared.
    externals = [
        {
            "title": src.get("url", "Unknown"),
            "text": src.get("content", ""),
            "source_url": src.get("url", ""),
            "type": "web",
        }
        for src in sources
        if src.get("content")
    ]

    source_matches_count = {}
    for ext in externals:
        print(f"   Source: {ext['source_url'][:60]}...")
        source_matches_count[ext["source_url"]] = 0

    matches = []
    highest = 0.0

    for sentence in sentences:
        score, matched_text, source = _best_match_for_sentence(sentence, externals)
        if not matched_text or score <= 0.0:
            continue

        pct = round(score * 100.0, 1)
        if pct < _MIN_MATCH_PERCENT:
            continue

        matches.append({
            "matched_text": matched_text,
            "similarity": pct,
            "source_type": source["type"],
            "source_title": source["title"],
            "source_url": source["source_url"],
            "context": "Potential plagiarism detected",
        })
        source_matches_count[source["source_url"]] += 1
        highest = max(highest, pct)
        print(f"   Match ({pct}%) with {source['source_url'][:50]}")

    total_matches = len(matches)
    num_sources_with_matches = sum(1 for count in source_matches_count.values() if count > 0)
    avg_match_score = (sum(m["similarity"] for m in matches) / total_matches) if matches else 0.0

    # Flag when any single match reaches the document threshold, when two or
    # more sources contributed matches, or when there are several matches
    # with a high average. ``highest`` is a percentage, the threshold a
    # fraction, hence the division.
    flagged = (
        highest / 100.0 >= LEXICAL_DOC_THRESHOLD
        or num_sources_with_matches >= 2
        or (total_matches >= 3 and avg_match_score >= 70)
    )

    print(f"   - Highest similarity: {highest:.1f}%")
    print(f"   - Total matches: {total_matches}")
    print(f"   - Sources with matches: {num_sources_with_matches}")
    print(f"   - Average match score: {avg_match_score:.1f}%")
    print(f"   - Flagged: {flagged}")

    # One completion timestamp feeds the elapsed time, the response and the
    # stored report, so the three can never disagree.
    finished_at = datetime.utcnow()
    elapsed = (finished_at - started_at).total_seconds()
    minutes, seconds = divmod(elapsed, 60)
    processing_time = f"{int(minutes)}m {int(seconds):02d}s"

    print("\nAnalysis completed!")
    print(f"   - Flagged: {flagged}")
    print(f"   - Highest Similarity: {highest}%")
    print(f"   - Average Similarity: {avg_match_score:.1f}%")
    print(f"   - Processing Time: {processing_time}")

    # De-duplicate while keeping first-seen order (a set would make the
    # order of the returned list non-deterministic).
    all_sources = list(dict.fromkeys(m["source_url"] for m in matches))

    result = {
        "id": None,
        "name": file.filename,
        "content": text,
        "matches": matches,
        "similarity": round(highest, 1),
        "flagged": flagged,
        "wordCount": word_count,
        "processingTime": processing_time,
        "totalMatches": total_matches,
        "averageSimilarity": round(avg_match_score, 1),
        "sources": all_sources,
        "uploadDate": finished_at.isoformat(),
    }

    report_doc = {
        "name": file.filename,
        "analysisType": "lexical",
        "submittedBy": current_user.get("username", "System"),
        "uploadDate": finished_at.strftime("%Y-%m-%d"),
        "similarity": highest,
        "status": "completed",
        "flagged": flagged,
        "fileCount": 1,
        "processingTime": processing_time,
        "avgSimilarity": avg_match_score,
        "sources": all_sources,
        "createdAt": finished_at,
        "userId": current_user.get("sub") or current_user.get("user_id"),
        "content": text,
        "wordCount": word_count,
        "matches": matches,
        "totalMatches": total_matches,
    }

    report_id = await _save_report(report_doc)
    # Fall back to a placeholder id when the report could not be stored.
    result["id"] = report_id if report_id is not None else "temp_id"

    print(
        f"\nReturning report:\n"
        f"   Flagged: {flagged}\n"
        f"   Avg Similarity: {avg_match_score:.1f}%\n"
        f"   Highest Similarity: {highest}%\n"
        f"   Total Matches: {total_matches}"
    )

    return result