from fastapi import APIRouter, UploadFile, File, HTTPException, Depends from typing import Optional from datetime import datetime from fastapi.security import OAuth2PasswordBearer from jose import JWTError, jwt from motor.motor_asyncio import AsyncIOMotorClient import os from app.config import MONGODB_URI,ALGORITHM, SECRET_KEY from app.schemas.teacher_schemas import ( LexicalMatch ) from app.utils.file_utils import extract_text_from_file, allowed_file from app.utils.lexical_utils import ( get_meaningful_sentences, extract_keywords, find_exact_matches, find_partial_phrase_match, ) from app.utils.web_utils import fetch_sources, fetch_sources_multi_query router = APIRouter(prefix="/student", tags=["student-lexical"]) LEXICAL_DOC_THRESHOLD = 0.85 # 85% oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/token") def verify_token(token: str = Depends(oauth2_scheme)): try: return jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM]) except JWTError: raise HTTPException(status_code=401, detail="Invalid or expired token") async def get_mongo_client(): return AsyncIOMotorClient(MONGODB_URI) @router.post("/lexical-analysis") async def student_lexical_analysis( file: UploadFile = File(...), current_user=Depends(verify_token), ): if not file: raise HTTPException(status_code=400, detail="No file uploaded") t0 = datetime.utcnow() total_matches = 0 print(f"šŸ” Starting student lexical analysis for uploaded file...") # Process single file if not allowed_file(file.filename): raise HTTPException(status_code=400, detail=f"Invalid file type: {file.filename}") raw = await file.read() text = extract_text_from_file(raw, file.filename) or "" sentences = get_meaningful_sentences(text) print(f"\nšŸ“„ Processing file: {file.filename}") print(f" āž¤ Extracted {len(sentences)} sentences") print(f" āž¤ Approx word count: {len(text.split())}") # Build search query from keywords sources = fetch_sources_multi_query(text, num_results=10) print(f" āž¤ Found {len(sources)} online sources from diverse queries") if not sources: raise HTTPException(status_code=404, detail=f"No sources found online for {file.filename}") matches = [] highest = 0.0 source_matches_count = {} externals = [ { "title": s.get("url", "Unknown"), "text": s.get("content", ""), "source_url": s.get("url", ""), "type": "web", } for s in sources if s.get("content") ] for ext in externals: print(f" 🌐 Source: {ext['source_url'][:60]}...") source_matches_count[ext['source_url']] = 0 # Compare each sentence against ALL sources for s in sentences: best_overall_score = 0.0 best_overall_match = None best_overall_src = None for ext in externals: # Try exact match first sim = find_exact_matches(s, ext["text"]) if sim is not None and sim > best_overall_score: best_overall_score = sim best_overall_match = s best_overall_src = ext continue # Try partial phrase match pp = find_partial_phrase_match(s, ext["text"]) if pp: phrase, score = pp if score > best_overall_score: best_overall_score = score best_overall_match = phrase best_overall_src = ext # Add match if found and above threshold (50%) if best_overall_match and best_overall_score > 0.0: pct = round(best_overall_score * 100.0, 1) if pct >= 50: matches.append({ "matched_text": best_overall_match, "similarity": pct, "source_type": best_overall_src["type"], "source_title": best_overall_src["title"], "source_url": best_overall_src["source_url"], "context": "Potential plagiarism detected", }) source_matches_count[best_overall_src['source_url']] += 1 highest = max(highest, pct) total_matches += 1 print(f" āœ… Match ({pct}%) with {best_overall_src['source_url'][:50]}") # Better flagging logic considering multiple sources num_sources_with_matches = sum(1 for c in source_matches_count.values() if c > 0) avg_match_score = (sum(m["similarity"] for m in matches) / len(matches)) if matches else 0.0 # Flag if any of these conditions are met: # 1. Single source with high similarity (>85%) # 2. Content plagiarized from 2+ different sources # 3. 3+ matches with average >70% flagged = ( highest >= 85 or num_sources_with_matches >= 2 or (len(matches) >= 3 and avg_match_score >= 70) ) print(f" āž¤ Highest similarity: {highest:.1f}%") print(f" āž¤ Total matches: {len(matches)}") print(f" āž¤ Sources with matches: {num_sources_with_matches}") print(f" āž¤ Average match score: {avg_match_score:.1f}%") print(f" āž¤ Flagged: {flagged}") elapsed = (datetime.utcnow() - t0).total_seconds() mm = int(elapsed // 60) ss = int(elapsed % 60) processing_time = f"{mm}m {ss:02d}s" print("\nāœ… Analysis completed!") print(f" āž¤ Flagged: {flagged}") print(f" āž¤ Highest Similarity: {highest}%") print(f" āž¤ Average Similarity: {avg_match_score:.1f}%") print(f" āž¤ Processing Time: {processing_time}") # Extract unique sources all_sources = list(set(m["source_url"] for m in matches)) # Build response result = { "id": None, # Will be set after MongoDB insert "name": file.filename, "content": text, "matches": matches, "similarity": round(highest, 1), "flagged": flagged, "wordCount": len(text.split()), "processingTime": processing_time, "totalMatches": total_matches, "averageSimilarity": round(avg_match_score, 1), "sources": all_sources, "uploadDate": datetime.utcnow().isoformat(), } # Save to MongoDB try: mongo_client = await get_mongo_client() db = mongo_client.sluethink reports_collection = db.reports # Prepare document for MongoDB report_doc = { "name": file.filename, "analysisType": "lexical", "submittedBy": current_user.get("username", "System"), "uploadDate": datetime.utcnow().strftime("%Y-%m-%d"), "similarity": highest, "status": "completed", "flagged": flagged, "fileCount": 1, "processingTime": processing_time, "avgSimilarity": avg_match_score, "sources": all_sources, "createdAt": datetime.utcnow(), "userId": current_user.get("sub") or current_user.get("user_id"), "content": text, "wordCount": len(text.split()), "matches": matches, "totalMatches": total_matches, } # Insert into MongoDB insert_result = await reports_collection.insert_one(report_doc) print(f"\nšŸ’¾ Report saved to MongoDB with ID: {insert_result.inserted_id}") # Update the result with the MongoDB ID result["id"] = str(insert_result.inserted_id) mongo_client.close() except Exception as e: print(f"\nāŒ Error saving to MongoDB: {str(e)}") # Don't fail the request if MongoDB save fails result["id"] = "temp_id" print(f"\n🧾 Returning report:\n" f" Flagged: {flagged}\n" f" Avg Similarity: {avg_match_score:.1f}%\n" f" Highest Similarity: {highest}%\n" f" Total Matches: {total_matches}") return result