topGdev committed
Commit a561338 · 1 Parent(s): 76cca3d

add ai similarity

Files changed (47)
  1. .DS_Store +0 -0
  2. app/.DS_Store +0 -0
  3. app/__pycache__/config.cpython-312.pyc +0 -0
  4. app/__pycache__/logger.cpython-312.pyc +0 -0
  5. app/__pycache__/main.cpython-312.pyc +0 -0
  6. app/config.py +46 -0
  7. app/dependencies/__pycache__/auth.cpython-312.pyc +0 -0
  8. app/dependencies/auth.py +41 -0
  9. app/logger.py +13 -0
  10. app/main.py +26 -0
  11. app/routers/.DS_Store +0 -0
  12. app/routers/__pycache__/health.cpython-312.pyc +0 -0
  13. app/routers/__pycache__/plagiarism.cpython-312.pyc +0 -0
  14. app/routers/student/__pycache__/lexical_analysis.cpython-312.pyc +0 -0
  15. app/routers/student/__pycache__/plagiarism.cpython-312.pyc +0 -0
  16. app/routers/student/lexical_analysis.py +225 -0
  17. app/routers/teacher/__pycache__/code_analysis.cpython-312.pyc +0 -0
  18. app/routers/teacher/__pycache__/internal_analysis.cpython-312.pyc +0 -0
  19. app/routers/teacher/__pycache__/lexical_analysis.cpython-312.pyc +0 -0
  20. app/routers/teacher/__pycache__/semantic_analysis.cpython-312.pyc +0 -0
  21. app/routers/teacher/internal_analysis.py +323 -0
  22. app/routers/teacher/lexical_analysis.py +472 -0
  23. app/routers/teacher/semantic_analysis.py +406 -0
  24. app/schemas/__pycache__/plagiarism_schemas.cpython-312.pyc +0 -0
  25. app/schemas/__pycache__/report_schemas.cpython-312.pyc +0 -0
  26. app/schemas/__pycache__/schemas.cpython-312.pyc +0 -0
  27. app/schemas/__pycache__/sources_schemas.cpython-312.pyc +0 -0
  28. app/schemas/__pycache__/teacher_schemas.cpython-312.pyc +0 -0
  29. app/schemas/plagiarism_schemas.py +22 -0
  30. app/schemas/report_schemas.py +21 -0
  31. app/schemas/sources_schemas.py +9 -0
  32. app/schemas/teacher_schemas.py +185 -0
  33. app/utils/__pycache__/ai_detector.cpython-312.pyc +0 -0
  34. app/utils/__pycache__/code_comparism.cpython-312.pyc +0 -0
  35. app/utils/__pycache__/code_detection.cpython-312.pyc +0 -0
  36. app/utils/__pycache__/code_plagiarism_integration.cpython-312.pyc +0 -0
  37. app/utils/__pycache__/code_utils.cpython-312.pyc +0 -0
  38. app/utils/__pycache__/file_utils.cpython-312.pyc +0 -0
  39. app/utils/__pycache__/lexical_utils.cpython-312.pyc +0 -0
  40. app/utils/__pycache__/semantic_utils.cpython-312.pyc +0 -0
  41. app/utils/__pycache__/text_utils.cpython-312.pyc +0 -0
  42. app/utils/__pycache__/web_utils.cpython-312.pyc +0 -0
  43. app/utils/ai_detector.py +80 -0
  44. app/utils/file_utils.py +36 -0
  45. app/utils/lexical_utils.py +293 -0
  46. app/utils/semantic_utils.py +342 -0
  47. app/utils/web_utils.py +545 -0
.DS_Store ADDED
Binary file (6.15 kB).

app/.DS_Store ADDED
Binary file (6.15 kB).

app/__pycache__/config.cpython-312.pyc ADDED
Binary file (1.99 kB).

app/__pycache__/logger.cpython-312.pyc ADDED
Binary file (587 Bytes).

app/__pycache__/main.cpython-312.pyc ADDED
Binary file (1.29 kB).
app/config.py ADDED
@@ -0,0 +1,46 @@
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ # ───── API Keys & URLs ─────
+ MONGODB_URI = os.getenv("MONGODB_URI", "mongodb://localhost:27017/sluethink")  # "mongodb+srv" URIs cannot carry a port, so the local default uses the plain scheme
+
+ # ───── Similarity thresholds ─────
+ MIN_WORDS_PER_SENTENCE = 4
+ MIN_SENTENCE_LENGTH = 30
+ SEQUENCE_THRESHOLD = 0.75
+ TFIDF_THRESHOLD = 0.80
+ SUB_PHRASE_TFIDF_MIN = 0.50
+ EXACT_MATCH_SCORE = 1.0
+
+ # ───── File support ─────
+ ALLOWED_EXTENSIONS = {"txt", "pdf", "docx"}
+ JWT_SECRET_KEY = os.getenv("JWT_SECRET_KEY", "your_nextauth_secret")
+ JWT_ALGORITHM = os.getenv("JWT_ALGORITHM", "HS256")
+ HF_TOKEN = os.getenv("HF_TOKEN", "")
+
+ API_KEYS = os.getenv("API_KEYS", "").split(",") if os.getenv("API_KEYS") else []
+ SEARCH_ENGINE_IDS = os.getenv("SEARCH_ENGINE_IDS", "").split(",") if os.getenv("SEARCH_ENGINE_IDS") else []
+
+ SECRET_KEY = os.getenv("SECRET_KEY", "change-me")
+ ALGORITHM = os.getenv("ALGORITHM", "HS256")
+
+ MIN_FUNCTION_LINES = 5
+ MIN_CODE_BLOCK_LINES = 3
+ STRUCTURAL_SIMILARITY_THRESHOLD = 0.75
+ TOKEN_SIMILARITY_THRESHOLD = 0.70
+ EXACT_MATCH_THRESHOLD = 0.90
+
+ MAX_QUERIES_PER_SUBMISSION = 3
+ RESULTS_PER_QUERY = 10
+ MAX_SOURCES_TO_ANALYZE = 8
+
+ REQUEST_TIMEOUT = 6
+ MAX_SCRAPE_WORKERS = 4
+ POLITENESS_DELAY = 0.2
+
+ ALLOWED_CODE_EXTENSIONS = {'.py', '.java', '.cpp', '.c', '.js', '.jsx', '.ts', '.tsx', '.cs', '.rb', '.go', '.php'}
+ MAX_FILE_SIZE_MB = 5
+
+ LOG_LEVEL = "INFO"
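
Every value above is read from the environment at import time (load_dotenv picks up a .env file), so the hard-coded defaults are only fallbacks. A minimal sketch of overriding the config before first import — the values shown are illustrative placeholders, not real credentials:

# Sketch: override config through the environment before app.config is imported.
import os

os.environ["MONGODB_URI"] = "mongodb://localhost:27017/sluethink"  # placeholder
os.environ["SECRET_KEY"] = "dev-only-secret"                       # placeholder

from app import config  # reads the environment at import time

assert config.SECRET_KEY == "dev-only-secret"
print(config.ALLOWED_EXTENSIONS)  # {'txt', 'pdf', 'docx'}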
app/dependencies/__pycache__/auth.cpython-312.pyc ADDED
Binary file (787 Bytes).

app/dependencies/auth.py ADDED
@@ -0,0 +1,41 @@
+ # app/dependencies/auth.py
+
+ from jose import JWTError, jwt
+ from fastapi import Depends, HTTPException
+ from fastapi.security import OAuth2PasswordBearer
+ from motor.motor_asyncio import AsyncIOMotorClient
+ from bson import ObjectId
+
+ from app.config import MONGODB_URI, JWT_SECRET_KEY, JWT_ALGORITHM
+ # from app.schemas.report_schemas import User
+
+ oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/auth/login")
+
+ async def get_mongo_client():
+     return AsyncIOMotorClient(MONGODB_URI)
+
+ # async def get_current_user(
+ #     token: str = Depends(oauth2_scheme),
+ #     mongo_client: AsyncIOMotorClient = Depends(get_mongo_client),
+ # ):
+ #     # 1) Decode the JWT
+ #     try:
+ #         payload = jwt.decode(token, JWT_SECRET_KEY, algorithms=[JWT_ALGORITHM])
+ #         user_id: str = payload.get("id")
+ #         if not user_id:
+ #             raise HTTPException(status_code=401, detail="Invalid token payload")
+ #     except JWTError:
+ #         raise HTTPException(status_code=401, detail="Could not validate credentials")
+
+ #     # 2) Retrieve the user document from MongoDB
+ #     db = mongo_client.get_default_database()
+ #     users_col = db["users"]
+ #     user_doc = await users_col.find_one({"_id": ObjectId(user_id)})
+ #     if not user_doc:
+ #         raise HTTPException(status_code=404, detail="User not found")
+
+
+ #     user_doc["id"] = str(user_doc["_id"])
+
+
+ #     return User(**user_doc)
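
The routers added below guard their endpoints with a verify_token dependency that decodes a bearer JWT signed with SECRET_KEY. For local testing, a compatible token can be minted with the same python-jose calls; a minimal sketch (the claim values are illustrative — the routers read "sub"/"user_id" for the saved report's userId and "username" for submittedBy):

# Sketch: mint a JWT that the routers' verify_token dependency will accept.
# Claims are illustrative; SECRET_KEY and ALGORITHM must match app.config.
from datetime import datetime, timedelta
from jose import jwt

from app.config import SECRET_KEY, ALGORITHM

claims = {
    "sub": "user_123",
    "username": "alice",
    "exp": datetime.utcnow() + timedelta(hours=1),
}
token = jwt.encode(claims, SECRET_KEY, algorithm=ALGORITHM)
print(token)  # send as: Authorization: Bearer <token>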
app/logger.py ADDED
@@ -0,0 +1,13 @@
+ # logger.py
+ import logging
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
+     handlers=[
+         logging.FileHandler("scraper.log"),
+         logging.StreamHandler()
+     ]
+ )
+
+ logger = logging.getLogger("scraper")
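
Importing this module once wires both a file handler (scraper.log) and a stream handler onto the root logger; later logging.basicConfig calls (as in app/main.py below) become no-ops. A minimal usage sketch:

# Sketch: reuse the preconfigured "scraper" logger from any module.
from app.logger import logger

logger.info("fetch started")       # written to scraper.log and stderr
logger.warning("retrying source")  # same handlers, no extra setup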
app/main.py ADDED
@@ -0,0 +1,26 @@
+ import logging
+ from fastapi import FastAPI
+ from app.routers.student.lexical_analysis import router as student_lexical_router
+ from app.routers.teacher.semantic_analysis import router as teacher_semantic_router
+ from app.routers.teacher.lexical_analysis import router as teacher_lexical_router
+ from app.routers.teacher.internal_analysis import router as teacher_internal_router
+
+ from fastapi.middleware.cors import CORSMiddleware
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ app = FastAPI()
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["http://localhost:3000", "https://plagiarism-detection-frontend.vercel.app", "*"],  # caution: mixing "*" with explicit origins while allow_credentials=True is a risky combination; the explicit origins alone are safer
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ app.include_router(student_lexical_router)
+ app.include_router(teacher_internal_router)
+ app.include_router(teacher_lexical_router)
+ app.include_router(teacher_semantic_router)
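
A standard ASGI server serves the app. A minimal local-development sketch (host, port, and reload flag are arbitrary choices, not part of the commit):

# Sketch: run the FastAPI app locally with uvicorn.
import uvicorn

if __name__ == "__main__":
    uvicorn.run("app.main:app", host="127.0.0.1", port=8000, reload=True)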
app/routers/.DS_Store ADDED
Binary file (6.15 kB).

app/routers/__pycache__/health.cpython-312.pyc ADDED
Binary file (151 Bytes).

app/routers/__pycache__/plagiarism.cpython-312.pyc ADDED
Binary file (8.67 kB).

app/routers/student/__pycache__/lexical_analysis.cpython-312.pyc ADDED
Binary file (9.74 kB).

app/routers/student/__pycache__/plagiarism.cpython-312.pyc ADDED
Binary file (8.68 kB).
app/routers/student/lexical_analysis.py ADDED
@@ -0,0 +1,225 @@
+ from fastapi import APIRouter, UploadFile, File, HTTPException, Depends
+ from typing import Optional
+ from datetime import datetime
+ from fastapi.security import OAuth2PasswordBearer
+ from jose import JWTError, jwt
+ from motor.motor_asyncio import AsyncIOMotorClient
+ import os
+
+ from app.config import MONGODB_URI, ALGORITHM, SECRET_KEY
+ from app.schemas.teacher_schemas import (
+     LexicalMatch
+ )
+ from app.utils.file_utils import extract_text_from_file, allowed_file
+ from app.utils.lexical_utils import (
+     get_meaningful_sentences, extract_keywords,
+     find_exact_matches, find_partial_phrase_match,
+ )
+ from app.utils.web_utils import fetch_sources, fetch_sources_multi_query
+
+ router = APIRouter(prefix="/student", tags=["student-lexical"])
+
+ LEXICAL_DOC_THRESHOLD = 0.85  # 85%
+ oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/token")
+
+
+
+ def verify_token(token: str = Depends(oauth2_scheme)):
+     try:
+         return jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
+     except JWTError:
+         raise HTTPException(status_code=401, detail="Invalid or expired token")
+
+ async def get_mongo_client():
+     return AsyncIOMotorClient(MONGODB_URI)
+
+ @router.post("/lexical-analysis")
+ async def student_lexical_analysis(
+     file: UploadFile = File(...),
+     current_user=Depends(verify_token),
+ ):
+     if not file:
+         raise HTTPException(status_code=400, detail="No file uploaded")
+
+     t0 = datetime.utcnow()
+     total_matches = 0
+
+     print("🔍 Starting student lexical analysis for uploaded file...")
+
+     # Process single file
+     if not allowed_file(file.filename):
+         raise HTTPException(status_code=400, detail=f"Invalid file type: {file.filename}")
+
+     raw = await file.read()
+     text = extract_text_from_file(raw, file.filename) or ""
+     sentences = get_meaningful_sentences(text)
+
+     print(f"\n📄 Processing file: {file.filename}")
+     print(f"   ➤ Extracted {len(sentences)} sentences")
+     print(f"   ➤ Approx word count: {len(text.split())}")
+
+     # Build search query from keywords
+     sources = fetch_sources_multi_query(text, num_results=10)
+     print(f"   ➤ Found {len(sources)} online sources from diverse queries")
+
+     if not sources:
+         raise HTTPException(status_code=404, detail=f"No sources found online for {file.filename}")
+
+     matches = []
+     highest = 0.0
+     source_matches_count = {}
+
+     externals = [
+         {
+             "title": s.get("url", "Unknown"),
+             "text": s.get("content", ""),
+             "source_url": s.get("url", ""),
+             "type": "web",
+         }
+         for s in sources if s.get("content")
+     ]
+
+     for ext in externals:
+         print(f"   🌐 Source: {ext['source_url'][:60]}...")
+         source_matches_count[ext['source_url']] = 0
+
+     # Compare each sentence against ALL sources
+     for s in sentences:
+         best_overall_score = 0.0
+         best_overall_match = None
+         best_overall_src = None
+
+         for ext in externals:
+             # Try exact match first
+             sim = find_exact_matches(s, ext["text"])
+             if sim is not None and sim > best_overall_score:
+                 best_overall_score = sim
+                 best_overall_match = s
+                 best_overall_src = ext
+                 continue
+
+             # Try partial phrase match
+             pp = find_partial_phrase_match(s, ext["text"])
+             if pp:
+                 phrase, score = pp
+                 if score > best_overall_score:
+                     best_overall_score = score
+                     best_overall_match = phrase
+                     best_overall_src = ext
+
+         # Add match if found and above threshold (50%)
+         if best_overall_match and best_overall_score > 0.0:
+             pct = round(best_overall_score * 100.0, 1)
+
+             if pct >= 50:
+                 matches.append({
+                     "matched_text": best_overall_match,
+                     "similarity": pct,
+                     "source_type": best_overall_src["type"],
+                     "source_title": best_overall_src["title"],
+                     "source_url": best_overall_src["source_url"],
+                     "context": "Potential plagiarism detected",
+                 })
+                 source_matches_count[best_overall_src['source_url']] += 1
+                 highest = max(highest, pct)
+                 total_matches += 1
+                 print(f"      ✅ Match ({pct}%) with {best_overall_src['source_url'][:50]}")
+
+     # Better flagging logic considering multiple sources
+     num_sources_with_matches = sum(1 for c in source_matches_count.values() if c > 0)
+     avg_match_score = (sum(m["similarity"] for m in matches) / len(matches)) if matches else 0.0
+
+     # Flag if any of these conditions are met:
+     # 1. Single source with high similarity (>85%)
+     # 2. Content plagiarized from 2+ different sources
+     # 3. 3+ matches with average >70%
+     flagged = (
+         highest >= 85 or
+         num_sources_with_matches >= 2 or
+         (len(matches) >= 3 and avg_match_score >= 70)
+     )
+
+     print(f"   ➤ Highest similarity: {highest:.1f}%")
+     print(f"   ➤ Total matches: {len(matches)}")
+     print(f"   ➤ Sources with matches: {num_sources_with_matches}")
+     print(f"   ➤ Average match score: {avg_match_score:.1f}%")
+     print(f"   ➤ Flagged: {flagged}")
+
+     elapsed = (datetime.utcnow() - t0).total_seconds()
+     mm = int(elapsed // 60)
+     ss = int(elapsed % 60)
+     processing_time = f"{mm}m {ss:02d}s"
+
+     print("\n✅ Analysis completed!")
+     print(f"   ➤ Flagged: {flagged}")
+     print(f"   ➤ Highest Similarity: {highest}%")
+     print(f"   ➤ Average Similarity: {avg_match_score:.1f}%")
+     print(f"   ➤ Processing Time: {processing_time}")
+
+     # Extract unique sources
+     all_sources = list(set(m["source_url"] for m in matches))
+
+     # Build response
+     result = {
+         "id": None,  # Will be set after MongoDB insert
+         "name": file.filename,
+         "content": text,
+         "matches": matches,
+         "similarity": round(highest, 1),
+         "flagged": flagged,
+         "wordCount": len(text.split()),
+         "processingTime": processing_time,
+         "totalMatches": total_matches,
+         "averageSimilarity": round(avg_match_score, 1),
+         "sources": all_sources,
+         "uploadDate": datetime.utcnow().isoformat(),
+     }
+
+     # Save to MongoDB
+     try:
+         mongo_client = await get_mongo_client()
+         db = mongo_client.sluethink
+         reports_collection = db.reports
+
+         # Prepare document for MongoDB
+         report_doc = {
+             "name": file.filename,
+             "analysisType": "lexical",
+             "submittedBy": current_user.get("username", "System"),
+             "uploadDate": datetime.utcnow().strftime("%Y-%m-%d"),
+             "similarity": highest,
+             "status": "completed",
+             "flagged": flagged,
+             "fileCount": 1,
+             "processingTime": processing_time,
+             "avgSimilarity": avg_match_score,
+             "sources": all_sources,
+             "createdAt": datetime.utcnow(),
+             "userId": current_user.get("sub") or current_user.get("user_id"),
+             "content": text,
+             "wordCount": len(text.split()),
+             "matches": matches,
+             "totalMatches": total_matches,
+         }
+
+         # Insert into MongoDB
+         insert_result = await reports_collection.insert_one(report_doc)
+         print(f"\n💾 Report saved to MongoDB with ID: {insert_result.inserted_id}")
+
+         # Update the result with the MongoDB ID
+         result["id"] = str(insert_result.inserted_id)
+
+         mongo_client.close()
+
+     except Exception as e:
+         print(f"\n❌ Error saving to MongoDB: {str(e)}")
+         # Don't fail the request if MongoDB save fails
+         result["id"] = "temp_id"
+
+     print(f"\n🧾 Returning report:\n"
+           f"   Flagged: {flagged}\n"
+           f"   Avg Similarity: {avg_match_score:.1f}%\n"
+           f"   Highest Similarity: {highest}%\n"
+           f"   Total Matches: {total_matches}")
+
+     return result
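
The endpoint takes one multipart file plus a bearer token. A minimal client sketch (the base URL, file name, and token variable are assumptions for illustration):

# Sketch: call the student lexical-analysis endpoint with requests.
import requests

BASE_URL = "http://127.0.0.1:8000"  # assumed local deployment
token = "..."                        # a JWT accepted by verify_token

with open("essay.docx", "rb") as fh:
    resp = requests.post(
        f"{BASE_URL}/student/lexical-analysis",
        headers={"Authorization": f"Bearer {token}"},
        files={"file": ("essay.docx", fh)},
    )
resp.raise_for_status()
report = resp.json()
print(report["similarity"], report["flagged"], report["totalMatches"])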
app/routers/teacher/__pycache__/code_analysis.cpython-312.pyc ADDED
Binary file (16.5 kB).

app/routers/teacher/__pycache__/internal_analysis.cpython-312.pyc ADDED
Binary file (14 kB).

app/routers/teacher/__pycache__/lexical_analysis.cpython-312.pyc ADDED
Binary file (24.5 kB).

app/routers/teacher/__pycache__/semantic_analysis.cpython-312.pyc ADDED
Binary file (21.9 kB).

app/routers/teacher/internal_analysis.py ADDED
@@ -0,0 +1,323 @@
+ from fastapi import APIRouter, UploadFile, File, HTTPException, Depends
+ from typing import List, Tuple, Set
+ from datetime import datetime
+ from fastapi.security import OAuth2PasswordBearer
+ from jose import JWTError, jwt
+ from motor.motor_asyncio import AsyncIOMotorClient
+
+ from app.schemas.teacher_schemas import (
+     DocumentInfo, OverlapDetail, ComparisonDetail,
+     InternalReportDetail, InternalReportSummary
+ )
+ from app.utils.file_utils import extract_text_from_file, allowed_file
+ from app.utils.lexical_utils import (
+     find_partial_phrase_match_for_internal,
+     get_meaningful_sentences,
+     find_exact_matches,
+     find_partial_phrase_match,
+ )
+ from app.config import MONGODB_URI, ALGORITHM, SECRET_KEY
+
+ router = APIRouter(prefix="/teacher", tags=["teacher-internal"])
+
+ LEXICAL_PAIR_THRESHOLD = 0.50  # 50% - pairs above this are flagged
+ OVERLAP_MIN_TOKENS = 12
+
+ # Thresholds for color coding:
+ HIGH_SIMILARITY_THRESHOLD = 0.85  # 85% - Red (very high)
+ MEDIUM_SIMILARITY_THRESHOLD = 0.70  # 70% - Yellow (medium)
+ LOW_SIMILARITY_THRESHOLD = 0.50
+ oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/token")
+
+
+ def verify_token(token: str = Depends(oauth2_scheme)):
+     try:
+         return jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
+     except JWTError:
+         raise HTTPException(status_code=401, detail="Invalid or expired token")
+
+
+ async def get_mongo_client():
+     return AsyncIOMotorClient(MONGODB_URI)
+
+
+ def _percent(x: float) -> float:
+     return round(float(x) * 100.0, 1)
+
+
+ def _ordered_pair_key(i: int, j: int) -> str:
+     a, b = (i, j) if i < j else (j, i)
+     return f"{a}-{b}"
+
+
+ def _aggregate_pair_score(overlaps: List[OverlapDetail]) -> float:
+     return max((o.similarity for o in overlaps), default=0.0)
+
+
+ def _create_overlap_key(name_a: str, name_b: str, text: str, similarity: float, context: str) -> str:
+     """Create a unique key for overlap deduplication - includes context to distinguish different match types"""
+     # Normalize text to handle whitespace variations
+     text_normalized = ' '.join(text.split())
+     return f"{name_a}|{name_b}|{text_normalized}|{similarity}|{context}"
+
+
+ def _extract_matched_text_from_sentence(sent_b: str, phrase: str) -> str:
+     """Extract the actual text from sent_b that matches the phrase"""
+     if not sent_b or not phrase:
+         return phrase
+
+     # Normalize both for comparison
+     phrase_normalized = ' '.join(phrase.split()).lower()
+     sent_normalized = ' '.join(sent_b.split()).lower()
+     sent_b_normalized = ' '.join(sent_b.split())  # Keep original casing
+
+     # If phrase exists in sentence, extract it as-is from the original
+     if phrase_normalized in sent_normalized:
+         start_idx = sent_normalized.find(phrase_normalized)
+         end_idx = start_idx + len(phrase_normalized)
+         return sent_b_normalized[start_idx:end_idx].strip()
+
+     # If not found exactly, try to find similar chunks
+     # Split into words and try to find the best match
+     phrase_words = phrase_normalized.split()
+     sent_words = sent_normalized.split()
+
+     # Look for the phrase words in the sentence
+     for i in range(len(sent_words) - len(phrase_words) + 1):
+         if sent_words[i:i+len(phrase_words)] == phrase_words:
+             return ' '.join(sent_b_normalized.split()[i:i+len(phrase_words)])
+
+     # Fallback: return the phrase as-is
+     return phrase
+
+
+ def _find_overlaps_for_pair(
+     name_a: str, sents_a: List[str],
+     name_b: str, sents_b: List[str],
+     seen_overlaps: Set[str]
+ ) -> List[OverlapDetail]:
+     """Find all overlaps between two documents' sentences"""
+     overlaps: List[OverlapDetail] = []
+
+     for sent_a in sents_a:
+         # Check exact matches
+         for sent_b in sents_b:
+             exact_score = find_exact_matches(sent_a, sent_b)
+             if exact_score is not None:
+                 sim_pct = _percent(exact_score)
+                 if sim_pct >= LEXICAL_PAIR_THRESHOLD * 100:
+                     context = "Exact/near-exact sentence overlap"
+                     overlap_key = _create_overlap_key(name_a, name_b, sent_a, sim_pct, context)
+                     if overlap_key not in seen_overlaps:
+                         seen_overlaps.add(overlap_key)
+                         overlaps.append(OverlapDetail(
+                             fromDoc=name_a,
+                             toDoc=name_b,
+                             text=sent_a,
+                             similarity=sim_pct,
+                             sectionA=sent_a,
+                             sectionB=sent_b,
+                             context=context,
+                         ))
+
+         # Check partial phrase matches
+         best_partial = None
+         best_score = 0.0
+         best_sent_b = None
+
+         for sent_b in sents_b:
+             partial_result = find_partial_phrase_match_for_internal(sent_a, sent_b)
+             if partial_result:
+                 phrase, score = partial_result
+                 print(f"DEBUG: Partial match - phrase: {phrase[:80]}, score: {score}")
+                 if score > best_score:
+                     best_score = score
+                     best_partial = phrase
+                     best_sent_b = sent_b
+
+         # Add the best partial match if it meets the threshold
+         if best_partial and best_sent_b and len(best_partial.split()) >= OVERLAP_MIN_TOKENS:
+             sim_pct = _percent(best_score)
+             if sim_pct >= LEXICAL_PAIR_THRESHOLD * 100:
+                 context = "High-overlap phrase (shingle/containment)"
+                 overlap_key = _create_overlap_key(name_a, name_b, best_partial, sim_pct, context)
+                 if overlap_key not in seen_overlaps:
+                     seen_overlaps.add(overlap_key)
+                     overlaps.append(OverlapDetail(
+                         fromDoc=name_a,
+                         toDoc=name_b,
+                         text=best_partial,
+                         similarity=sim_pct,
+                         sectionA=sent_a,
+                         sectionB=best_sent_b,
+                         context=context,
+                     ))
+
+     return overlaps
+
+ @router.post("/internal-analysis", response_model=InternalReportDetail)
+ async def internal_analysis(
+     files: List[UploadFile] = File(...),
+     token_payload: dict = Depends(verify_token),
+     mongo: AsyncIOMotorClient = Depends(get_mongo_client),
+ ):
+     if len(files) < 2:
+         raise HTTPException(status_code=400, detail="Upload at least 2 files")
+
+     t0 = datetime.utcnow()
+
+     # --- Load & sentence-split all docs ---
+     docs: List[Tuple[str, List[str]]] = []
+     doc_infos: List[DocumentInfo] = []
+     doc_texts = {}
+
+     for idx, f in enumerate(files, start=1):
+         if not allowed_file(f.filename):
+             raise HTTPException(status_code=400, detail=f"Invalid file type: {f.filename}")
+         raw = await f.read()
+         text = extract_text_from_file(raw, f.filename) or ""
+         sents = get_meaningful_sentences(text)
+         doc_infos.append(DocumentInfo(id=idx, name=f.filename, author=None))
+         docs.append((f.filename, sents))
+         doc_texts[f.filename] = text
+
+     # --- Pairwise comparisons ---
+     comparisons: List[ComparisonDetail] = []
+     seen_overlaps: Set[str] = set()
+
+     for i in range(len(docs)):
+         for j in range(i + 1, len(docs)):
+             name_a, sents_a = docs[i]
+             name_b, sents_b = docs[j]
+
+             # Find all overlaps for this pair
+             overlaps = _find_overlaps_for_pair(
+                 name_a, sents_a,
+                 name_b, sents_b,
+                 seen_overlaps
+             )
+
+             # Calculate pair score and flag if needed
+             pair_score = _aggregate_pair_score(overlaps)
+             flagged = pair_score >= LEXICAL_PAIR_THRESHOLD * 100
+
+             comp = ComparisonDetail(
+                 id=_ordered_pair_key(i + 1, j + 1),
+                 docA=name_a,
+                 docB=name_b,
+                 similarity=round(pair_score, 1),
+                 flagged=flagged,
+                 overlaps=overlaps,
+                 contentA=doc_texts[name_a],
+                 contentB=doc_texts[name_b],
+             )
+             if flagged:
+                 comparisons.append(comp)
+
+     # --- Compute per-document results ---
+     doc_results = []
+     total_matches = 0
+     flagged_count = 0
+
+     for d_idx, d in enumerate(doc_infos, start=1):
+         name = d.name
+         word_count = len(doc_texts[name].split())
+         matches = [o for c in comparisons for o in c.overlaps if o.fromDoc == name or o.toDoc == name]
+         highest_similarity = max((o.similarity for o in matches), default=0.0)
+         flagged = highest_similarity >= LEXICAL_PAIR_THRESHOLD * 100
+         if flagged:
+             flagged_count += 1
+         total_matches += len(matches)
+
+         doc_results.append({
+             "id": d.id,
+             "name": d.name,
+             "similarity": round(highest_similarity, 1),
+             "flagged": flagged,
+             "wordCount": word_count,
+             "matchCount": len(matches),
+             "matches": matches
+         })
+
+     highest_any = max(d['similarity'] for d in doc_results) if doc_results else 0.0
+     avg_similarity = round(sum(d['similarity'] for d in doc_results) / len(doc_results), 1) if doc_results else 0.0
+     elapsed = (datetime.utcnow() - t0).total_seconds()
+     processing = f"{int(elapsed // 60)}m {int(elapsed % 60):02d}s"
+
+     report = InternalReportDetail(
+         id="internal_report",
+         name="Internal Plagiarism Check",
+         uploadDate=datetime.utcnow(),
+         processingTime=processing,
+         documents=doc_infos,
+         comparisons=comparisons,
+         summary=InternalReportSummary(
+             totalDocuments=len(doc_results),
+             totalComparisons=(len(docs) * (len(docs) - 1)) // 2,
+             flaggedComparisons=flagged_count,
+             highestSimilarity=round(highest_any, 1),
+             averageSimilarity=avg_similarity,
+         ),
+     )
+
+     # --- Save to MongoDB ---
+     try:
+         db = mongo.sluethink
+         reports_collection = db.reports
+
+         all_sources = set()
+         for comp in comparisons:
+             for o in comp.overlaps:
+                 all_sources.add(o.toDoc)
+
+         report_doc = {
+             "name": f"Internal_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}",
+             "analysisType": "internal",
+             "submittedBy": token_payload.get("name", "System"),
+             "uploadDate": datetime.utcnow().strftime("%Y-%m-%d"),
+             "similarity": highest_any,
+             "status": "completed",
+             "flagged": flagged_count > 0,
+             "fileCount": len(doc_results),
+             "processingTime": processing,
+             "avgSimilarity": avg_similarity,
+             "totalMatches": total_matches,
+             "sources": list(all_sources),
+             "createdAt": datetime.utcnow(),
+             "userId": token_payload.get("sub") or token_payload.get("user_id"),
+             "documents": [
+                 {
+                     "id": d['id'],
+                     "name": d['name'],
+                     "similarity": d['similarity'],
+                     "flagged": d['flagged'],
+                     "wordCount": d['wordCount'],
+                     "matchCount": d['matchCount'],
+                     "matches": [
+                         {
+                             "matched_text": m.text,
+                             "similarity": m.similarity,
+                             "source_url": m.toDoc,
+                             "source_title": m.toDoc,
+                             "source_type": "internal",
+                         } for m in d['matches']
+                     ]
+                 } for d in doc_results
+             ],
+             "summary": {
+                 "totalDocuments": len(doc_results),
+                 "flaggedDocuments": flagged_count,
+                 "highestSimilarity": highest_any,
+                 "averageSimilarity": avg_similarity,
+                 "totalMatches": total_matches,
+             }
+         }
+
+         insert_result = await reports_collection.insert_one(report_doc)
+         print(f"💾 Report saved to MongoDB with ID: {insert_result.inserted_id}")
+         report.id = str(insert_result.inserted_id)
+
+     except Exception as e:
+         print(f"❌ Error saving to MongoDB: {str(e)}")
+
+     return report
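
Because the endpoint compares every document against every other one, a request must carry at least two files under the same multipart field name, and the number of comparisons grows as n(n-1)/2. A minimal client sketch (base URL, file names, and token are assumptions):

# Sketch: submit several files to the pairwise internal-analysis endpoint.
import requests

BASE_URL = "http://127.0.0.1:8000"  # assumed local deployment
token = "..."                        # a JWT accepted by verify_token

names = ["a.txt", "b.txt", "c.txt"]  # 3 docs -> 3 pairwise comparisons
files = [("files", (n, open(n, "rb"))) for n in names]
try:
    resp = requests.post(
        f"{BASE_URL}/teacher/internal-analysis",
        headers={"Authorization": f"Bearer {token}"},
        files=files,
    )
finally:
    for _, (_, fh) in files:
        fh.close()
resp.raise_for_status()
print(resp.json()["summary"])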
app/routers/teacher/lexical_analysis.py ADDED
@@ -0,0 +1,472 @@
+ from fastapi import APIRouter, UploadFile, File, HTTPException, Depends
+ from typing import List
+ from datetime import datetime
+ from fastapi.security import OAuth2PasswordBearer
+ from jose import JWTError, jwt
+ from motor.motor_asyncio import AsyncIOMotorClient
+ import logging
+ import asyncio
+ import threading
+
+ from app.config import MONGODB_URI, ALGORITHM, SECRET_KEY
+ from app.schemas.teacher_schemas import (
+     TeacherLexicalBatchReport, TeacherLexicalSummary,
+     LexicalDocResult, LexicalMatch
+ )
+ from app.utils.file_utils import extract_text_from_file, allowed_file
+ from app.utils.lexical_utils import (
+     get_meaningful_sentences, extract_keywords,
+     find_exact_matches, find_partial_phrase_match,
+ )
+ from app.utils.web_utils import fetch_sources_multi_query
+
+ router = APIRouter(prefix="/teacher", tags=["teacher-lexical"])
+
+ LEXICAL_DOC_THRESHOLD = 0.85  # 85%
+
+ # ✅ HARD TIMEOUT: 3 minutes (180 seconds) for all queries combined
+ SCRAPING_TIMEOUT = 180
+
+ oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/token")
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger("lexical_analysis")
+
+ def verify_token(token: str = Depends(oauth2_scheme)):
+     try:
+         return jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
+     except JWTError:
+         raise HTTPException(status_code=401, detail="Invalid or expired token")
+
+ async def get_mongo_client():
+     return AsyncIOMotorClient(MONGODB_URI)
+
+ def generate_five_queries(text: str) -> List[str]:
+     """
+     Generate 5 high-quality search queries from a document.
+     Covers: beginning, 1/4, middle, 3/4, end.
+     """
+     from app.utils.lexical_utils import get_meaningful_sentences
+
+     logger.info("  🔍 Generating 5 lexical queries from content...")
+
+     sentences = get_meaningful_sentences(text)
+     if len(sentences) < 5:
+         logger.warning("  ⚠️ Not enough sentences, using fewer queries")
+         # Fallback for short documents
+         words = text.split()
+         return [
+             ' '.join(words[:30]) if len(words) > 0 else text,
+             ' '.join(words[max(0, len(words)//4):max(0, len(words)//4)+30]) if len(words) > 30 else text,
+             ' '.join(words[max(0, len(words)//2):max(0, len(words)//2)+30]) if len(words) > 30 else text,
+         ]
+
+     queries = []
+
+     # ✅ Query 1: BEGINNING - First 3-4 sentences
+     beginning_end = min(4, len(sentences))
+     query1 = ' '.join(sentences[:beginning_end])
+     queries.append(query1)
+     logger.debug(f"  Query 1 length: {len(query1.split())} words")
+
+     # ✅ Query 2: QUARTER-POINT - Around 25% of the document
+     quarter_start = max(beginning_end, len(sentences) // 4)
+     quarter_end = min(quarter_start + 4, len(sentences))
+     query2 = ' '.join(sentences[quarter_start:quarter_end])
+     queries.append(query2)
+     logger.debug(f"  Query 2 length: {len(query2.split())} words")
+
+     # ✅ Query 3: MIDDLE - Around 50% of the document
+     mid_start = max(quarter_end, len(sentences) // 2)
+     mid_end = min(mid_start + 4, len(sentences))
+     query3 = ' '.join(sentences[mid_start:mid_end])
+     queries.append(query3)
+     logger.debug(f"  Query 3 length: {len(query3.split())} words")
+
+     # ✅ Query 4: THREE-QUARTER-POINT - Around 75% of the document
+     three_quarter_start = max(mid_end, int(len(sentences) * 0.75))
+     three_quarter_end = min(three_quarter_start + 4, len(sentences))
+     query4 = ' '.join(sentences[three_quarter_start:three_quarter_end])
+     queries.append(query4)
+     logger.debug(f"  Query 4 length: {len(query4.split())} words")
+
+     # ✅ Query 5: END - Last 3-4 sentences
+     end_start = max(three_quarter_end, len(sentences) - 4)
+     query5 = ' '.join(sentences[end_start:])
+     queries.append(query5)
+     logger.debug(f"  Query 5 length: {len(query5.split())} words")
+
+     # ✅ Validate queries
+     final_queries = []
+     for q in queries:
+         q = q.strip()
+         if len(q.split()) >= 15:  # Minimum 15 words for a good search
+             final_queries.append(q)
+
+     logger.info(f"  ✅ Generated {len(final_queries)} queries:")
+     for i, q in enumerate(final_queries, 1):
+         word_count = len(q.split())
+         preview = q[:80] + "..." if len(q) > 80 else q
+         logger.info(f"    Query {i} ({word_count} words): {preview}")
+
+     return final_queries
+
+ class ScrapingTimeoutManager:
+     """Manages web scraping with a hard 3-minute overall timeout"""
+
+     def __init__(self, timeout_seconds: int = 180):
+         self.timeout = timeout_seconds
+         self.start_time = None
+         self.sources = []
+         self.lock = threading.Lock()
+         self.cancelled = False
+
+     def elapsed(self) -> float:
+         """Get elapsed time in seconds"""
+         if self.start_time is None:
+             return 0.0
+         return (datetime.utcnow() - self.start_time).total_seconds()
+
+     def is_timeout(self) -> bool:
+         """Check whether the 3-minute timeout was exceeded"""
+         return self.elapsed() >= self.timeout
+
+     async def fetch_all_sources(self, queries: List[str], num_results: int = 10) -> List:
+         """
+         Fetch sources for all 5 queries with a hard 180-second overall timeout.
+         Immediately stops and starts matching when the timeout is reached.
+         """
+         self.start_time = datetime.utcnow()
+         self.sources = []
+
+         logger.info(f"\n🔎 WEB SCRAPING PHASE")
+         logger.info(f"  Max Duration: {self.timeout}s (3 minutes)")
+         logger.info(f"  Queries: {len(queries)}")
+         logger.info(f"  Starting: {self.start_time.strftime('%H:%M:%S')}")
+
+         # Process all queries in parallel with timeout
+         tasks = []
+         for query_idx, query in enumerate(queries, 1):
+             logger.info(f"\n  Query {query_idx}/{len(queries)}: {query[:60]}...")
+             tasks.append(asyncio.ensure_future(self._fetch_query(query, num_results)))  # real Tasks, so the cancel loop below works
+
+         try:
+             # Wait for all tasks with overall timeout
+             await asyncio.wait_for(
+                 asyncio.gather(*tasks, return_exceptions=True),
+                 timeout=self.timeout
+             )
+         except asyncio.TimeoutError:
+             logger.warning(f"\n🛑 HARD TIMEOUT REACHED after {self.elapsed():.1f}s")
+             logger.warning(f"  Cancelling all pending queries")
+             self.cancelled = True
+             # Cancel remaining tasks
+             for task in tasks:
+                 if isinstance(task, asyncio.Task):
+                     task.cancel()
+
+         # Remove duplicates
+         seen_urls = set()
+         unique_sources = []
+         for source in self.sources:
+             url = source.get('url', '')
+             if url and url not in seen_urls:
+                 seen_urls.add(url)
+                 unique_sources.append(source)
+
+         elapsed = self.elapsed()
+         logger.info(f"\n✅ SCRAPING PHASE STOPPED")
+         logger.info(f"  Total Duration: {elapsed:.1f}s ({int(elapsed)//60}m {int(elapsed)%60}s)")
+         logger.info(f"  Unique Sources: {len(unique_sources)}")
+         logger.info(f"  Status: {'🛑 TIMEOUT' if self.is_timeout() else '✅ COMPLETED'}")
+
+         return unique_sources
+
+     async def _fetch_query(self, query: str, num_results: int = 10):
+         """Fetch sources for a single query"""
+         try:
+             sources = await asyncio.to_thread(
+                 fetch_sources_multi_query,
+                 query,
+                 num_results
+             )
+
+             with self.lock:
+                 self.sources.extend(sources)
+
+             logger.info(f"    ✅ Found {len(sources)} sources")
+
+         except asyncio.CancelledError:
+             logger.warning(f"    ⏭️ Query cancelled (timeout)")
+         except Exception as e:
+             logger.error(f"    ❌ Error: {e}")
+
+ @router.post("/lexical-analysis", response_model=TeacherLexicalBatchReport)
+ async def teacher_lexical_analysis(
+     files: List[UploadFile] = File(...),
+     current_user=Depends(verify_token),
+ ):
+     if not files:
+         raise HTTPException(status_code=400, detail="No files uploaded")
+
+     t0 = datetime.utcnow()
+     doc_results: List[LexicalDocResult] = []
+     total_matches = 0
+
+     logger.info(f"\n{'='*80}")
+     logger.info(f"🔍 LEXICAL ANALYSIS - {len(files)} file(s)")
+     logger.info(f"{'='*80}")
+
+     for idx, f in enumerate(files, start=1):
+         if not allowed_file(f.filename):
+             raise HTTPException(status_code=400, detail=f"Invalid file type: {f.filename}")
+
+         raw = await f.read()
+         try:
+             text = extract_text_from_file(raw, f.filename) or ""
+         except ValueError as ve:
+             # Catch oversized files
+             raise HTTPException(status_code=400, detail=str(ve))
+
+         sentences = get_meaningful_sentences(text)
+
+         logger.info(f"\n📄 File {idx}: {f.filename}")
+         logger.info(f"  Sentences: {len(sentences)}")
+         logger.info(f"  Words: {len(text.split())}")
+
+         # ✅ Generate 5 lexical queries
+         queries = generate_five_queries(text)
+
+         # ✅ WEB SCRAPING WITH 3-MINUTE HARD TIMEOUT (OVERALL)
+         scraper = ScrapingTimeoutManager(timeout_seconds=SCRAPING_TIMEOUT)
+         sources = await scraper.fetch_all_sources(queries, num_results=5)
+
+         # ✅ RESET TIMEOUT - Scraping phase is done, matching has no time limit
+         from app.utils import web_utils
+         web_utils._scraping_deadline = None
+         web_utils._scraping_start_time = None
+
+         logger.info(f"  Total unique sources: {len(sources)}")
+
+         if not sources:
+             logger.warning(f"  ⚠️ No sources found, skipping lexical matching")
+             doc_results.append(LexicalDocResult(
+                 id=idx,
+                 name=f.filename,
+                 author=None,
+                 similarity=0.0,
+                 flagged=False,
+                 wordCount=len(text.split()),
+                 matches=[],
+                 content=text
+             ))
+             continue
+
+         matches: List[LexicalMatch] = []
+         highest = 0.0
+         source_matches_count = {}
+
+         # ✅ MATCHING PHASE (starts immediately after timeout)
+         logger.info(f"\n📊 LEXICAL MATCHING PHASE")
+         logger.info(f"  Comparing {len(sentences)} sentences against {len(sources)} sources...")
+
+         externals = [
+             {
+                 "title": s.get("url", "Unknown"),
+                 "text": s.get("content", ""),
+                 "source_url": s.get("url", ""),
+                 "type": "web",
+             }
+             for s in sources if s.get("content")
+         ]
+
+         for ext in externals:
+             logger.info(f"  🌐 Source: {ext['source_url'][:60]}...")
+             source_matches_count[ext['source_url']] = 0
+
+         # Compare each sentence against ALL sources
+         for s in sentences:
+             best_overall_score = 0.0
+             best_overall_match = None
+             best_overall_src = None
+
+             for ext in externals:
+                 # Try exact match first
+                 sim = find_exact_matches(s, ext["text"])
+                 if sim is not None and sim > best_overall_score:
+                     best_overall_score = sim
+                     best_overall_match = s
+                     best_overall_src = ext
+                     continue
+
+                 # Try partial phrase match
+                 pp = find_partial_phrase_match(s, ext["text"])
+                 if pp:
+                     phrase, score = pp
+                     if score > best_overall_score:
+                         best_overall_score = score
+                         best_overall_match = phrase
+                         best_overall_src = ext
+
+             # Add match if found and above threshold (50%)
+             if best_overall_match and best_overall_score > 0.0:
+                 pct = round(best_overall_score * 100.0, 1)
+
+                 if pct >= 50:
+                     matches.append(LexicalMatch(
+                         matched_text=best_overall_match,
+                         similarity=pct,
+                         source_type=best_overall_src["type"],
+                         source_title=best_overall_src["title"],
+                         source_url=best_overall_src["source_url"],
+                         section=None,
+                         context="Potential plagiarism detected",
+                     ))
+                     source_matches_count[best_overall_src['source_url']] += 1
+                     highest = max(highest, pct)
+                     total_matches += 1
+                     logger.debug(f"    ✅ Match ({pct}%) with {best_overall_src['source_url'][:50]}")
+
+         # Better flagging logic considering multiple sources
+         num_sources_with_matches = sum(1 for c in source_matches_count.values() if c > 0)
+         avg_match_score = (sum(m.similarity for m in matches) / len(matches)) if matches else 0.0
+
+         # Flag if any of these conditions are met:
+         # 1. Single source with high similarity (>85%)
+         # 2. Content plagiarized from 2+ different sources
+         # 3. 3+ matches with average >70%
+         flagged = (
+             highest >= 85 or
+             num_sources_with_matches >= 2 or
+             (len(matches) >= 3 and avg_match_score >= 70)
+         )
+
+         logger.info(f"  📈 Results:")
+         logger.info(f"    Highest similarity: {highest:.1f}%")
+         logger.info(f"    Total matches: {len(matches)}")
+         logger.info(f"    Sources with matches: {num_sources_with_matches}")
+         logger.info(f"    Average match score: {avg_match_score:.1f}%")
+         logger.info(f"    Flagged: {flagged}")
+
+         doc_results.append(LexicalDocResult(
+             id=idx,
+             name=f.filename,
+             author=None,
+             similarity=round(highest, 1),
+             flagged=flagged,
+             wordCount=len(text.split()),
+             matches=matches,
+             content=text  # Include full document for frontend
+         ))
+
+     highest_any = max((d.similarity for d in doc_results), default=0.0)
+     avg = round(sum(d.similarity for d in doc_results) / len(doc_results), 1) if doc_results else 0.0
+     flagged_count = sum(1 for d in doc_results if d.flagged)
+
+     elapsed = (datetime.utcnow() - t0).total_seconds()
+     mm = int(elapsed // 60)
+     ss = int(elapsed % 60)
+     processing = f"{mm}m {ss:02d}s"
+
+     logger.info(f"\n{'='*80}")
+     logger.info(f"✅ ANALYSIS COMPLETE")
+     logger.info(f"{'='*80}")
+     logger.info(f"  Documents: {len(doc_results)}")
+     logger.info(f"  Flagged: {flagged_count}")
+     logger.info(f"  Highest: {highest_any}%")
+     logger.info(f"  Average: {avg}%")
+     logger.info(f"  Total Matches: {total_matches}")
+     logger.info(f"  Total Time: {processing}\n")
+
+     result = TeacherLexicalBatchReport(
+         id="teacher_lexical_batch",
+         name="Teacher Lexical Analysis",
+         uploadDate=datetime.utcnow(),
+         processingTime=processing,
+         documents=doc_results,
+         summary=TeacherLexicalSummary(
+             totalDocuments=len(doc_results),
+             flaggedDocuments=flagged_count,
+             highestSimilarity=highest_any,
+             averageSimilarity=avg,
+             totalMatches=total_matches,
+         ),
+     )
+
+     # Save to MongoDB
+     try:
+         mongo_client = await get_mongo_client()
+         db = mongo_client.sluethink
+         reports_collection = db.reports
+
+         # Extract unique sources from all matches
+         all_sources = set()
+         for doc in doc_results:
+             for match in doc.matches:
+                 all_sources.add(match.source_url)
+
+         # Prepare document for MongoDB
+         report_doc = {
+             "name": f"Lexical_Batch_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}",
+             "analysisType": "lexical",
+             "submittedBy": current_user.get("username", "System"),
+             "uploadDate": datetime.utcnow().strftime("%Y-%m-%d"),
+             "similarity": highest_any,
+             "status": "completed",
+             "flagged": flagged_count > 0,
+             "fileCount": len(doc_results),
+             "processingTime": processing,
+             "avgSimilarity": avg,
+             "sources": list(all_sources),
+             "createdAt": datetime.utcnow(),
+             "userId": current_user.get("sub") or current_user.get("user_id"),
+             # Store full analysis details
+             "documents": [
+                 {
+                     "id": doc.id,
+                     "name": doc.name,
+                     "similarity": doc.similarity,
+                     "flagged": doc.flagged,
+                     "wordCount": doc.wordCount,
+                     "matchCount": len(doc.matches),
+                     "matches": [
+                         {
+                             "matched_text": m.matched_text,
+                             "similarity": m.similarity,
+                             "source_url": m.source_url,
+                             "source_title": m.source_title,
+                             "source_type": m.source_type,
+                         }
+                         for m in doc.matches
+                     ]
+                 }
+                 for doc in doc_results
+             ],
+             "summary": {
+                 "totalDocuments": result.summary.totalDocuments,
+                 "flaggedDocuments": result.summary.flaggedDocuments,
+                 "highestSimilarity": result.summary.highestSimilarity,
+                 "averageSimilarity": result.summary.averageSimilarity,
+                 "totalMatches": result.summary.totalMatches,
+             }
+         }
+
+         # Insert into MongoDB
+         insert_result = await reports_collection.insert_one(report_doc)
+         logger.info(f"💾 Report saved to MongoDB with ID: {insert_result.inserted_id}")
+
+         # Update the result with the MongoDB ID
+         result.id = str(insert_result.inserted_id)
+
+         mongo_client.close()
+
+     except Exception as e:
+         logger.error(f"❌ Error saving to MongoDB: {str(e)}")
+
+     logger.info(f"\n🧾 Returning report:")
+     logger.info(f"  Total Docs: {result.summary.totalDocuments}")
+     logger.info(f"  Flagged Docs: {result.summary.flaggedDocuments}")
+     logger.info(f"  Avg Similarity: {result.summary.averageSimilarity}%")
+     logger.info(f"  Highest Similarity: {result.summary.highestSimilarity}%")
+     logger.info(f"  Total Matches: {result.summary.totalMatches}\n")
+
+     return result
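
The ScrapingTimeoutManager above bounds the whole scraping phase rather than each query: the blocking fetches run concurrently via asyncio.to_thread, asyncio.wait_for enforces one shared deadline, and whatever landed in self.sources before the deadline is kept. The pattern in isolation, as a minimal sketch (slow_fetch is a stand-in for fetch_sources_multi_query):

# Sketch: one hard deadline over several blocking fetches; partial
# results gathered before the timeout are kept rather than discarded.
import asyncio, time

results = []

def slow_fetch(q: str, delay: float) -> str:
    # Stand-in for a blocking scraper call such as fetch_sources_multi_query.
    time.sleep(delay)
    return f"result for {q}"

async def fetch(q: str, delay: float) -> None:
    results.append(await asyncio.to_thread(slow_fetch, q, delay))

async def main() -> None:
    jobs = [("q1", 0.5), ("q2", 1.0), ("q3", 5.0)]
    tasks = [asyncio.ensure_future(fetch(q, d)) for q, d in jobs]
    try:
        await asyncio.wait_for(asyncio.gather(*tasks), timeout=2.0)
    except asyncio.TimeoutError:
        for t in tasks:
            t.cancel()  # the awaiting coroutine stops; the worker thread drains on its own
    print(results)      # only the fetches that beat the 2-second deadline

asyncio.run(main())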
app/routers/teacher/semantic_analysis.py ADDED
@@ -0,0 +1,406 @@
+ from fastapi import APIRouter, UploadFile, File, HTTPException, Depends
+ from typing import List
+ from datetime import datetime
+ from fastapi.security import OAuth2PasswordBearer
+ from jose import JWTError, jwt
+ from motor.motor_asyncio import AsyncIOMotorClient
+ import logging
+ import asyncio
+ import threading
+ from concurrent.futures import ThreadPoolExecutor
+
+ from app.config import MONGODB_URI, ALGORITHM, SECRET_KEY
+ from app.schemas.teacher_schemas import (
+     TeacherLexicalBatchReport, TeacherLexicalSummary,
+     LexicalDocResult, LexicalMatch
+ )
+ from app.utils.file_utils import extract_text_from_file, allowed_file
+ from app.utils.semantic_utils import (
+     generate_five_queries,
+     find_semantic_matches,
+ )
+ from app.utils.web_utils import fetch_sources_multi_query
+ from app.utils.ai_detector import detect_ai_similarity
+
+ router = APIRouter(prefix="/teacher", tags=["teacher-semantic"])
+
+ SEMANTIC_THRESHOLD = 0.50
+ SCRAPING_TIMEOUT = 180  # 3 minutes total for all queries combined
+
+ oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/token")
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger("semantic_analysis")
+
+ def verify_token(token: str = Depends(oauth2_scheme)):
+     try:
+         return jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
+     except JWTError:
+         raise HTTPException(status_code=401, detail="Invalid or expired token")
+
+ async def get_mongo_client():
+     return AsyncIOMotorClient(MONGODB_URI)
+
+ class ScrapingTimeoutManager:
+     """Manages web scraping with a hard 3-minute overall timeout"""
+
+     def __init__(self, timeout_seconds: int = 180):
+         self.timeout = timeout_seconds
+         self.start_time = None
+         self.sources = []
+         self.executor = ThreadPoolExecutor(max_workers=4)
+         self.lock = threading.Lock()
+         self.cancelled = False
+
+     def elapsed(self) -> float:
+         """Get elapsed time in seconds"""
+         if self.start_time is None:
+             return 0.0
+         return (datetime.utcnow() - self.start_time).total_seconds()
+
+     def is_timeout(self) -> bool:
+         """Check whether the 3-minute timeout was exceeded"""
+         return self.elapsed() >= self.timeout
+
+     async def fetch_all_queries(self, queries: List[str], num_results: int = 5) -> List:
+         """
+         Fetch sources for all queries with a hard 180-second overall timeout.
+         Immediately stops and starts matching when the timeout is reached.
+         """
+         self.start_time = datetime.utcnow()
+         self.sources = []
+
+         logger.info(f"\n🔎 WEB SCRAPING PHASE")
+         logger.info(f"  Max Duration: {self.timeout}s (3 minutes)")
+         logger.info(f"  Queries: {len(queries)}")
+         logger.info(f"  Starting: {self.start_time.strftime('%H:%M:%S')}")
+
+         # Process all queries in parallel with timeout
+         tasks = []
+         for query_idx, query in enumerate(queries, 1):
+             logger.info(f"\n  Query {query_idx}/{len(queries)}: {query[:60]}...")
+             tasks.append(asyncio.ensure_future(self._fetch_query(query, num_results)))  # real Tasks, so the cancel loop below works
+
+         try:
+             # Wait for all tasks with overall timeout
+             await asyncio.wait_for(
+                 asyncio.gather(*tasks, return_exceptions=True),
+                 timeout=self.timeout
+             )
+         except asyncio.TimeoutError:
+             logger.warning(f"\n🛑 HARD TIMEOUT REACHED after {self.elapsed():.1f}s")
+             logger.warning(f"  Cancelling all pending queries")
+             self.cancelled = True
+             # Cancel remaining tasks
+             for task in tasks:
+                 if isinstance(task, asyncio.Task):
+                     task.cancel()
+
+         # Remove duplicates
+         seen_urls = set()
+         unique_sources = []
+         for source in self.sources:
+             url = source.get('url', '')
+             if url and url not in seen_urls:
+                 seen_urls.add(url)
+                 unique_sources.append(source)
+
+         elapsed = self.elapsed()
+         logger.info(f"\n✅ SCRAPING PHASE STOPPED")
+         logger.info(f"  Total Duration: {elapsed:.1f}s ({int(elapsed)//60}m {int(elapsed)%60}s)")
+         logger.info(f"  Unique Sources: {len(unique_sources)}")
+         logger.info(f"  Status: {'🛑 TIMEOUT' if self.is_timeout() else '✅ COMPLETED'}")
+
+         return unique_sources
+
+     async def _fetch_query(self, query: str, num_results: int = 5):
+         """Fetch sources for a single query"""
+         try:
+             sources = await asyncio.to_thread(
+                 fetch_sources_multi_query,
+                 query,
+                 num_results
+             )
+
+             with self.lock:
+                 self.sources.extend(sources)
+
+             logger.info(f"    ✅ Found {len(sources)} sources")
+
+         except asyncio.CancelledError:
+             logger.warning(f"    ⏭️ Query cancelled (timeout)")
+         except Exception as e:
+             logger.error(f"    ❌ Error: {e}")
+
+     def cleanup(self):
+         """Clean up executor"""
+         try:
+             self.executor.shutdown(wait=False)
+         except Exception:
+             pass
+
+ @router.post("/semantic-analysis", response_model=TeacherLexicalBatchReport)
+ async def teacher_semantic_analysis(
+     files: List[UploadFile] = File(...),
+     current_user=Depends(verify_token),
+ ):
+     if not files:
+         raise HTTPException(status_code=400, detail="No files uploaded")
+
+     t0 = datetime.utcnow()
+     doc_results: List[LexicalDocResult] = []
+     total_matches = 0
+
+     logger.info(f"\n{'='*80}")
+     logger.info(f"🧠 SEMANTIC ANALYSIS - {len(files)} file(s)")
+     logger.info(f"{'='*80}")
+
+     for idx, f in enumerate(files, start=1):
+         if not allowed_file(f.filename):
+             raise HTTPException(status_code=400, detail=f"Invalid file type: {f.filename}")
+
+         raw = await f.read()
+         text = extract_text_from_file(raw, f.filename) or ""
+
+         logger.info(f"\n📄 File {idx}: {f.filename}")
+         logger.info(f"  Words: {len(text.split())}")
+
+         # ✅ AI DETECTION
+         logger.info(f"🤖 Running AI detection...")
+         ai_similarity = detect_ai_similarity(text)
+         logger.info(f"  AI Similarity: {ai_similarity}")
+
+         # Generate semantic queries (five document regions)
+         queries = generate_five_queries(text)
+
+         # ✅ WEB SCRAPING WITH 3-MINUTE HARD TIMEOUT (OVERALL)
+         scraper = ScrapingTimeoutManager(timeout_seconds=SCRAPING_TIMEOUT)
+         try:
+             unique_sources = await scraper.fetch_all_queries(queries, num_results=5)
+         finally:
+             scraper.cleanup()
+
+         logger.info(f"  Total unique sources: {len(unique_sources)}")
+
+         if not unique_sources:
+             logger.warning(f"  ⚠️ No sources found, skipping semantic matching")
+             doc_results.append(LexicalDocResult(
+                 id=idx,
+                 name=f.filename,
+                 author=None,
+                 similarity=0.0,
+                 flagged=False,
+                 wordCount=len(text.split()),
+                 matches=[],
+                 content=text[:5000],
+                 ai_similarity=ai_similarity
+             ))
+             continue
+
+         # ✅ MATCHING PHASE (starts immediately after timeout)
+         logger.info(f"\n📊 SEMANTIC MATCHING PHASE")
+         logger.info(f"  Comparing against {len(unique_sources)} sources...")
+
+         matches: List[LexicalMatch] = []
+         highest = 0.0
+         source_matches_count = {}
+
+         # Prepare externals
+         externals = [
+             {
+                 "title": s.get("url", "Unknown"),
+                 "text": s.get("content", ""),
+                 "source_url": s.get("url", ""),
+                 "type": "web",
+             }
+             for s in unique_sources if s.get("content")
+         ]
+
+         for ext_idx, ext in enumerate(externals, 1):
+             logger.info(f"  Source {ext_idx}/{len(externals)}: {ext['source_url'][:60]}...")
+             source_matches_count[ext['source_url']] = 0
+
+             try:
+                 # Semantic comparison
+                 semantic_matches = find_semantic_matches(
+                     text,
+                     ext["text"],
+                     threshold=SEMANTIC_THRESHOLD
+                 )
+
+                 logger.info(f"    Found {len(semantic_matches)} semantic matches")
+
+                 for match in semantic_matches:
+                     similarity_pct = round(match['similarity'] * 100, 1)
+
+                     matches.append(LexicalMatch(
+                         matched_text=match['doc_text'][:300],
+                         similarity=similarity_pct,
+                         source_type=ext["type"],
+                         source_title=ext["title"],
+                         source_url=ext["source_url"],
+                         section=None,
+                         context="Semantic similarity detected (possible paraphrasing)",
+                     ))
+
+                     source_matches_count[ext['source_url']] += 1
+                     highest = max(highest, similarity_pct)
+                     total_matches += 1
+
+                     logger.debug(f"    Match: {similarity_pct}% - {match['doc_text'][:50]}...")
+
+             except Exception as e:
+                 logger.error(f"    Error matching source: {e}")
+                 continue
+
+         # Deduplicate matches
+         logger.info(f"  🔄 Deduplicating {len(matches)} matches...")
+         unique_matches_dict = {}
+
+         for match in matches:
+             key = match.matched_text.lower().strip()
+             if key not in unique_matches_dict or match.similarity > unique_matches_dict[key].similarity:
+                 unique_matches_dict[key] = match
+
+         matches = list(unique_matches_dict.values())
+         logger.info(f"  ✅ Deduplicated to {len(matches)} unique matches")
+
+         # Recalculate metrics
+         highest = max((m.similarity for m in matches), default=0.0)
+         source_matches_count = {}
+         for match in matches:
+             source_matches_count[match.source_url] = source_matches_count.get(match.source_url, 0) + 1
+
+         # Flagging logic
+         num_sources_with_matches = sum(1 for c in source_matches_count.values() if c > 0)
+         avg_match_score = (sum(m.similarity for m in matches) / len(matches)) if matches else 0.0
+
+         flagged = (
+             highest >= 80 or
+             num_sources_with_matches >= 2 or
+             (len(matches) >= 2 and avg_match_score >= 70)
+         )
+
+         logger.info(f"  📈 Results:")
+         logger.info(f"    Highest: {highest:.1f}%")
+         logger.info(f"    Total matches: {len(matches)}")
+         logger.info(f"    Sources with matches: {num_sources_with_matches}")
+         logger.info(f"    Average: {avg_match_score:.1f}%")
+         logger.info(f"    Flagged: {flagged}")
+
+         doc_results.append(LexicalDocResult(
+             id=idx,
+             name=f.filename,
+             author=None,
+             similarity=round(highest, 1),
+             flagged=flagged,
+             wordCount=len(text.split()),
+             matches=matches,
+             content=text[:5000],
+             ai_similarity=ai_similarity
+         ))
+
+     # Final summary
+     highest_any = max((d.similarity for d in doc_results), default=0.0)
+     avg = round(sum(d.similarity for d in doc_results) / len(doc_results), 1) if doc_results else 0.0
+     flagged_count = sum(1 for d in doc_results if d.flagged)
+     avg_ai_similarity = round(sum(d.ai_similarity for d in doc_results) / len(doc_results), 3) if doc_results else 0.0
+
+     elapsed = (datetime.utcnow() - t0).total_seconds()
+     mm = int(elapsed // 60)
+     ss = int(elapsed % 60)
+     processing = f"{mm}m {ss:02d}s"
+
+     logger.info(f"\n{'='*80}")
+     logger.info(f"✅ ANALYSIS COMPLETE")
+     logger.info(f"{'='*80}")
+     logger.info(f"  Documents: {len(doc_results)}")
+     logger.info(f"  Flagged: {flagged_count}")
+     logger.info(f"  Highest Semantic Similarity: {highest_any}%")
+     logger.info(f"  Average Semantic Similarity: {avg}%")
+     logger.info(f"  Average AI Similarity: {avg_ai_similarity}")
+     logger.info(f"  Total Matches: {total_matches}")
+     logger.info(f"  Total Time: {processing}\n")
+
+     result = TeacherLexicalBatchReport(
+         id="teacher_semantic_batch",
+         name="Teacher Semantic Analysis",
+         uploadDate=datetime.utcnow(),
+         processingTime=processing,
+         documents=doc_results,
+         summary=TeacherLexicalSummary(
+             totalDocuments=len(doc_results),
+             flaggedDocuments=flagged_count,
+             highestSimilarity=highest_any,
+             averageSimilarity=avg,
+             totalMatches=total_matches,
+             averageAiSimilarity=avg_ai_similarity,
+         ),
+     )
+
+     # Save to MongoDB
+     try:
+         mongo_client = await get_mongo_client()
+         db = mongo_client.sluethink
+         reports_collection = db.reports
+
+         all_sources = set()
+         for doc in doc_results:
+             for match in doc.matches:
+                 all_sources.add(match.source_url)
+
+         report_doc = {
+             "name": f"Semantic_Batch_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}",
+             "analysisType": "semantic",
+             "submittedBy": current_user.get("username", "System"),
+             "uploadDate": datetime.utcnow().strftime("%Y-%m-%d"),
+             "similarity": highest_any,
+             "aiSimilarity": avg_ai_similarity,
+             "status": "completed",
+             "flagged": flagged_count > 0,
+             "fileCount": len(doc_results),
+             "processingTime": processing,
+             "avgSimilarity": avg,
+             "sources": list(all_sources),
+             "createdAt": datetime.utcnow(),
+             "userId": current_user.get("sub") or current_user.get("user_id"),
366
+ "documents": [
367
+ {
368
+ "id": doc.id,
369
+ "name": doc.name,
370
+ "similarity": doc.similarity,
371
+ "aiSimilarity": doc.ai_similarity,
372
+ "flagged": doc.flagged,
373
+ "wordCount": doc.wordCount,
374
+ "matchCount": len(doc.matches),
375
+ "matches": [
376
+ {
377
+ "matched_text": m.matched_text,
378
+ "similarity": m.similarity,
379
+ "source_url": m.source_url,
380
+ "source_title": m.source_title,
381
+ "source_type": m.source_type,
382
+ }
383
+ for m in doc.matches
384
+ ]
385
+ }
386
+ for doc in doc_results
387
+ ],
388
+ "summary": {
389
+ "totalDocuments": result.summary.totalDocuments,
390
+ "flaggedDocuments": result.summary.flaggedDocuments,
391
+ "highestSimilarity": result.summary.highestSimilarity,
392
+ "averageSimilarity": result.summary.averageSimilarity,
393
+ "averageAiSimilarity": result.summary.averageAiSimilarity,
394
+ "totalMatches": result.summary.totalMatches,
395
+ }
396
+ }
397
+
398
+ insert_result = await reports_collection.insert_one(report_doc)
399
+ logger.info(f"πŸ’Ύ Saved to MongoDB: {insert_result.inserted_id}")
400
+ result.id = str(insert_result.inserted_id)
401
+ mongo_client.close()
402
+
403
+ except Exception as e:
404
+ logger.error(f"❌ MongoDB error: {str(e)}")
405
+
406
+ return result
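For reference, a minimal client-side sketch of exercising this batch route; the URL path, port, and bearer token below are placeholders, since the route decorator and auth dependency are defined earlier in this router:

    import requests  # standalone client sketch, not part of the service

    files = [
        ("files", ("essay1.docx", open("essay1.docx", "rb"))),
        ("files", ("essay2.pdf", open("essay2.pdf", "rb"))),
    ]
    resp = requests.post(
        "http://localhost:8000/teacher/semantic-analysis",  # assumed path
        files=files,
        headers={"Authorization": "Bearer <token>"},  # assumed auth scheme
        timeout=600,  # scraping alone is capped at 3 minutes per document
    )
    report = resp.json()
    print(report["summary"]["highestSimilarity"], report["summary"]["averageAiSimilarity"])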
app/schemas/__pycache__/plagiarism_schemas.cpython-312.pyc ADDED
Binary file (1.21 kB). View file
 
app/schemas/__pycache__/report_schemas.cpython-312.pyc ADDED
Binary file (1.07 kB). View file
 
app/schemas/__pycache__/schemas.cpython-312.pyc ADDED
Binary file (2.16 kB). View file
 
app/schemas/__pycache__/sources_schemas.cpython-312.pyc ADDED
Binary file (526 Bytes). View file
 
app/schemas/__pycache__/teacher_schemas.cpython-312.pyc ADDED
Binary file (8.09 kB). View file
 
app/schemas/plagiarism_schemas.py ADDED
@@ -0,0 +1,22 @@
1
+ from pydantic import BaseModel
2
+ from typing import List
3
+
4
+ class MatchDetail(BaseModel):
5
+ matched_text: str
6
+ similarity: float
7
+ source_type: str # "news" or "academic"
8
+ source_title: str
9
+ source_url: str
10
+
11
+
12
+ class SentenceResult(BaseModel):
13
+ original_sentence: str
14
+ normalized_sentence: str
15
+ match_type: str # "full_sentence", "partial_phrase", "no_match"
16
+ matches: List[MatchDetail]
17
+
18
+
19
+ class PlagiarismResponse(BaseModel):
20
+ checked_sentences: int
21
+ checked_sources: int
22
+ results: List[SentenceResult]
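These models validate plain dicts directly; a small round-trip sketch (the sample values are invented):

    payload = {
        "checked_sentences": 1,
        "checked_sources": 1,
        "results": [{
            "original_sentence": "The mitochondrion is the powerhouse of the cell.",
            "normalized_sentence": "the mitochondrion is the powerhouse of the cell",
            "match_type": "full_sentence",
            "matches": [{
                "matched_text": "the mitochondrion is the powerhouse of the cell",
                "similarity": 1.0,
                "source_type": "academic",
                "source_title": "Cell Biology Notes",
                "source_url": "https://example.com/notes",
            }],
        }],
    }
    response = PlagiarismResponse(**payload)  # nested MatchDetail dicts are coerced automatically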
app/schemas/report_schemas.py ADDED
@@ -0,0 +1,21 @@
1
+ from pydantic import BaseModel
2
+ from typing import List
3
+ from datetime import datetime
4
+ from app.schemas.plagiarism_schemas import MatchDetail
5
+
6
+ class ReportSummary(BaseModel):
7
+ id: str # MongoDB’s ObjectId as string
8
+ name: str
9
+ date: datetime
10
+ similarity: float # highest sentence‐level similarity (0–100)
11
+ sources: List[str] # unique list of source titles used
12
+ word_count: int
13
+ time_spent: str # e.g. "00:00"
14
+ flagged: bool # true if similarity > 70%
15
+
16
+
17
+ class ReportDetail(BaseModel):
18
+ id: str
19
+ name: str
20
+ content: str
21
+ plagiarism_data: List[MatchDetail]
app/schemas/sources_schemas.py ADDED
@@ -0,0 +1,9 @@
1
+ from pydantic import BaseModel
2
+
3
+
4
+ class SourceData(BaseModel):
5
+ id: str # MongoDB ObjectId as string
6
+ title: str
7
+ text: str
8
+ source_url: str
9
+ type: str
app/schemas/teacher_schemas.py ADDED
@@ -0,0 +1,185 @@
1
+ from pydantic import BaseModel, Field
2
+ from typing import List, Optional
3
+ from datetime import datetime
4
+
5
+ # ---- Shared ----
6
+
7
+ class DocumentInfo(BaseModel):
8
+ id: int
9
+ name: str
10
+ author: Optional[str] = None
11
+
12
+ class OverlapDetail(BaseModel):
13
+ # For lexical/internal
14
+ fromDoc: str
15
+ toDoc: str
16
+ text: str
17
+ similarity: float # percent (0–100)
18
+ sectionA: Optional[str] = None
19
+ sectionB: Optional[str] = None
20
+ context: Optional[str] = None
21
+
22
+ class ComparisonDetail(BaseModel):
23
+ id: str # "i-j"
24
+ docA: str
25
+ docB: str
26
+ similarity: float # percent (0–100)
27
+ flagged: bool
28
+ overlaps: List[OverlapDetail] = Field(default_factory=list)
29
+ contentA: str = ""
30
+ contentB: str = ""
31
+
32
+ class InternalReportSummary(BaseModel):
33
+ totalDocuments: int
34
+ totalComparisons: int
35
+ flaggedComparisons: int
36
+ highestSimilarity: float
37
+ averageSimilarity: Optional[float] = None
38
+
39
+ class InternalReportDetail(BaseModel):
40
+ id: str
41
+ name: str
42
+ analysisType: str = "internal"
43
+ uploadDate: datetime
44
+ processingTime: str
45
+ status: str = "completed"
46
+ documents: List[DocumentInfo]
47
+ comparisons: List[ComparisonDetail]
48
+ summary: InternalReportSummary
49
+
50
+
51
+
52
+ class LexicalMatch(BaseModel):
53
+ matched_text: str
54
+ similarity: float
55
+ source_type: str
56
+ source_title: str
57
+ source_url: str
58
+ section: Optional[str] = None
59
+ context: Optional[str] = None
60
+
61
+ class LexicalDocResult(BaseModel):
62
+ id: int
63
+ name: str
64
+ author: Optional[str] = None
65
+ similarity: float # overall percent
66
+ flagged: bool
67
+ wordCount: Optional[int] = None
68
+ matches: List[LexicalMatch] = Field(default_factory=list)
69
+ content: Optional[str] = None
70
+ ai_similarity: float = 0.0
71
+
72
+ class TeacherLexicalSummary(BaseModel):
73
+ totalDocuments: int
74
+ flaggedDocuments: int
75
+ highestSimilarity: float
76
+ averageSimilarity: Optional[float] = None
77
+ totalMatches: int
78
+ averageAiSimilarity: float = 0.0
79
+
80
+ class TeacherLexicalBatchReport(BaseModel):
81
+ id: str
82
+ name: str
83
+ analysisType: str = "lexical"
84
+ uploadDate: datetime
85
+ processingTime: str
86
+ status: str = "completed"
87
+ documents: List[LexicalDocResult]
88
+ summary: TeacherLexicalSummary
89
+
90
+ # ---- Teacher Semantic (internal/external) ----
91
+
92
+ class SemanticOverlap(BaseModel):
93
+ textA: str
94
+ textB: str
95
+ cosine: float # 0–1
96
+ cosine_pct: float # 0–100
97
+ sectionA: Optional[str] = None
98
+ sectionB: Optional[str] = None
99
+ confidence: str # "high" | "medium" | "low"
100
+
101
+ class SemanticComparison(BaseModel):
102
+ id: str
103
+ docA: str
104
+ docB: str
105
+ similarity: float # aggregated cosine percent
106
+ flagged: bool
107
+ overlaps: List[SemanticOverlap] = Field(default_factory=list)
108
+
109
+ class TeacherSemanticReport(BaseModel):
110
+ id: str
111
+ name: str
112
+ analysisType: str = "semantic"
113
+ mode: str # "internal" | "external"
114
+ uploadDate: datetime
115
+ processingTime: str
116
+ status: str = "completed"
117
+ documents: List[DocumentInfo]
118
+ comparisons: List[SemanticComparison]
119
+ summary: InternalReportSummary
120
+ narrative: Optional[str] = None
121
+
122
+ # ---- Teacher Code Analysis ----
123
+
124
+
125
+
126
+ class CodeMatch(BaseModel):
127
+ matched_code: str
128
+ similarity: float
129
+ source_type: str # 'peer', 'github', 'stackoverflow', 'web'
130
+ source_title: str
131
+ source_url: Optional[str] = None
132
+ match_type: str # 'exact', 'structural', 'token_sequence'
133
+ line_start: Optional[int] = None
134
+ line_end: Optional[int] = None
135
+ context: Optional[str] = None
136
+
137
+ class CodeFunction(BaseModel):
138
+ name: str
139
+ start_line: int
140
+ end_line: int
141
+ code: str
142
+ complexity: int # Cyclomatic complexity
143
+ tokens: List[str]
144
+ ast_hash: str
145
+
146
+ class CodeDocResult(BaseModel):
147
+ id: int
148
+ name: str
149
+ author: Optional[str] = None
150
+ similarity: float
151
+ flagged: bool
152
+ lineCount: int
153
+ functionCount: int
154
+ matches: List[CodeMatch]
155
+ functions: List[CodeFunction]
156
+ content: str # Full code content
157
+ language: str
158
+
159
+ class CodeAnalysisSummary(BaseModel):
160
+ totalDocuments: int
161
+ flaggedDocuments: int
162
+ highestSimilarity: float
163
+ averageSimilarity: float
164
+ totalMatches: int
165
+ peerMatches: int
166
+ externalMatches: int
167
+
168
+ class TeacherCodeBatchReport(BaseModel):
169
+ id: str
170
+ name: str
171
+ uploadDate: datetime
172
+ processingTime: str
173
+ documents: List[CodeDocResult]
174
+ summary: CodeAnalysisSummary
175
+ assignmentTopic: Optional[str] = None
176
+
177
+ class InternalMatch(BaseModel):
178
+ """Match between two student submissions"""
179
+ student1_id: int
180
+ student2_id: int
181
+ student1_name: str
182
+ student2_name: str
183
+ similarity: float
184
+ match_type: str
185
+ matched_functions: List[str]
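A short sketch of how these models are typically built and then flattened for storage; the values are invented, and model_dump() is the Pydantic v2 spelling (use .dict() on v1):

    match = LexicalMatch(
        matched_text="Photosynthesis converts light energy into chemical energy.",
        similarity=87.5,
        source_type="web",
        source_title="example.com",
        source_url="https://example.com/photosynthesis",
    )
    doc = LexicalDocResult(
        id=1, name="essay1.docx", similarity=87.5, flagged=True,
        wordCount=430, ai_similarity=0.31, matches=[match],
    )
    mongo_ready = doc.model_dump()  # plain dict, safe to pass to insert_one()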
app/utils/__pycache__/ai_detector.cpython-312.pyc ADDED
Binary file (3.22 kB). View file
 
app/utils/__pycache__/code_comparism.cpython-312.pyc ADDED
Binary file (12 kB). View file
 
app/utils/__pycache__/code_detection.cpython-312.pyc ADDED
Binary file (13.5 kB). View file
 
app/utils/__pycache__/code_plagiarism_integration.cpython-312.pyc ADDED
Binary file (10.6 kB). View file
 
app/utils/__pycache__/code_utils.cpython-312.pyc ADDED
Binary file (22.8 kB). View file
 
app/utils/__pycache__/file_utils.cpython-312.pyc ADDED
Binary file (2.42 kB). View file
 
app/utils/__pycache__/lexical_utils.cpython-312.pyc ADDED
Binary file (13.9 kB). View file
 
app/utils/__pycache__/semantic_utils.cpython-312.pyc ADDED
Binary file (15 kB). View file
 
app/utils/__pycache__/text_utils.cpython-312.pyc ADDED
Binary file (10.8 kB). View file
 
app/utils/__pycache__/web_utils.cpython-312.pyc ADDED
Binary file (27.2 kB). View file
 
app/utils/ai_detector.py ADDED
@@ -0,0 +1,80 @@
1
+ """
2
+ AI detection using HuggingFace fakespot-ai/roberta-base-ai-text-detection-v1
3
+ Fast, accurate ML model. Returns AI similarity score (0-1).
4
+ """
5
+ import logging
6
+ import os
7
+ from huggingface_hub import InferenceClient
8
+ from app.config import HF_TOKEN
9
+ logging.basicConfig(level=logging.INFO)
10
+ logger = logging.getLogger("ai_detector")
11
+
12
+
13
+ # Initialize HF client
14
+ try:
15
+ hf_client = InferenceClient(api_key=HF_TOKEN)
16
+ logger.info("βœ“ HuggingFace client initialized")
17
+ except Exception as e:
18
+ logger.error(f"❌ HF_TOKEN not available: {e}")
19
+ hf_client = None
20
+
21
+
22
+ def detect_ai_similarity(text: str) -> float:
23
+ """
24
+ Detect AI-generated text using HuggingFace model.
25
+ Fast, accurate ML-based detection.
26
+
27
+ Returns:
28
+ float: 0.0-1.0 where 1.0 = definitely AI, 0.0 = definitely human
29
+ """
30
+ try:
31
+ logger.info("Starting AI detection (HuggingFace model)...")
32
+
33
+ if not text or len(text) < 20:
34
+ logger.warning("Text too short for detection")
35
+ return 0.0
36
+
37
+ if hf_client is None:
38
+ logger.error("HuggingFace client not initialized")
39
+ return 0.0
40
+
41
+ # Truncate to avoid token limits
42
+ text = text[:2000]
43
+
44
+ logger.info("Sending request to HuggingFace...")
45
+ result = hf_client.text_classification(
46
+ text,
47
+ model="fakespot-ai/roberta-base-ai-text-detection-v1",
48
+ )
49
+
50
+ logger.info(f"API response: {result}")
51
+
52
+ # Extract top result
53
+ if not result or len(result) == 0:
54
+ logger.error("No result from API")
55
+ return 0.0
56
+
57
+ top_result = result[0]
58
+ label = top_result.get('label', '').upper()
59
+ confidence = top_result.get('score', 0.5)
60
+
61
+ logger.info(f"Classification: {label} (confidence: {confidence:.4f})")
62
+
63
+ # Map to 0-1 score
64
+ if label == 'AI':
65
+ ai_score = confidence
66
+ elif label == 'HUMAN':
67
+ ai_score = 1.0 - confidence
68
+ else:
69
+ ai_score = 0.5
70
+
71
+ ai_score = min(max(ai_score, 0.0), 1.0) # Clamp to 0-1
72
+ logger.info(f"βœ… AI detection complete: {ai_score:.3f}")
73
+
74
+ return round(ai_score, 3)
75
+
76
+ except Exception as e:
77
+ logger.error(f"❌ AI detection error: {str(e)}", exc_info=True)
78
+ return 0.0
79
+
80
+
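A usage sketch; the 0.5 cutoff below is an illustrative choice, not something this module defines:

    score = detect_ai_similarity("Some essay text long enough to classify...")
    if score >= 0.5:  # e.g. treat >= 0.5 as "likely AI-assisted"
        print(f"Likely AI-generated (score {score:.3f})")
    else:
        print(f"Likely human-written (score {score:.3f})")

Because the function fails closed to 0.0 on any error, a zero score can mean either "human" or "detector unavailable"; callers that need to tell these apart should inspect the logs or wrap the call themselves.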
app/utils/file_utils.py ADDED
@@ -0,0 +1,36 @@
1
+ import tempfile
2
+ from pdfminer.high_level import extract_text as extract_pdf_text
3
+ from docx import Document as DocxDocument
4
+ from app.config import ALLOWED_EXTENSIONS
5
+
6
+
7
+ def allowed_file(filename: str) -> bool:
8
+ return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
9
+
10
+
11
+ def extract_text_from_file(content_bytes: bytes, filename: str, max_words: int = 500) -> str:
12
+ ext = filename.rsplit(".", 1)[1].lower()
13
+ text = ""
14
+ try:
15
+ if ext == "txt":
16
+ text = content_bytes.decode("utf-8", errors="ignore")
17
+ elif ext == "pdf":
18
+ with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
19
+ tmp.write(content_bytes)
20
+ tmp.flush()
21
+ text = extract_pdf_text(tmp.name)
22
+ elif ext == "docx":
23
+ with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp:
24
+ tmp.write(content_bytes)
25
+ tmp.flush()
26
+ doc = DocxDocument(tmp.name)
27
+ text = "\n".join([p.text for p in doc.paragraphs])
28
+ except Exception:
29
+ text = ""
30
+
31
+ # Word count check
32
+ word_count = len(text.split())
33
+ if word_count > max_words:
34
+ raise ValueError(f"File exceeds {max_words} words (found {word_count}).")
35
+
36
+ return text
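A minimal sketch of calling the extractor directly (the file name is invented). Note that the function raises ValueError rather than truncating once the word cap is exceeded, so callers should be ready to catch it:

    with open("essay.docx", "rb") as fh:
        raw = fh.read()
    try:
        text = extract_text_from_file(raw, "essay.docx", max_words=500)
    except ValueError as exc:
        print(f"Rejected: {exc}")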
app/utils/lexical_utils.py ADDED
@@ -0,0 +1,293 @@
1
+ import re
2
+ from typing import List, Optional, Tuple, Set
3
+ import nltk
4
+ from nltk.tokenize import sent_tokenize, word_tokenize
5
+ from nltk.corpus import stopwords
6
+ from nltk.stem import WordNetLemmatizer
7
+ from rapidfuzz.distance import Levenshtein
8
+
9
+ from app.config import (
10
+ MIN_WORDS_PER_SENTENCE,
11
+ MIN_SENTENCE_LENGTH,
12
+ SEQUENCE_THRESHOLD,
13
+ EXACT_MATCH_SCORE,
14
+ )
15
+
16
+ nltk.download("punkt", quiet=True)
17
+ nltk.download("stopwords", quiet=True)
18
+ nltk.download("wordnet", quiet=True)
19
+ nltk.download("punkt_tab", quiet=True)
20
+
21
+ lemmatizer = WordNetLemmatizer()
22
+
23
+ def normalize_text(text: str) -> str:
24
+ if not text:
25
+ return ""
26
+ text = text.lower()
27
+ text = text.replace("\u00ad", "")
28
+ text = re.sub(r"-\s*\n\s*", "", text)
29
+ text = text.replace("\n", " ")
30
+ text = (text.replace("\u2018", "'").replace("\u2019", "'")
31
+ .replace("\u201c", '"').replace("\u201d", '"')
32
+ .replace("\u2014", "-").replace("\u2013", "-"))
33
+ text = re.sub(r"[^\x20-\x7E]+", " ", text)  # strip remaining non-ASCII after quote/dash normalization
34
+ text = re.sub(r"[^\w\s-]", " ", text)
35
+ text = re.sub(r"\s+", " ", text).strip()
36
+ tokens = word_tokenize(text)
37
+ lemmas = [lemmatizer.lemmatize(tok) for tok in tokens]
38
+ normalized = " ".join(lemmas)
39
+ normalized = re.sub(r"\s+", " ", normalized).strip()
40
+ return normalized
41
+
42
+ def get_meaningful_sentences(text: str) -> List[str]:
43
+ """Extract meaningful sentences from text."""
44
+ sentences = sent_tokenize(text or "")
45
+ filtered = []
46
+ for s in sentences:
47
+ words = word_tokenize(s)
48
+ if len(words) >= MIN_WORDS_PER_SENTENCE and len(s.strip()) >= MIN_SENTENCE_LENGTH:
49
+ filtered.append(s.strip())
50
+ return filtered
51
+
52
+ def extract_keywords(text: str, max_keywords: int = 5) -> List[str]:
53
+ """Extract top keywords from text for search queries."""
54
+ words = word_tokenize((text or "").lower())
55
+ stop_words = set(stopwords.words("english"))
56
+ filtered = [w for w in words if w.isalpha() and w not in stop_words and len(w) > 3]
57
+ freq = nltk.FreqDist(filtered)
58
+ return [word for word, _ in freq.most_common(max_keywords)]
59
+
60
+ def _word_shingles(norm_text: str, k: int = 7) -> List[str]:
61
+ tokens = norm_text.split()
62
+ if len(tokens) < k:
63
+ return []
64
+ return [" ".join(tokens[i:i+k]) for i in range(len(tokens) - k + 1)]
65
+
66
+ def _shingle_sets(a_norm: str, b_norm: str, k: int = 7) -> Tuple[Set[str], Set[str]]:
67
+ return set(_word_shingles(a_norm, k)), set(_word_shingles(b_norm, k))
68
+
69
+ def _jaccard(a: Set[str], b: Set[str]) -> float:
70
+ if not a and not b:
71
+ return 0.0
72
+ inter = len(a & b)
73
+ union = len(a | b) or 1
74
+ return inter / union
75
+
76
+ def _containment(a: Set[str], b: Set[str]) -> float:
77
+ if not a:
78
+ return 0.0
79
+ inter = len(a & b)
80
+ return inter / len(a)
81
+
82
+ def _winnowing_hashes(norm_text: str, k: int = 7, w: int = 4) -> List[Tuple[int, int]]:
83
+ tokens = norm_text.split()
84
+ if len(tokens) < k:
85
+ return []
86
+ shingles = [" ".join(tokens[i:i+k]) for i in range(len(tokens) - k + 1)]
87
+ hashes = [(hash(s) & 0xFFFFFFFF, i) for i, s in enumerate(shingles)]
88
+
89
+ if w <= 1 or len(hashes) <= w:
90
+ return list(dict.fromkeys(hashes))
91
+
92
+ fps: List[Tuple[int, int]] = []
93
+ last_min_abs = -1
94
+ for i in range(0, len(hashes) - w + 1):
95
+ window = hashes[i:i+w]
96
+ min_hash, min_idx = None, None
97
+ for j, (h, _) in enumerate(window):
98
+ if (min_hash is None) or (h < min_hash) or (h == min_hash and j > (min_idx or -1)):
99
+ min_hash, min_idx = h, j
100
+ abs_idx = i + (min_idx or 0)
101
+ if abs_idx != last_min_abs:
102
+ fps.append(hashes[abs_idx])
103
+ last_min_abs = abs_idx
104
+ return list(dict.fromkeys(fps))
105
+
106
+ def _winnowing_overlap(a_fp: List[Tuple[int, int]], b_fp: List[Tuple[int, int]]) -> float:
107
+ a_set, b_set = set(a_fp), set(b_fp)
108
+ if not a_set or not b_set:
109
+ return 0.0
110
+ shared = len(a_set & b_set)
111
+ denom = min(len(a_set), len(b_set)) or 1
112
+ return shared / denom
113
+
114
+ def _exact_substring(norm_sentence: str, norm_external: str) -> bool:
115
+ if not norm_sentence or not norm_external:
116
+ return False
117
+ return norm_external.find(norm_sentence) != -1
118
+
119
+ def _levenshtein_sim(a: str, b: str) -> float:
120
+ if not a or not b:
121
+ return 0.0
122
+ return Levenshtein.normalized_similarity(a, b)
123
+
124
+ def _lcs_length(a: str, b: str) -> int:
125
+ if not a or not b:
126
+ return 0
127
+ prev = [0] * (len(b) + 1)
128
+ best = 0
129
+ for i in range(1, len(a) + 1):
130
+ curr = [0]
131
+ ai = a[i-1]
132
+ for j in range(1, len(b) + 1):
133
+ if ai == b[j-1]:
134
+ v = prev[j-1] + 1
135
+ curr.append(v)
136
+ if v > best:
137
+ best = v
138
+ else:
139
+ curr.append(0)
140
+ prev = curr
141
+ return best
142
+
143
+ def _extract_phrases(norm_text: str, min_words: int = 3, max_words: int = 7) -> List[str]:
144
+ """Extract all phrases of varying lengths for partial match detection."""
145
+ tokens = norm_text.split()
146
+ phrases = []
147
+ for k in range(min_words, min(max_words + 1, len(tokens) + 1)):
148
+ for i in range(len(tokens) - k + 1):
149
+ phrases.append(" ".join(tokens[i:i+k]))
150
+ return phrases
151
+
152
+ def _phrase_containment_match(norm_sentence: str, norm_external: str) -> Optional[Tuple[str, float]]:
153
+ """Check if ANY phrase (3-7 words) from sentence appears in external text."""
154
+ phrases = _extract_phrases(norm_sentence, min_words=3, max_words=7)
155
+
156
+ best_phrase = None
157
+ best_score = 0.0
158
+
159
+ for phrase in phrases:
160
+ if phrase in norm_external:
161
+ score = len(phrase.split()) / len(norm_sentence.split())
162
+ if score > best_score:
163
+ best_score = score
164
+ best_phrase = phrase
165
+
166
+ if best_phrase and best_score >= 0.4:
167
+ return best_phrase, round(best_score, 3)
168
+
169
+ return None
170
+
171
+ def find_exact_matches(sentence: str, external_text: str) -> Optional[float]:
172
+ """
173
+ Full-sentence lexical match with multiple strategies:
174
+ 1) Exact substring match β†’ EXACT_MATCH_SCORE (100%)
175
+ 2) Winnowing fingerprints β†’ robust plagiarism detection
176
+ 3) Edit-distance (Levenshtein) β†’ catches paraphrased content
177
+ 4) LCS (longest common substring) β†’ catches missed overlaps
178
+ """
179
+ norm_s = normalize_text(sentence)
180
+ norm_e = normalize_text(external_text)
181
+
182
+ if len(norm_s) < MIN_SENTENCE_LENGTH:
183
+ return None
184
+
185
+ # 1) Exact full-sentence match
186
+ if _exact_substring(norm_s, norm_e):
187
+ return EXACT_MATCH_SCORE
188
+
189
+ # 2) Winnowing overlap
190
+ win_sim = _winnowing_overlap(
191
+ _winnowing_hashes(norm_s, k=5, w=4),
192
+ _winnowing_hashes(norm_e, k=5, w=4),
193
+ )
194
+ if win_sim >= SEQUENCE_THRESHOLD * 0.9:
195
+ return round(win_sim, 3)
196
+
197
+ # 3) Edit-distance fallback
198
+ lev = _levenshtein_sim(norm_s, norm_e)
199
+ if lev >= SEQUENCE_THRESHOLD:
200
+ return round(lev, 3)
201
+
202
+ # 4) LCS fallback
203
+ lcs_len = _lcs_length(norm_s, norm_e)
204
+ LCS_MIN_CHARS = 15
205
+ if lcs_len >= LCS_MIN_CHARS:
206
+ return round(lcs_len / max(1, len(norm_s)), 3)
207
+
208
+ return None
209
+
210
+
211
+ def find_partial_phrase_match(sentence: str, external_text: str) -> Optional[Tuple[str, float]]:
212
+ """
213
+ Partial reuse detection via multiple strategies:
214
+ 1) Phrase containment (3-7 word phrases)
215
+ 2) Shingle-based Jaccard/Containment
216
+ 3) Edit distance as fallback
217
+ """
218
+ norm_s = normalize_text(sentence)
219
+ norm_e = normalize_text(external_text)
220
+
221
+ if not norm_s or not norm_e:
222
+ return None
223
+
224
+ # Strategy 1: Check if any 3-7 word phrase appears verbatim
225
+ phrase_match = _phrase_containment_match(norm_s, norm_e)
226
+ if phrase_match:
227
+ return sentence, phrase_match[1]
228
+
229
+ # Strategy 2: 7-word shingles
230
+ A, B = _shingle_sets(norm_s, norm_e, k=7)
231
+ if A and B:
232
+ jac = _jaccard(A, B)
233
+ con = _containment(A, B)
234
+ score = max(jac, con)
235
+ if score >= SEQUENCE_THRESHOLD * 0.8:
236
+ return sentence, round(score, 3)
237
+
238
+ # Strategy 3: Smaller shingles (5-word) for more matches
239
+ A_small, B_small = _shingle_sets(norm_s, norm_e, k=5)
240
+ if A_small and B_small:
241
+ jac_small = _jaccard(A_small, B_small)
242
+ con_small = _containment(A_small, B_small)
243
+ score_small = max(jac_small, con_small)
244
+ if score_small >= SEQUENCE_THRESHOLD * 0.75:
245
+ return sentence, round(score_small, 3)
246
+
247
+ # Strategy 4: Edit distance as last resort
248
+ lev = _levenshtein_sim(norm_s, norm_e)
249
+ if lev >= SEQUENCE_THRESHOLD * 0.85:
250
+ return sentence, round(lev, 3)
251
+
252
+ return None
253
+
254
+ def find_partial_phrase_match_for_internal(sentence: str, external_text: str) -> Optional[Tuple[str, float]]:
255
+ """
256
+ Wrapper around find_partial_phrase_match that extracts the actual matched phrase
257
+ that appears in BOTH documents, not just the full sentence from the first doc.
258
+ """
259
+ result = find_partial_phrase_match(sentence, external_text)
260
+ if not result:
261
+ return None
262
+
263
+ matched_text, score = result
264
+ norm_s = normalize_text(sentence)
265
+ norm_e = normalize_text(external_text)
266
+
267
+ if not norm_s or not norm_e:
268
+ return result
269
+
270
+ # Find the longest common substring that appears in both
271
+ words_s = norm_s.split()
272
+ words_e = norm_e.split()
273
+
274
+ best_common = ""
275
+ best_len = 0
276
+
277
+ # Try all consecutive word sequences from sentence A
278
+ for i in range(len(words_s)):
279
+ for j in range(i + 1, len(words_s) + 1):
280
+ phrase = ' '.join(words_s[i:j])
281
+ # Check if this phrase exists in document B
282
+ if phrase in norm_e:
283
+ phrase_len = len(phrase)
284
+ if phrase_len > best_len:
285
+ best_common = phrase
286
+ best_len = phrase_len
287
+
288
+ # If we found a common phrase, return it
289
+ if best_common:
290
+ return best_common, score
291
+
292
+ # Fallback: return original result
293
+ return matched_text, score
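To make the shingle-based scoring concrete, a small sketch using the module's own helpers on two overlapping sentences; k=3 is chosen here only because the sentences are short, whereas the pipeline itself uses 5-7 word shingles:

    a = normalize_text("Plagiarism detection compares overlapping word sequences between documents.")
    b = normalize_text("Detection of plagiarism compares overlapping word sequences between two documents.")
    A, B = _shingle_sets(a, b, k=3)
    print(_jaccard(A, B))      # symmetric overlap of the two shingle sets
    print(_containment(A, B))  # asymmetric: fraction of A's shingles found in B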
app/utils/semantic_utils.py ADDED
@@ -0,0 +1,342 @@
1
+ """
2
+ Semantic analysis with MiniLM embeddings for paraphrasing detection.
3
+ Detects even heavily paraphrased content from LLMs like ChatGPT.
4
+ """
5
+
6
+ import re
7
+ from typing import List, Dict
8
+ import logging
9
+ import numpy as np
10
+
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger("semantic_utils")
13
+
14
+ # Lazy load model to avoid multiple loads
15
+ _encoder_model = None
16
+
17
+ def get_encoder():
18
+ """Lazy load SentenceTransformer MiniLM model"""
19
+ global _encoder_model
20
+ if _encoder_model is None:
21
+ logger.info("πŸ”„ Loading SentenceTransformer MiniLM-L6-v2 model...")
22
+ try:
23
+ from sentence_transformers import SentenceTransformer
24
+ _encoder_model = SentenceTransformer('all-MiniLM-L6-v2')
25
+ logger.info("βœ… Model loaded successfully")
26
+ except ImportError:
27
+ logger.error("❌ sentence-transformers not installed. Install with: pip install sentence-transformers")
28
+ raise
29
+ return _encoder_model
30
+
31
+ def split_into_sentences(text: str) -> List[str]:
32
+ """Split text into sentences, filtering out junk"""
33
+ sentences = re.split(r'(?<=[.!?])\s+', text)
34
+
35
+ cleaned = []
36
+ for sent in sentences:
37
+ sent = sent.strip()
38
+ if len(sent) > 20 and len(sent.split()) >= 5:
39
+ cleaned.append(sent)
40
+
41
+ return cleaned
42
+
43
+ def extract_key_sentences(text: str, num_sentences: int = 5) -> List[str]:
44
+ """
45
+ Extract the most important sentences from text based on:
46
+ - Position (first and last sentences often important)
47
+ - Length (15-30 words is optimal)
48
+ - Keyword frequency
49
+ """
50
+ sentences = split_into_sentences(text)
51
+ if len(sentences) <= num_sentences:
52
+ return sentences
53
+
54
+ scored_sentences = []
55
+
56
+ for idx, sent in enumerate(sentences):
57
+ score = 0.0
58
+ word_count = len(sent.split())
59
+
60
+ # Position score
61
+ if idx < len(sentences) * 0.2 or idx > len(sentences) * 0.8:
62
+ score += 0.3
63
+
64
+ # Length score
65
+ if 15 <= word_count <= 30:
66
+ score += 0.3
67
+ elif 10 <= word_count <= 40:
68
+ score += 0.15
69
+
70
+ # Keyword diversity
71
+ words = set(sent.lower().split())
72
+ common_words = {'the', 'a', 'an', 'and', 'or', 'is', 'are', 'was', 'be', 'it', 'this', 'that'}
73
+ unique_words = len(words - common_words)
74
+ score += min(unique_words / 10, 0.4)
75
+
76
+ scored_sentences.append((score, sent))
77
+
78
+ scored_sentences.sort(reverse=True)
79
+ key_sents = [sent for _, sent in scored_sentences[:num_sentences]]
80
+
81
+ result = []
82
+ for sent in sentences:
83
+ if sent in key_sents:
84
+ result.append(sent)
85
+
86
+ return result
87
+
88
+ def generate_five_queries(text: str, max_words: int = 3000) -> List[str]:
89
+ """
90
+ Generate 5 high-quality semantic search queries from document.
91
+
92
+ Query 1: Beginning (main topic)
93
+ Query 2: Early Middle
94
+ Query 3: Center (supporting evidence)
95
+ Query 4: Late Middle
96
+ Query 5: End (conclusions)
97
+ """
98
+ logger.info(" πŸ” Generating 5 semantic queries from content...")
99
+
100
+ words = text.split()
101
+ if len(words) > max_words:
102
+ text = ' '.join(words[:max_words])
103
+
104
+ sentences = split_into_sentences(text)
105
+ if len(sentences) < 5:
106
+ # Fallback for very short text
107
+ logger.warning(" ⚠️ Very short document, using basic queries")
108
+ return [
109
+ ' '.join(words[:30]),
110
+ ' '.join(words[max(0, len(words)//5):max(0, len(words)//5)+30]),
111
+ ' '.join(words[max(0, 2*len(words)//5):max(0, 2*len(words)//5)+30]),
112
+ ' '.join(words[max(0, 3*len(words)//5):max(0, 3*len(words)//5)+30]),
113
+ ' '.join(words[max(0, 4*len(words)//5):max(0, 4*len(words)//5)+30])
114
+ ]
115
+
116
+ queries = []
117
+ total_sentences = len(sentences)
118
+
119
+ # βœ… Query 1: BEGINNING - First 3-4 sentences
120
+ beginning_end = min(4, total_sentences // 5)
121
+ query1_sents = sentences[:beginning_end]
122
+ query1 = ' '.join(query1_sents)
123
+ queries.append(query1)
124
+ logger.debug(f" Query 1 (Beginning) length: {len(query1.split())} words")
125
+
126
+ # βœ… Query 2: EARLY MIDDLE - First third
127
+ early_start = beginning_end
128
+ early_end = min(early_start + 4, total_sentences // 3)
129
+ query2_sents = sentences[early_start:early_end]
130
+ query2 = ' '.join(query2_sents)
131
+ queries.append(query2)
132
+ logger.debug(f" Query 2 (Early Middle) length: {len(query2.split())} words")
133
+
134
+ # βœ… Query 3: CENTER - Middle section (3-4 sentences)
135
+ mid_start = max(early_end, total_sentences // 3)
136
+ mid_end = min(mid_start + 4, 2 * total_sentences // 3)
137
+ query3_sents = sentences[mid_start:mid_end]
138
+ query3 = ' '.join(query3_sents)
139
+ queries.append(query3)
140
+ logger.debug(f" Query 3 (Center) length: {len(query3.split())} words")
141
+
142
+ # βœ… Query 4: LATE MIDDLE - Second two-thirds
143
+ late_start = mid_end
144
+ late_end = min(late_start + 4, 2 * total_sentences // 3 + (total_sentences // 3))
145
+ query4_sents = sentences[late_start:late_end]
146
+ query4 = ' '.join(query4_sents)
147
+ queries.append(query4)
148
+ logger.debug(f" Query 4 (Late Middle) length: {len(query4.split())} words")
149
+
150
+ # βœ… Query 5: END - Last 3-4 sentences
151
+ end_start = max(late_end, total_sentences - 4)
152
+ query5_sents = sentences[end_start:]
153
+ query5 = ' '.join(query5_sents)
154
+ queries.append(query5)
155
+ logger.debug(f" Query 5 (End) length: {len(query5.split())} words")
156
+
157
+ # βœ… Clean and validate queries
158
+ final_queries = []
159
+ for q in queries:
160
+ q = q.strip()
161
+ if len(q.split()) >= 15: # Minimum 15 words for good search
162
+ final_queries.append(q)
163
+
164
+ logger.info(f" βœ… Generated {len(final_queries)} queries:")
165
+ for i, q in enumerate(final_queries, 1):
166
+ word_count = len(q.split())
167
+ preview = q[:80] + "..." if len(q) > 80 else q
168
+ logger.info(f" Query {i} ({word_count} words): {preview}")
169
+
170
+ return final_queries
171
+
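A quick sketch of what the slicing above yields on a real document (the file name is invented; any text with enough sentences works):

    with open("essay.txt", encoding="utf-8") as fh:
        queries = generate_five_queries(fh.read())
    for q in queries:
        print(len(q.split()), "|", q[:60])  # each query is a 3-4 sentence window of >= 15 words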
172
+ def find_semantic_matches(
173
+ doc_text: str,
174
+ source_text: str,
175
+ threshold: float = 0.50
176
+ ) -> List[Dict]:
177
+ """
178
+ Find semantically similar passages using MiniLM embeddings.
179
+ Detects paraphrased content from LLMs.
180
+
181
+ threshold: 0.65+ = high confidence, 0.50+ = catch paraphrasing, 0.35+ = catch weak matches
182
+ """
183
+ try:
184
+ encoder = get_encoder()
185
+ except ImportError:
186
+ logger.warning("⚠️ SentenceTransformer not available, falling back to string matching")
187
+ return _fallback_semantic_matches(doc_text, source_text, threshold)
188
+
189
+ # Split into sentences
190
+ doc_sentences = split_into_sentences(doc_text)
191
+ source_sentences = split_into_sentences(source_text)
192
+
193
+ if not doc_sentences or not source_sentences:
194
+ return []
195
+
196
+ logger.debug(f" Encoding {len(doc_sentences)} doc sentences + {len(source_sentences)} source sentences...")
197
+
198
+ # Encode all sentences
199
+ try:
200
+ doc_embeddings = encoder.encode(doc_sentences, convert_to_numpy=True, show_progress_bar=False)
201
+ source_embeddings = encoder.encode(source_sentences, convert_to_numpy=True, show_progress_bar=False)
202
+ except Exception as e:
203
+ logger.error(f" Encoding error: {e}, falling back to string matching")
204
+ return _fallback_semantic_matches(doc_text, source_text, threshold)
205
+
206
+ matches = []
207
+ matched_source_indices = set()
208
+
209
+ # Compare using cosine similarity
210
+ for doc_idx, doc_emb in enumerate(doc_embeddings):
211
+ best_similarity = 0.0
212
+ best_source_idx = -1
213
+
214
+ for source_idx, source_emb in enumerate(source_embeddings):
215
+ if source_idx in matched_source_indices:
216
+ continue
217
+
218
+ # Cosine similarity
219
+ similarity = np.dot(doc_emb, source_emb) / (
220
+ np.linalg.norm(doc_emb) * np.linalg.norm(source_emb) + 1e-8
221
+ )
222
+
223
+ if similarity > best_similarity:
224
+ best_similarity = similarity
225
+ best_source_idx = source_idx
226
+
227
+ # Record if above threshold
228
+ if best_similarity >= threshold and best_source_idx >= 0:
229
+ if best_source_idx not in matched_source_indices:
230
+ matched_source_indices.add(best_source_idx)
231
+
232
+ matches.append({
233
+ 'doc_text': doc_sentences[doc_idx],
234
+ 'source_text': source_sentences[best_source_idx],
235
+ 'similarity': float(best_similarity),
236
+ 'doc_index': doc_idx,
237
+ 'source_index': best_source_idx
238
+ })
239
+
240
+ logger.debug(f" Found {len(matches)} semantic matches (threshold: {threshold})")
241
+ return matches
242
+
243
+ def _fallback_semantic_matches(doc_text: str, source_text: str, threshold: float) -> List[Dict]:
244
+ """Fallback to string matching if embeddings not available"""
245
+ from difflib import SequenceMatcher
246
+
247
+ doc_sentences = split_into_sentences(doc_text)
248
+ source_sentences = split_into_sentences(source_text)
249
+
250
+ if not doc_sentences or not source_sentences:
251
+ return []
252
+
253
+ matches = []
254
+ matched_source_indices = set()
255
+
256
+ for doc_idx, doc_sent in enumerate(doc_sentences):
257
+ best_similarity = 0.0
258
+ best_source_idx = -1
259
+
260
+ for source_idx, source_sent in enumerate(source_sentences):
261
+ if source_idx in matched_source_indices:
262
+ continue
263
+
264
+ ratio = SequenceMatcher(None, doc_sent.lower(), source_sent.lower()).ratio()
265
+
266
+ if ratio > best_similarity:
267
+ best_similarity = ratio
268
+ best_source_idx = source_idx
269
+
270
+ if best_similarity >= threshold and best_source_idx >= 0:
271
+ if best_source_idx not in matched_source_indices:
272
+ matched_source_indices.add(best_source_idx)
273
+
274
+ matches.append({
275
+ 'doc_text': doc_sentences[doc_idx],
276
+ 'source_text': source_sentences[best_source_idx],
277
+ 'similarity': float(best_similarity),
278
+ 'doc_index': doc_idx,
279
+ 'source_index': best_source_idx
280
+ })
281
+
282
+ return matches
283
+
284
+ def calculate_semantic_similarity(text1: str, text2: str) -> float:
285
+ """Calculate semantic similarity between two texts"""
286
+ try:
287
+ encoder = get_encoder()
288
+ embeddings = encoder.encode([text1, text2], convert_to_numpy=True, show_progress_bar=False)
289
+ similarity = np.dot(embeddings[0], embeddings[1]) / (
290
+ np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1]) + 1e-8
291
+ )
292
+ return float(similarity)
293
+ except Exception:
294
+ from difflib import SequenceMatcher
295
+ return SequenceMatcher(None, text1.lower(), text2.lower()).ratio()
296
+
297
+ def compare_semantic_chunks(
298
+ doc_text: str,
299
+ source_text: str,
300
+ chunk_size: int = 200,
301
+ threshold: float = 0.65
302
+ ) -> List[Dict]:
303
+ """
304
+ Compare document chunks against source chunks using semantic similarity.
305
+ """
306
+ def chunk_text(text, size):
307
+ words = text.split()
308
+ chunks = []
309
+ for i in range(0, len(words), size):
310
+ chunk = ' '.join(words[i:i+size])
311
+ if len(chunk.split()) >= 20:
312
+ chunks.append(chunk)
313
+ return chunks
314
+
315
+ doc_chunks = chunk_text(doc_text, chunk_size)
316
+ source_chunks = chunk_text(source_text, chunk_size)
317
+
318
+ if not doc_chunks or not source_chunks:
319
+ return []
320
+
321
+ try:
322
+ encoder = get_encoder()
323
+ doc_embeddings = encoder.encode(doc_chunks, convert_to_numpy=True, show_progress_bar=False)
324
+ source_embeddings = encoder.encode(source_chunks, convert_to_numpy=True, show_progress_bar=False)
325
+
326
+ matches = []
327
+ for i, doc_emb in enumerate(doc_embeddings):
328
+ for j, source_emb in enumerate(source_embeddings):
329
+ similarity = np.dot(doc_emb, source_emb) / (
330
+ np.linalg.norm(doc_emb) * np.linalg.norm(source_emb) + 1e-8
331
+ )
332
+
333
+ if similarity >= threshold:
334
+ matches.append({
335
+ 'doc_text': doc_chunks[i][:200] + "...",
336
+ 'source_text': source_chunks[j][:200] + "...",
337
+ 'similarity': float(similarity),
338
+ })
339
+
340
+ return matches
341
+ except Exception:
342
+ return []
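A minimal sketch of the sentence-level matcher on two paraphrased snippets; the score depends on the embedding model, so the exact value is illustrative:

    doc = "Solar panels convert sunlight into electricity using photovoltaic cells."
    src = "Photovoltaic cells inside solar panels turn incoming sunlight into electrical power."
    for m in find_semantic_matches(doc, src, threshold=0.50):
        print(round(m["similarity"], 2), "|", m["doc_text"], "<->", m["source_text"])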
app/utils/web_utils.py ADDED
@@ -0,0 +1,545 @@
1
+ import os
+ from typing import Dict, List, Optional
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import time
5
+ from app.config import API_KEYS, SEARCH_ENGINE_IDS
6
+ import logging
7
+ import hashlib
8
+ from functools import lru_cache
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError as FuturesTimeoutError
10
+ from requests.adapters import HTTPAdapter
11
+ from urllib3.util.retry import Retry
12
+ from urllib.parse import urlparse
13
+ from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
14
+ import re
15
+ import threading
16
+
17
+
18
+ from ..logger import logger
19
+ logger.info("Scraper started")
20
+
21
+ # Optional fallback
22
+ try:
23
+ import cloudscraper
24
+ CLOUDSCRAPER_AVAILABLE = True
25
+ except Exception:
26
+ CLOUDSCRAPER_AVAILABLE = False
27
+
28
+ # ---- OPTIMIZED Configuration ----
29
+ MAX_WORKERS = 8 # Increased for parallelism
30
+ REQUEST_TIMEOUT = 5 # Reduced from 6 - fail faster
31
+ PER_URL_TIMEOUT = 8 # Reduced from 10
32
+ POLITENESS_DELAY = 0.1 # Reduced from 0.25
33
+ GOOGLE_NUM_DEFAULT = 10
34
+ MIN_TEXT_LENGTH = 700
35
+ GOOGLE_API_TIMEOUT = 6
36
+ BRAVE_API_KEY = os.getenv("BRAVE_API_KEY", "")  # read from the environment; never commit live API keys
37
+ BRAVE_ENDPOINT = "https://api.search.brave.com/res/v1/web/search"
38
+
39
+ _api_key_index = 0
40
+ _api_key_lock = threading.Lock()
41
+
42
+ def _get_next_api_credentials():
43
+ """Get next API key and search engine ID in round-robin fashion"""
44
+ global _api_key_index
45
+ with _api_key_lock:
46
+ key = API_KEYS[_api_key_index]
47
+ engine_id = SEARCH_ENGINE_IDS[_api_key_index]
48
+ _api_key_index = (_api_key_index + 1) % len(API_KEYS)
49
+ return key, engine_id
50
+
51
+ # βœ… HARD TIMEOUT: 3 minutes (180 seconds) for ALL scraping
52
+ MULTI_QUERY_TIMEOUT = 180
53
+
54
+ # Blacklist slow/unscrapeable domains
55
+ BLACKLIST_DOMAINS = {
56
+ 'neurips.cc',
57
+ 'icml.cc',
58
+ 'jmlr.org',
59
+ 'researchgate.net',
60
+ 'arxiv.org',
61
+ 'springer.com',
62
+ 'nature.com',
63
+ 'nips.cc',
64
+ 'iccv2023.thecvf.com'
65
+ }
66
+
67
+ # Logging
68
+ logging.basicConfig(level=logging.INFO)
69
+ logger = logging.getLogger("scraper")
70
+
71
+ # ---- Session ----
72
+ def _make_session() -> requests.Session:
73
+ s = requests.Session()
74
+ retries = Retry(
75
+ total=1, # Only 1 retry - fail fast
76
+ backoff_factor=0.1, # Minimal backoff
77
+ status_forcelist=[429, 500, 502, 503, 504],
78
+ allowed_methods=["GET", "POST"],
79
+ respect_retry_after_header=False # Don't wait for Retry-After
80
+ )
81
+ s.mount("https://", HTTPAdapter(max_retries=retries, pool_connections=20, pool_maxsize=20))
82
+ s.mount("http://", HTTPAdapter(max_retries=retries, pool_connections=20, pool_maxsize=20))
83
+ s.headers.update({
84
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
85
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
86
+ "Accept-Language": "en-US,en;q=0.5",
87
+ "Connection": "keep-alive",
88
+ })
89
+ return s
90
+
91
+ _SESSION = _make_session()
92
+
93
+ # ---- Global timeout tracking ----
94
+ _scraping_start_time = None
95
+ _scraping_deadline = None
96
+
97
+ def _set_scraping_deadline():
98
+ """Set the scraping deadline to 3 minutes from now"""
99
+ global _scraping_start_time, _scraping_deadline
100
+ _scraping_start_time = time.time()
101
+ _scraping_deadline = _scraping_start_time + MULTI_QUERY_TIMEOUT
102
+
103
+ def _time_remaining() -> float:
104
+ """Get remaining time in seconds"""
105
+ if _scraping_deadline is None:
106
+ return MULTI_QUERY_TIMEOUT
107
+ remaining = _scraping_deadline - time.time()
108
+ return max(0, remaining)
109
+
110
+ def _is_timeout_exceeded() -> bool:
111
+ """Check if timeout has been exceeded"""
112
+ return _time_remaining() <= 0
113
+
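These helpers implement a simple shared-deadline pattern: arm the budget once per batch, then have every worker poll it cooperatively. A sketch of the intended call order (the loop below is hypothetical):

    _set_scraping_deadline()        # arm the 180s budget
    for url in urls:                # 'urls' is whatever the caller gathered
        if _is_timeout_exceeded():  # workers bail out instead of being killed
            break
        scrape_page(url)
    logger.info(f"{_time_remaining():.0f}s of scraping budget left")

One caveat worth noting: the deadline lives in module-level globals, so concurrent batches in the same process share (and overwrite) each other's budget.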
114
+ # ---- Helpers ----
115
+ def _normalize_whitespace(s: str) -> str:
116
+ return " ".join(s.split())
117
+
118
+ def _text_hash(text: str) -> str:
119
+ return hashlib.sha256(text.encode("utf-8")).hexdigest()
120
+
121
+ def _clean_soup(soup: BeautifulSoup, prefer_main: bool = True, max_chars: Optional[int] = None) -> str:
122
+ for junk in soup(["script", "style", "nav", "footer", "noscript", "header"]):
123
+ junk.decompose()
124
+ parts = []
125
+ if prefer_main:
126
+ main = soup.find(["main", "article", "section"])
127
+ if main:
128
+ elems = main.find_all(["p", "h1", "h2", "h3", "li"])
129
+ else:
130
+ elems = soup.find_all(["p", "h1", "h2", "h3"])
131
+ else:
132
+ elems = soup.find_all(["p", "li"])
133
+ for el in elems:
134
+ t = el.get_text(separator=" ", strip=True)
135
+ if t and len(t) > 30:
136
+ parts.append(t)
137
+ text = _normalize_whitespace(" ".join(parts))
138
+ if max_chars and len(text) > max_chars:
139
+ return text[:max_chars]
140
+ return text
141
+
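A small sketch of what _clean_soup keeps and drops (the HTML is invented):

    from bs4 import BeautifulSoup

    html = (
        "<html><head><script>track()</script></head><body>"
        "<nav>Home | About</nav>"
        "<main><p>This paragraph is comfortably longer than the thirty-character floor, so it survives.</p>"
        "<p>short</p></main>"
        "<footer>(c) 2024</footer></body></html>"
    )
    print(_clean_soup(BeautifulSoup(html, "html.parser")))
    # -> only the long <main> paragraph; script, nav, footer and short fragments are stripped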
142
+ # ---- Google Search ----
143
+ @lru_cache(maxsize=256)
144
+ def google_search(query: str, num_results: int = 10):
145
+ api_key, search_engine_id = _get_next_api_credentials()
146
+
147
+ if not api_key or not search_engine_id:
148
+ logger.warning("Missing API_KEY or SEARCH_ENGINE_ID in config.py")
149
+ return []
150
+
151
+ url = "https://www.googleapis.com/customsearch/v1"
152
+ params = {"key": api_key, "cx": search_engine_id, "q": query, "num": num_results}
153
+ try:
154
+ r = _SESSION.get(url, params=params, timeout=GOOGLE_API_TIMEOUT)
155
+ r.raise_for_status()
156
+ data = r.json()
157
+ items = data.get("items", []) or []
158
+ out = []
159
+ for i in items:
160
+ out.append({
161
+ "link": i.get("link"),
162
+ "title": i.get("title", ""),
163
+ "snippet": i.get("snippet", "")
164
+ })
165
+ logger.info(f"google_search: got {len(out)} items for '{query[:60]}'")
166
+ return out
167
+ except Exception as e:
168
+ logger.warning(f"google_search failed: {e}")
169
+ return []
170
+
171
+ # ---- IMPROVED: Cloudscraper with early exit ----
172
+ def scrape_with_cloudscraper(url: str, timeout: int = 8):
173
+ if not CLOUDSCRAPER_AVAILABLE:
174
+ return ""
175
+ try:
176
+ logger.debug(f"cloudscraper: GET {url}")
177
+ scraper = cloudscraper.create_scraper()
178
+ r = scraper.get(url, timeout=timeout)
179
+ r.raise_for_status()
180
+ soup = BeautifulSoup(r.text, "html.parser")
181
+ text = _clean_soup(soup, prefer_main=True, max_chars=20000)
182
+ if text and len(text) > MIN_TEXT_LENGTH:
183
+ logger.info(f" βœ… Scraped {len(text)} chars (cloudscraper) for {url}")
184
+ return text
185
+ return ""
186
+ except Exception as e:
187
+ logger.debug(f"cloudscraper error for {url}: {e}")
188
+ return ""
189
+
190
+ # ---- IMPROVED: Playwright with hard timeout ----
191
+ def scrape_with_playwright(url: str, timeout: int = 8):
192
+ """Scrape JS-heavy pages with aggressive waiting and content extraction."""
193
+ try:
194
+ logger.debug(f"Playwright: GET {url}")
195
+ with sync_playwright() as p:
196
+ browser = p.chromium.launch(
197
+ headless=True,
198
+ args=[
199
+ "--disable-blink-features=AutomationControlled",
200
+ "--disable-dev-shm-usage",
201
+ "--no-sandbox",
202
+ "--disable-gpu"
203
+ ]
204
+ )
205
+ context = browser.new_context(
206
+ user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
207
+ )
208
+ context.set_default_timeout(timeout * 1000)
209
+ page = context.new_page()
210
+
211
+ try:
212
+ # Navigate with longer timeout for JS-heavy sites
213
+ page.goto(url, wait_until="networkidle", timeout=timeout * 1000)
214
+
215
+ # Wait for common content containers to load
216
+ try:
217
+ page.wait_for_selector("article, [role='article'], .post-content, .blog-content, main", timeout=3000)
218
+ except Exception:
219
+ pass # Selector may not exist, continue anyway
220
+
221
+ # Extended wait for dynamic content
222
+ page.wait_for_timeout(3000) # 3 seconds for rendering
223
+
224
+ # Scroll down to load lazy-loaded images/content
225
+ page.evaluate("""
226
+ async () => {
227
+ await new Promise((resolve) => {
228
+ let totalHeight = 0;
229
+ const distance = 100;
230
+ const timer = setInterval(() => {
231
+ window.scrollBy(0, distance);
232
+ totalHeight += distance;
233
+ if (totalHeight >= document.body.scrollHeight) {
234
+ clearInterval(timer);
235
+ resolve();
236
+ }
237
+ }, 100);
238
+ });
239
+ }
240
+ """)
241
+
242
+ # Wait after scrolling
243
+ page.wait_for_timeout(1500)
244
+
245
+ # Scroll back to top
246
+ page.evaluate("window.scrollTo(0, 0)")
247
+ page.wait_for_timeout(500)
248
+
249
+ html = page.content()
250
+ finally:
251
+ context.close()
252
+ browser.close()
253
+
254
+ soup = BeautifulSoup(html, "html.parser")
255
+ text = _clean_soup(soup, prefer_main=True, max_chars=25000)
256
+
257
+ if text and len(text) > MIN_TEXT_LENGTH:
258
+ logger.info(f" βœ… Scraped {len(text)} chars (Playwright) for {url}")
259
+ return text
260
+
261
+ # If we got very little content, try a less aggressive cleanup
262
+ logger.debug(f"Initial extraction got {len(text)} chars, trying aggressive extraction")
263
+ text_aggressive = _clean_soup(soup, prefer_main=False, max_chars=25000)
264
+ if text_aggressive and len(text_aggressive) > MIN_TEXT_LENGTH and len(text_aggressive) > len(text):
265
+ logger.info(f" βœ… Scraped {len(text_aggressive)} chars (Playwright aggressive) for {url}")
266
+ return text_aggressive
267
+
268
+ logger.debug(f"Playwright extraction minimal for {url}: {len(text)} chars")
269
+ return text if text else ""
270
+
271
+ except Exception as e:  # PlaywrightTimeoutError is already an Exception subclass
272
+ logger.debug(f"Playwright error for {url}: {e}")
273
+ return ""
274
+
275
+ # ---- IMPROVED: Smart fallback strategy ----
276
+ def scrape_page(url: str, timeout: int = 5):
277
+ """Scrape pages in order: requests β†’ cloudscraper β†’ Playwright."""
278
+ # βœ… HARD CHECK: Exit immediately if timeout exceeded
279
+ if _is_timeout_exceeded():
280
+ logger.warning(f"Scraping timeout exceeded, skipping {url}")
281
+ return ""
282
+
283
+ try:
284
+ domain = urlparse(url).netloc.lower()
285
+
286
+ # Skip blacklisted domains entirely
287
+ if any(bd in domain for bd in BLACKLIST_DOMAINS):
288
+ logger.info(f"Skipping blacklisted domain: {domain}")
289
+ return ""
290
+
291
+ # --- 1️⃣ Requests (fastest) ---
292
+ headers = {
293
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
294
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
295
+ "Accept-Language": "en-US,en;q=0.5",
296
+ }
297
+ try:
298
+ logger.debug(f"requests: GET {url}")
299
+ r = _SESSION.get(url, headers=headers, timeout=timeout, allow_redirects=True)
300
+ r.raise_for_status()
301
+ soup = BeautifulSoup(r.text, "html.parser")
302
+ text = _clean_soup(soup, prefer_main=True, max_chars=20000)
303
+ if text and len(text) > MIN_TEXT_LENGTH:
304
+ logger.info(f" βœ… Scraped {len(text)} chars (requests) for {url}")
305
+ return text
306
+ except Exception as e:
307
+ logger.debug(f"requests failed for {url}: {e}")
308
+
309
+ # βœ… HARD CHECK: Exit if timeout exceeded between attempts
310
+ if _is_timeout_exceeded():
311
+ logger.warning(f"Scraping timeout exceeded, stopping fallback methods for {url}")
312
+ return ""
313
+
314
+ # --- 2️⃣ Cloudscraper (medium) ---
315
+ if CLOUDSCRAPER_AVAILABLE:
316
+ try:
317
+ logger.debug(f"cloudscraper: GET {url}")
318
+ scraper = cloudscraper.create_scraper()
319
+ r = scraper.get(url, timeout=timeout)
320
+ r.raise_for_status()
321
+ soup = BeautifulSoup(r.text, "html.parser")
322
+ text = _clean_soup(soup, prefer_main=True, max_chars=20000)
323
+ if text and len(text) > MIN_TEXT_LENGTH:
324
+ logger.info(f" βœ… Scraped {len(text)} chars (cloudscraper) for {url}")
325
+ return text
326
+ except Exception as e:
327
+ logger.debug(f"cloudscraper failed for {url}: {e}")
328
+
329
+ # βœ… HARD CHECK: Exit if timeout exceeded before Playwright
330
+ if _is_timeout_exceeded():
331
+ logger.warning(f"Scraping timeout exceeded, skipping Playwright for {url}")
332
+ return ""
333
+
334
+ # --- 3️⃣ Playwright (heaviest) ---
335
+ try:
336
+ logger.debug(f"Playwright: GET {url}")
337
+ res = scrape_with_playwright(url, timeout=12)
338
+ if res and len(res) > MIN_TEXT_LENGTH:
339
+ return res
340
+ except Exception as e:
341
+ logger.debug(f"Playwright failed for {url}: {e}")
342
+
343
+ except Exception as e:
344
+ logger.debug(f"All scrapers failed for {url}: {e}")
345
+
346
+ return ""
347
+
348
+ # ---- IMPROVED: Parallel fetch with HARD OVERALL TIMEOUT ----
349
+ def fetch_sources(query: str, num_results: int = 10):
350
+ """Fetch sources for a single query"""
351
+ logger.info(f"Fetching sources for query: '{query[:60]}'")
352
+ items = google_search(query, num_results=num_results)
353
+ if not items:
354
+ logger.warning("No URLs returned from Google Search")
355
+ return []
356
+
357
+ urls = [it["link"] for it in items if it.get("link")]
358
+ snippets = {it["link"]: it.get("snippet", "") for it in items if it.get("link")}
359
+ domain_last_hit: Dict[str, float] = {}
360
+
361
+ def _scrape_task(u: str) -> Dict:
362
+ try:
363
+ dom = urlparse(u).netloc
364
+ last = domain_last_hit.get(dom, 0.0)
365
+ now = time.time()
366
+ delta = now - last
367
+ if delta < POLITENESS_DELAY:
368
+ time.sleep(POLITENESS_DELAY - delta)
369
+ domain_last_hit[dom] = time.time()
370
+ logger.info(f"Scraping URL: {u}")
371
+
372
+ text = scrape_page(u, timeout=REQUEST_TIMEOUT)
373
+ if text:
374
+ return {"url": u, "content": text, "method": "scraped", "len": len(text), "hash": _text_hash(text[:4000])}
375
+ else:
376
+ return {"url": u, "content": "", "method": "failed", "len": 0, "hash": ""}
377
+ except Exception as e:
378
+ logger.debug(f"Exception in _scrape_task for {u}: {e}")
379
+ return {"url": u, "content": "", "method": "error", "len": 0, "hash": ""}
380
+
381
+ results = []
382
+ with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
383
+ future_to_url = {ex.submit(_scrape_task, u): u for u in urls}
384
+ for fut in as_completed(future_to_url):
385
+ u = future_to_url[fut]
386
+ try:
387
+ r = fut.result(timeout=PER_URL_TIMEOUT)
388
+ if r.get("content"):
389
+ results.append(r)
390
+ except FuturesTimeoutError:
391
+ logger.warning(f"Timeout scraping {u}")
392
+ except Exception as e:
393
+ logger.warning(f"Error scraping {u}: {e}")
394
+
395
+ if not results:
396
+ logger.info("No pages scraped; using snippets")
397
+ for u in urls:
398
+ snip = snippets.get(u, "")
399
+ if snip:
400
+ results.append({"url": u, "content": snip, "method": "google_snippet", "len": len(snip), "hash": _text_hash(snip)})
401
+ return results
402
+
403
+     # Deduplicate
+     seen: Dict[str, Dict] = {}
+     for r in results:
+         h = r.get("hash") or _text_hash(r.get("content", ""))
+         cur = seen.get(h)
+         if not cur or (r.get("len", 0) > cur.get("len", 0)):
+             seen[h] = r
+
+     deduped = list(seen.values())
+
+     # Fill from snippets if needed
+     if len(deduped) < num_results:
+         for u in urls:
+             if any(x["url"] == u for x in deduped):
+                 continue
+             snip = snippets.get(u, "")
+             if snip:
+                 deduped.append({"url": u, "content": snip, "method": "google_snippet", "len": len(snip), "hash": _text_hash(snip)})
+             if len(deduped) >= num_results:
+                 break
+
+     logger.info(f"Fetched {len(deduped)} sources for query")
+     return deduped
+
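+ # Every record fetch_sources() returns has the same shape; only "scraped" and
+ # "google_snippet" methods survive the empty-content filter above:
+ #
+ #     {"url": str, "content": str, "method": "scraped" | "google_snippet",
+ #      "len": int, "hash": str}
+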
+ # ---- NEW: Multi-query fetch - accepts pre-generated queries ----
+ def fetch_sources_multi_query(query: str, num_results: int = 10) -> List[Dict[str, str]]:
+     """
+     Fetch sources for a single pre-generated query.
+     No internal query generation happens here - the query is scraped as given.
+     """
+     _set_scraping_deadline()
+
+     logger.info("Processing single query with overall 3-minute timeout")
+
+     all_sources: Dict[str, Dict] = {}
+     DOC_EXTENSIONS = [".pdf", ".doc", ".docx", ".odf", ".xls", ".xlsx", ".ppt", ".pptx"]
+     lock = threading.Lock()
+
+     def _process_url(u: str, items: List) -> Optional[Dict]:
+         """Process a single URL, honouring the overall deadline."""
+         # ✅ HARD CHECK: exit immediately if the deadline has passed
+         if _is_timeout_exceeded():
+             return None
+
+         with lock:
+             if u in all_sources:
+                 return None
+         if any(ext in u.lower() for ext in DOC_EXTENSIONS):
+             logger.info(f"Skipping document URL: {u}")
+             return None
+
+         logger.info(f"Scraping URL: {u}")
+
+         try:
+             # Try scraping the page first
+             text_content = scrape_page(u, timeout=REQUEST_TIMEOUT)
+             if text_content and len(text_content) > MIN_TEXT_LENGTH:
+                 logger.info(f"  ✅ Scraped {len(text_content)} chars for {u}")
+                 return {"url": u, "content": text_content, "source_url": u}
+
+             # If scraping failed, fall back to the search snippet
+             snippet = next((it.get("snippet", "") for it in items if it.get("link") == u), "")
+             if snippet and len(snippet) > 50:
+                 logger.info(f"  ⚠️ Using snippet ({len(snippet)} chars) for {u}")
+                 return {"url": u, "content": snippet, "source_url": u}
+
+             logger.warning(f"  ❌ No content extracted for {u}")
+             return None
+
+         except Exception as e:
+             logger.debug(f"Error scraping {u}: {e}")
+             return None
+
+     # ✅ HARD CHECK: exit if the deadline has already passed
+     if _is_timeout_exceeded():
+         logger.warning("Timeout exceeded, skipping query")
+         return []
+
+     logger.info(f"Query: '{query[:60]}'")
+     items = google_search(query, num_results=num_results)
+     if not items:
+         return []
+
+     urls = [it["link"] for it in items if it.get("link")]
+
+     # Process this query's URLs in parallel
+     with ThreadPoolExecutor(max_workers=4) as url_ex:
+         url_futures = {url_ex.submit(_process_url, u, items): u for u in urls}
+         for fut in as_completed(url_futures):
+             # ✅ HARD CHECK: stop processing once the deadline passes
+             if _is_timeout_exceeded():
+                 logger.warning("Timeout exceeded, stopping URL processing")
+                 for f in url_futures:
+                     f.cancel()
+                 break
+
+             u = url_futures[fut]
+             try:
+                 result = fut.result(timeout=PER_URL_TIMEOUT)
+                 if result:
+                     with lock:
+                         all_sources[result['url']] = result
+                     logger.info(f"  ✅ Added: {result['url'][:50]}")
+             except FuturesTimeoutError:
+                 logger.warning(f"Timeout on URL: {u}")
+             except Exception as e:
+                 logger.debug(f"Error scraping {u}: {e}")
+             time.sleep(0.05)
+
+     res = list(all_sources.values())
+     elapsed = time.time() - _scraping_start_time if _scraping_start_time else 0
+     logger.info(f"Sources for this query: {len(res)} (elapsed: {elapsed:.1f}s)")
+     return res
+
+
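+ # Usage sketch (illustrative; `generate_queries` is a hypothetical upstream helper,
+ # not defined in this module - queries are expected to arrive pre-generated):
+ #
+ #     for q in generate_queries(document_text):
+ #         sources = fetch_sources_multi_query(q, num_results=8)
+ #         all_results.extend(sources)
+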
+ def fetch_brave_sources(query: str, num_results: int = 5) -> list[dict]:
+     """Fetch sources from the Brave Search API."""
+     headers = {
+         "Accept": "application/json",
+         # Brave's Web Search API authenticates via the X-Subscription-Token header
+         "X-Subscription-Token": BRAVE_API_KEY,
+     }
+     params = {"q": query, "count": num_results}
+
+     try:
+         resp = requests.get(BRAVE_ENDPOINT, headers=headers, params=params, timeout=6)
+         resp.raise_for_status()
+         data = resp.json()
+         results = []
+         # Brave returns web hits under data["web"]["results"]
+         for item in data.get("web", {}).get("results", []):
+             results.append({
+                 "title": item.get("title") or item.get("url") or "Unknown",
+                 "content": item.get("description") or "",
+             })
+         return results
+     except Exception as e:
+         logger.warning(f"Brave search failed for query '{query}': {e}")
+         return []
+
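+ # Abridged Brave Web Search response shape assumed above (per Brave's API docs;
+ # verify against the subscribed plan before relying on it):
+ #
+ #     {"web": {"results": [{"title": "...", "url": "...", "description": "..."}, ...]}}
+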
+ def prepare_brave_query(text: str, min_len: int = 20, max_len: int = 200) -> Optional[str]:
+     """Collapse whitespace and truncate at a word boundary; return None if too short."""
+     t = re.sub(r"\s+", " ", text).strip()
+     if len(t) < min_len:
+         return None
+     if len(t) > max_len:
+         t = t[:max_len].rsplit(" ", 1)[0]
+     return t
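+
+ # How the two Brave helpers compose (illustrative sketch):
+ #
+ #     q = prepare_brave_query(sentence)          # normalise whitespace, cap at 200 chars
+ #     if q:                                      # None means the sentence was too short
+ #         hits = fetch_brave_sources(q)          # [{"title": ..., "content": ...}, ...]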