add ai similarity
- .DS_Store +0 -0
- app/.DS_Store +0 -0
- app/__pycache__/config.cpython-312.pyc +0 -0
- app/__pycache__/logger.cpython-312.pyc +0 -0
- app/__pycache__/main.cpython-312.pyc +0 -0
- app/config.py +46 -0
- app/dependencies/__pycache__/auth.cpython-312.pyc +0 -0
- app/dependencies/auth.py +41 -0
- app/logger.py +13 -0
- app/main.py +26 -0
- app/routers/.DS_Store +0 -0
- app/routers/__pycache__/health.cpython-312.pyc +0 -0
- app/routers/__pycache__/plagiarism.cpython-312.pyc +0 -0
- app/routers/student/__pycache__/lexical_analysis.cpython-312.pyc +0 -0
- app/routers/student/__pycache__/plagiarism.cpython-312.pyc +0 -0
- app/routers/student/lexical_analysis.py +225 -0
- app/routers/teacher/__pycache__/code_analysis.cpython-312.pyc +0 -0
- app/routers/teacher/__pycache__/internal_analysis.cpython-312.pyc +0 -0
- app/routers/teacher/__pycache__/lexical_analysis.cpython-312.pyc +0 -0
- app/routers/teacher/__pycache__/semantic_analysis.cpython-312.pyc +0 -0
- app/routers/teacher/internal_analysis.py +323 -0
- app/routers/teacher/lexical_analysis.py +472 -0
- app/routers/teacher/semantic_analysis.py +406 -0
- app/schemas/__pycache__/plagiarism_schemas.cpython-312.pyc +0 -0
- app/schemas/__pycache__/report_schemas.cpython-312.pyc +0 -0
- app/schemas/__pycache__/schemas.cpython-312.pyc +0 -0
- app/schemas/__pycache__/sources_schemas.cpython-312.pyc +0 -0
- app/schemas/__pycache__/teacher_schemas.cpython-312.pyc +0 -0
- app/schemas/plagiarism_schemas.py +22 -0
- app/schemas/report_schemas.py +21 -0
- app/schemas/sources_schemas.py +9 -0
- app/schemas/teacher_schemas.py +185 -0
- app/utils/__pycache__/ai_detector.cpython-312.pyc +0 -0
- app/utils/__pycache__/code_comparism.cpython-312.pyc +0 -0
- app/utils/__pycache__/code_detection.cpython-312.pyc +0 -0
- app/utils/__pycache__/code_plagiarism_integration.cpython-312.pyc +0 -0
- app/utils/__pycache__/code_utils.cpython-312.pyc +0 -0
- app/utils/__pycache__/file_utils.cpython-312.pyc +0 -0
- app/utils/__pycache__/lexical_utils.cpython-312.pyc +0 -0
- app/utils/__pycache__/semantic_utils.cpython-312.pyc +0 -0
- app/utils/__pycache__/text_utils.cpython-312.pyc +0 -0
- app/utils/__pycache__/web_utils.cpython-312.pyc +0 -0
- app/utils/ai_detector.py +80 -0
- app/utils/file_utils.py +36 -0
- app/utils/lexical_utils.py +293 -0
- app/utils/semantic_utils.py +342 -0
- app/utils/web_utils.py +545 -0
.DS_Store
ADDED
Binary file (6.15 kB)

app/.DS_Store
ADDED
Binary file (6.15 kB)

app/__pycache__/config.cpython-312.pyc
ADDED
Binary file (1.99 kB)

app/__pycache__/logger.cpython-312.pyc
ADDED
Binary file (587 Bytes)

app/__pycache__/main.cpython-312.pyc
ADDED
Binary file (1.29 kB)
app/config.py
ADDED
@@ -0,0 +1,46 @@
import os
from dotenv import load_dotenv

load_dotenv()

# ----- API Keys & URLs -----
MONGODB_URI = os.getenv("MONGODB_URI", "mongodb+srv://localhost:27017/sluethink")

# ----- Similarity thresholds -----
MIN_WORDS_PER_SENTENCE = 4
MIN_SENTENCE_LENGTH = 30
SEQUENCE_THRESHOLD = 0.75
TFIDF_THRESHOLD = 0.80
SUB_PHRASE_TFIDF_MIN = 0.50
EXACT_MATCH_SCORE = 1.0

# ----- File support -----
ALLOWED_EXTENSIONS = {"txt", "pdf", "docx"}
JWT_SECRET_KEY = os.getenv("JWT_SECRET_KEY", "your_nextauth_secret")
JWT_ALGORITHM = os.getenv("JWT_ALGORITHM", "HS256")
HF_TOKEN = os.getenv("HF_TOKEN", "")

API_KEYS = os.getenv("API_KEYS", "").split(",") if os.getenv("API_KEYS") else []
SEARCH_ENGINE_IDS = os.getenv("SEARCH_ENGINE_IDS", "").split(",") if os.getenv("SEARCH_ENGINE_IDS") else []

SECRET_KEY = os.getenv("SECRET_KEY", "change-me")
ALGORITHM = os.getenv("ALGORITHM", "HS256")

MIN_FUNCTION_LINES = 5
MIN_CODE_BLOCK_LINES = 3
STRUCTURAL_SIMILARITY_THRESHOLD = 0.75
TOKEN_SIMILARITY_THRESHOLD = 0.70
EXACT_MATCH_THRESHOLD = 0.90

MAX_QUERIES_PER_SUBMISSION = 3
RESULTS_PER_QUERY = 10
MAX_SOURCES_TO_ANALYZE = 8

REQUEST_TIMEOUT = 6
MAX_SCRAPE_WORKERS = 4
POLITENESS_DELAY = 0.2

ALLOWED_CODE_EXTENSIONS = {'.py', '.java', '.cpp', '.c', '.js', '.jsx', '.ts', '.tsx', '.cs', '.rb', '.go', '.php'}
MAX_FILE_SIZE_MB = 5

LOG_LEVEL = "INFO"
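One subtlety in this config worth a sketch: API_KEYS and SEARCH_ENGINE_IDS are comma-split only when the variable is actually set, so an unset variable yields an empty list rather than [""]. A minimal standalone demonstration (no project imports needed):

import os

# With the variable set, the comma-split gives the expected list.
os.environ["API_KEYS"] = "key1,key2"
assert (os.getenv("API_KEYS", "").split(",") if os.getenv("API_KEYS") else []) == ["key1", "key2"]

# With it unset, the guard returns [] instead of [""] (note "".split(",") == [""]).
os.environ.pop("API_KEYS", None)
assert (os.getenv("API_KEYS", "").split(",") if os.getenv("API_KEYS") else []) == []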
app/dependencies/__pycache__/auth.cpython-312.pyc
ADDED
Binary file (787 Bytes)
app/dependencies/auth.py
ADDED
@@ -0,0 +1,41 @@
# app/dependencies/auth.py

from jose import JWTError, jwt
from fastapi import Depends, HTTPException
from fastapi.security import OAuth2PasswordBearer
from motor.motor_asyncio import AsyncIOMotorClient
from bson import ObjectId

from app.config import MONGODB_URI, JWT_SECRET_KEY, JWT_ALGORITHM
# from app.schemas.report_schemas import User

oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/auth/login")

async def get_mongo_client():
    return AsyncIOMotorClient(MONGODB_URI)

# async def get_current_user(
#     token: str = Depends(oauth2_scheme),
#     mongo_client: AsyncIOMotorClient = Depends(get_mongo_client),
# ):
#     # 1) Decode the JWT
#     try:
#         payload = jwt.decode(token, JWT_SECRET_KEY, algorithms=[JWT_ALGORITHM])
#         user_id: str = payload.get("id")
#         if not user_id:
#             raise HTTPException(status_code=401, detail="Invalid token payload")
#     except JWTError:
#         raise HTTPException(status_code=401, detail="Could not validate credentials")

#     # 2) Retrieve the user document from MongoDB
#     db = mongo_client.get_default_database()
#     users_col = db["users"]
#     user_doc = await users_col.find_one({"_id": ObjectId(user_id)})
#     if not user_doc:
#         raise HTTPException(status_code=404, detail="User not found")


#     user_doc["id"] = str(user_doc["_id"])


#     return User(**user_doc)
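For local testing, a token that the routers' verify_token accepts can be minted with python-jose. This is a sketch, not part of the commit: it assumes SECRET_KEY and ALGORITHM from app.config, and the claim names simply mirror the payload.get(...) calls made in the routers below.

from datetime import datetime, timedelta
from jose import jwt

from app.config import SECRET_KEY, ALGORITHM

# Hypothetical claims; the routers read "username" and "sub"/"user_id".
claims = {
    "sub": "64a1f0c2e4b0a1b2c3d4e5f6",
    "username": "teacher1",
    "exp": datetime.utcnow() + timedelta(hours=1),
}
token = jwt.encode(claims, SECRET_KEY, algorithm=ALGORITHM)
print(token)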
app/logger.py
ADDED
@@ -0,0 +1,13 @@
# logger.py
import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
    handlers=[
        logging.FileHandler("scraper.log"),
        logging.StreamHandler()
    ]
)

logger = logging.getLogger("scraper")
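Intended usage elsewhere in the package, as a sketch: importing the module configures logging once, and the shared logger writes to both scraper.log and stdout.

from app.logger import logger

logger.info("scrape started")  # goes to both scraper.log and the console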
app/main.py
ADDED
@@ -0,0 +1,26 @@
import logging
from fastapi import FastAPI
from app.routers.student.lexical_analysis import router as student_lexical_router
from app.routers.teacher.semantic_analysis import router as teacher_semantic_router
from app.routers.teacher.lexical_analysis import router as teacher_lexical_router
from app.routers.teacher.internal_analysis import router as teacher_internal_router

from fastapi.middleware.cors import CORSMiddleware

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:3000", "https://plagiarism-detection-frontend.vercel.app", "*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

app.include_router(student_lexical_router)
app.include_router(teacher_internal_router)
app.include_router(teacher_lexical_router)
app.include_router(teacher_semantic_router)
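The module has no __main__ block; a hypothetical dev entry point for serving it with uvicorn (the usual ASGI server for FastAPI) could look like this sketch:

# run_dev.py - hypothetical development entry point, not part of this commit
import uvicorn

if __name__ == "__main__":
    uvicorn.run("app.main:app", host="127.0.0.1", port=8000, reload=True)

Note that with "*" present in allow_origins, the two explicit origins are redundant, since Starlette treats the wildcard as allow-all.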
app/routers/.DS_Store
ADDED
Binary file (6.15 kB)

app/routers/__pycache__/health.cpython-312.pyc
ADDED
Binary file (151 Bytes)

app/routers/__pycache__/plagiarism.cpython-312.pyc
ADDED
Binary file (8.67 kB)

app/routers/student/__pycache__/lexical_analysis.cpython-312.pyc
ADDED
Binary file (9.74 kB)

app/routers/student/__pycache__/plagiarism.cpython-312.pyc
ADDED
Binary file (8.68 kB)
app/routers/student/lexical_analysis.py
ADDED
@@ -0,0 +1,225 @@
from fastapi import APIRouter, UploadFile, File, HTTPException, Depends
from typing import Optional
from datetime import datetime
from fastapi.security import OAuth2PasswordBearer
from jose import JWTError, jwt
from motor.motor_asyncio import AsyncIOMotorClient
import os

from app.config import MONGODB_URI, ALGORITHM, SECRET_KEY
from app.schemas.teacher_schemas import (
    LexicalMatch
)
from app.utils.file_utils import extract_text_from_file, allowed_file
from app.utils.lexical_utils import (
    get_meaningful_sentences, extract_keywords,
    find_exact_matches, find_partial_phrase_match,
)
from app.utils.web_utils import fetch_sources, fetch_sources_multi_query

router = APIRouter(prefix="/student", tags=["student-lexical"])

LEXICAL_DOC_THRESHOLD = 0.85  # 85%
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/token")


def verify_token(token: str = Depends(oauth2_scheme)):
    try:
        return jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
    except JWTError:
        raise HTTPException(status_code=401, detail="Invalid or expired token")

async def get_mongo_client():
    return AsyncIOMotorClient(MONGODB_URI)

@router.post("/lexical-analysis")
async def student_lexical_analysis(
    file: UploadFile = File(...),
    current_user=Depends(verify_token),
):
    if not file:
        raise HTTPException(status_code=400, detail="No file uploaded")

    t0 = datetime.utcnow()
    total_matches = 0

    print("Starting student lexical analysis for uploaded file...")

    # Process single file
    if not allowed_file(file.filename):
        raise HTTPException(status_code=400, detail=f"Invalid file type: {file.filename}")

    raw = await file.read()
    text = extract_text_from_file(raw, file.filename) or ""
    sentences = get_meaningful_sentences(text)

    print(f"\nProcessing file: {file.filename}")
    print(f"  - Extracted {len(sentences)} sentences")
    print(f"  - Approx word count: {len(text.split())}")

    # Build search query from keywords
    sources = fetch_sources_multi_query(text, num_results=10)
    print(f"  - Found {len(sources)} online sources from diverse queries")

    if not sources:
        raise HTTPException(status_code=404, detail=f"No sources found online for {file.filename}")

    matches = []
    highest = 0.0
    source_matches_count = {}

    externals = [
        {
            "title": s.get("url", "Unknown"),
            "text": s.get("content", ""),
            "source_url": s.get("url", ""),
            "type": "web",
        }
        for s in sources if s.get("content")
    ]

    for ext in externals:
        print(f"  Source: {ext['source_url'][:60]}...")
        source_matches_count[ext['source_url']] = 0

    # Compare each sentence against ALL sources
    for s in sentences:
        best_overall_score = 0.0
        best_overall_match = None
        best_overall_src = None

        for ext in externals:
            # Try exact match first
            sim = find_exact_matches(s, ext["text"])
            if sim is not None and sim > best_overall_score:
                best_overall_score = sim
                best_overall_match = s
                best_overall_src = ext
                continue

            # Try partial phrase match
            pp = find_partial_phrase_match(s, ext["text"])
            if pp:
                phrase, score = pp
                if score > best_overall_score:
                    best_overall_score = score
                    best_overall_match = phrase
                    best_overall_src = ext

        # Add match if found and above threshold (50%)
        if best_overall_match and best_overall_score > 0.0:
            pct = round(best_overall_score * 100.0, 1)

            if pct >= 50:
                matches.append({
                    "matched_text": best_overall_match,
                    "similarity": pct,
                    "source_type": best_overall_src["type"],
                    "source_title": best_overall_src["title"],
                    "source_url": best_overall_src["source_url"],
                    "context": "Potential plagiarism detected",
                })
                source_matches_count[best_overall_src['source_url']] += 1
                highest = max(highest, pct)
                total_matches += 1
                print(f"  Match ({pct}%) with {best_overall_src['source_url'][:50]}")

    # Better flagging logic considering multiple sources
    num_sources_with_matches = sum(1 for c in source_matches_count.values() if c > 0)
    avg_match_score = (sum(m["similarity"] for m in matches) / len(matches)) if matches else 0.0

    # Flag if any of these conditions are met:
    # 1. Single source with high similarity (>85%)
    # 2. Content plagiarized from 2+ different sources
    # 3. 3+ matches with average >70%
    flagged = (
        highest >= 85 or
        num_sources_with_matches >= 2 or
        (len(matches) >= 3 and avg_match_score >= 70)
    )

    print(f"  - Highest similarity: {highest:.1f}%")
    print(f"  - Total matches: {len(matches)}")
    print(f"  - Sources with matches: {num_sources_with_matches}")
    print(f"  - Average match score: {avg_match_score:.1f}%")
    print(f"  - Flagged: {flagged}")

    elapsed = (datetime.utcnow() - t0).total_seconds()
    mm = int(elapsed // 60)
    ss = int(elapsed % 60)
    processing_time = f"{mm}m {ss:02d}s"

    print("\nAnalysis completed!")
    print(f"  - Flagged: {flagged}")
    print(f"  - Highest Similarity: {highest}%")
    print(f"  - Average Similarity: {avg_match_score:.1f}%")
    print(f"  - Processing Time: {processing_time}")

    # Extract unique sources
    all_sources = list(set(m["source_url"] for m in matches))

    # Build response
    result = {
        "id": None,  # Will be set after MongoDB insert
        "name": file.filename,
        "content": text,
        "matches": matches,
        "similarity": round(highest, 1),
        "flagged": flagged,
        "wordCount": len(text.split()),
        "processingTime": processing_time,
        "totalMatches": total_matches,
        "averageSimilarity": round(avg_match_score, 1),
        "sources": all_sources,
        "uploadDate": datetime.utcnow().isoformat(),
    }

    # Save to MongoDB
    try:
        mongo_client = await get_mongo_client()
        db = mongo_client.sluethink
        reports_collection = db.reports

        # Prepare document for MongoDB
        report_doc = {
            "name": file.filename,
            "analysisType": "lexical",
            "submittedBy": current_user.get("username", "System"),
            "uploadDate": datetime.utcnow().strftime("%Y-%m-%d"),
            "similarity": highest,
            "status": "completed",
            "flagged": flagged,
            "fileCount": 1,
            "processingTime": processing_time,
            "avgSimilarity": avg_match_score,
            "sources": all_sources,
            "createdAt": datetime.utcnow(),
            "userId": current_user.get("sub") or current_user.get("user_id"),
            "content": text,
            "wordCount": len(text.split()),
            "matches": matches,
            "totalMatches": total_matches,
        }

        # Insert into MongoDB
        insert_result = await reports_collection.insert_one(report_doc)
        print(f"\nReport saved to MongoDB with ID: {insert_result.inserted_id}")

        # Update the result with the MongoDB ID
        result["id"] = str(insert_result.inserted_id)

        mongo_client.close()

    except Exception as e:
        print(f"\nError saving to MongoDB: {str(e)}")
        # Don't fail the request if MongoDB save fails
        result["id"] = "temp_id"

    print(f"\nReturning report:\n"
          f"  Flagged: {flagged}\n"
          f"  Avg Similarity: {avg_match_score:.1f}%\n"
          f"  Highest Similarity: {highest}%\n"
          f"  Total Matches: {total_matches}")

    return result
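A minimal client sketch for exercising this endpoint. The base URL and port are assumptions for a local run, and the token must be one that verify_token accepts (signed with the same SECRET_KEY and ALGORITHM); the printed fields match the result dict built above.

import requests

API = "http://localhost:8000"  # assumed local dev address
TOKEN = "..."                  # a JWT accepted by verify_token

with open("essay.docx", "rb") as fh:
    resp = requests.post(
        f"{API}/student/lexical-analysis",
        headers={"Authorization": f"Bearer {TOKEN}"},
        files={"file": ("essay.docx", fh)},
    )
resp.raise_for_status()
report = resp.json()
print(report["similarity"], report["flagged"], report["totalMatches"])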
app/routers/teacher/__pycache__/code_analysis.cpython-312.pyc
ADDED
Binary file (16.5 kB)

app/routers/teacher/__pycache__/internal_analysis.cpython-312.pyc
ADDED
Binary file (14 kB)

app/routers/teacher/__pycache__/lexical_analysis.cpython-312.pyc
ADDED
Binary file (24.5 kB)

app/routers/teacher/__pycache__/semantic_analysis.cpython-312.pyc
ADDED
Binary file (21.9 kB)
app/routers/teacher/internal_analysis.py
ADDED
@@ -0,0 +1,323 @@
from fastapi import APIRouter, UploadFile, File, HTTPException, Depends
from typing import List, Tuple, Set
from datetime import datetime
from fastapi.security import OAuth2PasswordBearer
from jose import JWTError, jwt
from motor.motor_asyncio import AsyncIOMotorClient

from app.schemas.teacher_schemas import (
    DocumentInfo, OverlapDetail, ComparisonDetail,
    InternalReportDetail, InternalReportSummary
)
from app.utils.file_utils import extract_text_from_file, allowed_file
from app.utils.lexical_utils import (
    find_partial_phrase_match_for_internal,
    get_meaningful_sentences,
    find_exact_matches,
    find_partial_phrase_match,
)
from app.config import MONGODB_URI, ALGORITHM, SECRET_KEY

router = APIRouter(prefix="/teacher", tags=["teacher-internal"])

LEXICAL_PAIR_THRESHOLD = 0.50  # 50% - pairs above this are flagged
OVERLAP_MIN_TOKENS = 12

# Thresholds for similarity color coding:
HIGH_SIMILARITY_THRESHOLD = 0.85    # 85% - Red (very high)
MEDIUM_SIMILARITY_THRESHOLD = 0.70  # 70% - Yellow (medium)
LOW_SIMILARITY_THRESHOLD = 0.50
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/token")


def verify_token(token: str = Depends(oauth2_scheme)):
    try:
        return jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
    except JWTError:
        raise HTTPException(status_code=401, detail="Invalid or expired token")


async def get_mongo_client():
    return AsyncIOMotorClient(MONGODB_URI)


def _percent(x: float) -> float:
    return round(float(x) * 100.0, 1)


def _ordered_pair_key(i: int, j: int) -> str:
    a, b = (i, j) if i < j else (j, i)
    return f"{a}-{b}"


def _aggregate_pair_score(overlaps: List[OverlapDetail]) -> float:
    return max((o.similarity for o in overlaps), default=0.0)


def _create_overlap_key(name_a: str, name_b: str, text: str, similarity: float, context: str) -> str:
    """Create a unique key for overlap deduplication - includes context to distinguish different match types"""
    # Normalize text to handle whitespace variations
    text_normalized = ' '.join(text.split())
    return f"{name_a}|{name_b}|{text_normalized}|{similarity}|{context}"


def _extract_matched_text_from_sentence(sent_b: str, phrase: str) -> str:
    """Extract the actual text from sent_b that matches the phrase"""
    if not sent_b or not phrase:
        return phrase

    # Normalize both for comparison
    phrase_normalized = ' '.join(phrase.split()).lower()
    sent_normalized = ' '.join(sent_b.split()).lower()
    sent_b_normalized = ' '.join(sent_b.split())  # Keep original casing

    # If the phrase exists in the sentence, extract it as-is from the original
    if phrase_normalized in sent_normalized:
        start_idx = sent_normalized.find(phrase_normalized)
        end_idx = start_idx + len(phrase_normalized)
        return sent_b_normalized[start_idx:end_idx].strip()

    # If not found exactly, try to find similar chunks:
    # split into words and look for the best match
    phrase_words = phrase_normalized.split()
    sent_words = sent_normalized.split()

    # Look for the phrase words in the sentence
    for i in range(len(sent_words) - len(phrase_words) + 1):
        if sent_words[i:i+len(phrase_words)] == phrase_words:
            return ' '.join(sent_b_normalized.split()[i:i+len(phrase_words)])

    # Fallback: return the phrase as-is
    return phrase


def _find_overlaps_for_pair(
    name_a: str, sents_a: List[str],
    name_b: str, sents_b: List[str],
    seen_overlaps: Set[str]
) -> List[OverlapDetail]:
    """Find all overlaps between two documents' sentences"""
    overlaps: List[OverlapDetail] = []

    for sent_a in sents_a:
        # Check exact matches
        for sent_b in sents_b:
            exact_score = find_exact_matches(sent_a, sent_b)
            if exact_score is not None:
                sim_pct = _percent(exact_score)
                if sim_pct >= LEXICAL_PAIR_THRESHOLD * 100:
                    context = "Exact/near-exact sentence overlap"
                    overlap_key = _create_overlap_key(name_a, name_b, sent_a, sim_pct, context)
                    if overlap_key not in seen_overlaps:
                        seen_overlaps.add(overlap_key)
                        overlaps.append(OverlapDetail(
                            fromDoc=name_a,
                            toDoc=name_b,
                            text=sent_a,
                            similarity=sim_pct,
                            sectionA=sent_a,
                            sectionB=sent_b,
                            context=context,
                        ))

        # Check partial phrase matches
        best_partial = None
        best_score = 0.0
        best_sent_b = None

        for sent_b in sents_b:
            partial_result = find_partial_phrase_match_for_internal(sent_a, sent_b)
            if partial_result:
                phrase, score = partial_result
                print(f"DEBUG: Partial match - phrase: {phrase[:80]}, score: {score}")
                if score > best_score:
                    best_score = score
                    best_partial = phrase
                    best_sent_b = sent_b

        # Add best partial match if it meets the threshold
        if best_partial and best_sent_b and len(best_partial.split()) >= OVERLAP_MIN_TOKENS:
            sim_pct = _percent(best_score)
            if sim_pct >= LEXICAL_PAIR_THRESHOLD * 100:
                context = "High-overlap phrase (shingle/containment)"
                overlap_key = _create_overlap_key(name_a, name_b, best_partial, sim_pct, context)
                if overlap_key not in seen_overlaps:
                    seen_overlaps.add(overlap_key)
                    overlaps.append(OverlapDetail(
                        fromDoc=name_a,
                        toDoc=name_b,
                        text=best_partial,
                        similarity=sim_pct,
                        sectionA=sent_a,
                        sectionB=best_sent_b,
                        context=context,
                    ))

    return overlaps

@router.post("/internal-analysis", response_model=InternalReportDetail)
async def internal_analysis(
    files: List[UploadFile] = File(...),
    token_payload: dict = Depends(verify_token),
    mongo: AsyncIOMotorClient = Depends(get_mongo_client),
):
    if len(files) < 2:
        raise HTTPException(status_code=400, detail="Upload at least 2 files")

    t0 = datetime.utcnow()

    # --- Load & sentence-split all docs ---
    docs: List[Tuple[str, List[str]]] = []
    doc_infos: List[DocumentInfo] = []
    doc_texts = {}

    for idx, f in enumerate(files, start=1):
        if not allowed_file(f.filename):
            raise HTTPException(status_code=400, detail=f"Invalid file type: {f.filename}")
        raw = await f.read()
        text = extract_text_from_file(raw, f.filename) or ""
        sents = get_meaningful_sentences(text)
        doc_infos.append(DocumentInfo(id=idx, name=f.filename, author=None))
        docs.append((f.filename, sents))
        doc_texts[f.filename] = text

    # --- Pairwise comparisons ---
    comparisons: List[ComparisonDetail] = []
    seen_overlaps: Set[str] = set()

    for i in range(len(docs)):
        for j in range(i + 1, len(docs)):
            name_a, sents_a = docs[i]
            name_b, sents_b = docs[j]

            # Find all overlaps for this pair
            overlaps = _find_overlaps_for_pair(
                name_a, sents_a,
                name_b, sents_b,
                seen_overlaps
            )

            # Calculate pair score and flag if needed
            pair_score = _aggregate_pair_score(overlaps)
            flagged = pair_score >= LEXICAL_PAIR_THRESHOLD * 100

            comp = ComparisonDetail(
                id=_ordered_pair_key(i + 1, j + 1),
                docA=name_a,
                docB=name_b,
                similarity=round(pair_score, 1),
                flagged=flagged,
                overlaps=overlaps,
                contentA=doc_texts[name_a],
                contentB=doc_texts[name_b],
            )
            if flagged:
                comparisons.append(comp)

    # --- Compute per-document results ---
    doc_results = []
    total_matches = 0
    flagged_count = 0

    for d_idx, d in enumerate(doc_infos, start=1):
        name = d.name
        word_count = len(doc_texts[name].split())
        matches = [o for c in comparisons for o in c.overlaps if o.fromDoc == name or o.toDoc == name]
        highest_similarity = max((o.similarity for o in matches), default=0.0)
        flagged = highest_similarity >= LEXICAL_PAIR_THRESHOLD * 100
        if flagged:
            flagged_count += 1
        total_matches += len(matches)

        doc_results.append({
            "id": d.id,
            "name": d.name,
            "similarity": round(highest_similarity, 1),
            "flagged": flagged,
            "wordCount": word_count,
            "matchCount": len(matches),
            "matches": matches
        })

    highest_any = max(d['similarity'] for d in doc_results) if doc_results else 0.0
    avg_similarity = round(sum(d['similarity'] for d in doc_results) / len(doc_results), 1) if doc_results else 0.0
    elapsed = (datetime.utcnow() - t0).total_seconds()
    processing = f"{int(elapsed // 60)}m {int(elapsed % 60):02d}s"

    report = InternalReportDetail(
        id="internal_report",
        name="Internal Plagiarism Check",
        uploadDate=datetime.utcnow(),
        processingTime=processing,
        documents=doc_infos,
        comparisons=comparisons,
        summary=InternalReportSummary(
            totalDocuments=len(doc_results),
            totalComparisons=(len(docs) * (len(docs) - 1)) // 2,
            flaggedComparisons=flagged_count,
            highestSimilarity=round(highest_any, 1),
            averageSimilarity=avg_similarity,
        ),
    )

    # --- Save to MongoDB ---
    try:
        db = mongo.sluethink
        reports_collection = db.reports

        all_sources = set()
        for comp in comparisons:
            for o in comp.overlaps:
                all_sources.add(o.toDoc)

        report_doc = {
            "name": f"Internal_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}",
            "analysisType": "internal",
            "submittedBy": token_payload.get("name", "System"),
            "uploadDate": datetime.utcnow().strftime("%Y-%m-%d"),
            "similarity": highest_any,
            "status": "completed",
            "flagged": flagged_count > 0,
            "fileCount": len(doc_results),
            "processingTime": processing,
            "avgSimilarity": avg_similarity,
            "totalMatches": total_matches,
            "sources": list(all_sources),
            "createdAt": datetime.utcnow(),
            "userId": token_payload.get("sub") or token_payload.get("user_id"),
            "documents": [
                {
                    "id": d['id'],
                    "name": d['name'],
                    "similarity": d['similarity'],
                    "flagged": d['flagged'],
                    "wordCount": d['wordCount'],
                    "matchCount": d['matchCount'],
                    "matches": [
                        {
                            "matched_text": m.text,
                            "similarity": m.similarity,
                            "source_url": m.toDoc,
                            "source_title": m.toDoc,
                            "source_type": "internal",
                        } for m in d['matches']
                    ]
                } for d in doc_results
            ],
            "summary": {
                "totalDocuments": len(doc_results),
                "flaggedDocuments": flagged_count,
                "highestSimilarity": highest_any,
                "averageSimilarity": avg_similarity,
                "totalMatches": total_matches,
            }
        }

        insert_result = await reports_collection.insert_one(report_doc)
        print(f"Report saved to MongoDB with ID: {insert_result.inserted_id}")
        report.id = str(insert_result.inserted_id)

    except Exception as e:
        print(f"Error saving to MongoDB: {str(e)}")

    return report
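The summary's totalComparisons comes from the unordered-pair count, and it grows quadratically with the batch size; a quick standalone sketch of that scaling:

from math import comb

for n in (2, 5, 10, 30):
    # comb(n, 2) == n * (n - 1) // 2, the same formula used in the summary above
    print(n, "documents ->", comb(n, 2), "pairwise comparisons")

Since every pair also compares sentences pairwise, runtime grows quickly with both file count and document length, which is why each overlap is deduplicated via seen_overlaps before it is stored.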
app/routers/teacher/lexical_analysis.py
ADDED
|
@@ -0,0 +1,472 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter, UploadFile, File, HTTPException, Depends
|
| 2 |
+
from typing import List
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from fastapi.security import OAuth2PasswordBearer
|
| 5 |
+
from jose import JWTError, jwt
|
| 6 |
+
from motor.motor_asyncio import AsyncIOMotorClient
|
| 7 |
+
import logging
|
| 8 |
+
import asyncio
|
| 9 |
+
import threading
|
| 10 |
+
|
| 11 |
+
from app.config import MONGODB_URI, ALGORITHM, SECRET_KEY
|
| 12 |
+
from app.schemas.teacher_schemas import (
|
| 13 |
+
TeacherLexicalBatchReport, TeacherLexicalSummary,
|
| 14 |
+
LexicalDocResult, LexicalMatch
|
| 15 |
+
)
|
| 16 |
+
from app.utils.file_utils import extract_text_from_file, allowed_file
|
| 17 |
+
from app.utils.lexical_utils import (
|
| 18 |
+
get_meaningful_sentences, extract_keywords,
|
| 19 |
+
find_exact_matches, find_partial_phrase_match,
|
| 20 |
+
)
|
| 21 |
+
from app.utils.web_utils import fetch_sources_multi_query
|
| 22 |
+
|
| 23 |
+
router = APIRouter(prefix="/teacher", tags=["teacher-lexical"])
|
| 24 |
+
|
| 25 |
+
LEXICAL_DOC_THRESHOLD = 0.85 # 85%
|
| 26 |
+
|
| 27 |
+
# β
HARD TIMEOUT: 3 minutes (180 seconds) for all queries combined
|
| 28 |
+
SCRAPING_TIMEOUT = 180
|
| 29 |
+
|
| 30 |
+
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/token")
|
| 31 |
+
logging.basicConfig(level=logging.INFO)
|
| 32 |
+
logger = logging.getLogger("lexical_analysis")
|
| 33 |
+
|
| 34 |
+
def verify_token(token: str = Depends(oauth2_scheme)):
|
| 35 |
+
try:
|
| 36 |
+
return jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
|
| 37 |
+
except JWTError:
|
| 38 |
+
raise HTTPException(status_code=401, detail="Invalid or expired token")
|
| 39 |
+
|
| 40 |
+
async def get_mongo_client():
|
| 41 |
+
return AsyncIOMotorClient(MONGODB_URI)
|
| 42 |
+
|
| 43 |
+
def generate_five_queries(text: str) -> List[str]:
|
| 44 |
+
"""
|
| 45 |
+
Generate 5 high-quality search queries from document.
|
| 46 |
+
Covers: beginning, 1/4, middle, 3/4, end
|
| 47 |
+
"""
|
| 48 |
+
from app.utils.lexical_utils import get_meaningful_sentences
|
| 49 |
+
|
| 50 |
+
logger.info(" π Generating 5 lexical queries from content...")
|
| 51 |
+
|
| 52 |
+
sentences = get_meaningful_sentences(text)
|
| 53 |
+
if len(sentences) < 5:
|
| 54 |
+
logger.warning(" β οΈ Not enough sentences, using fewer queries")
|
| 55 |
+
# Fallback for short documents
|
| 56 |
+
words = text.split()
|
| 57 |
+
return [
|
| 58 |
+
' '.join(words[:30]) if len(words) > 0 else text,
|
| 59 |
+
' '.join(words[max(0, len(words)//4):max(0, len(words)//4)+30]) if len(words) > 30 else text,
|
| 60 |
+
' '.join(words[max(0, len(words)//2):max(0, len(words)//2)+30]) if len(words) > 30 else text,
|
| 61 |
+
]
|
| 62 |
+
|
| 63 |
+
queries = []
|
| 64 |
+
|
| 65 |
+
# β
Query 1: BEGINNING - First 3-4 sentences
|
| 66 |
+
beginning_end = min(4, len(sentences))
|
| 67 |
+
query1 = ' '.join(sentences[:beginning_end])
|
| 68 |
+
queries.append(query1)
|
| 69 |
+
logger.debug(f" Query 1 length: {len(query1.split())} words")
|
| 70 |
+
|
| 71 |
+
# β
Query 2: QUARTER-POINT - Around 25% of document
|
| 72 |
+
quarter_start = max(beginning_end, len(sentences) // 4)
|
| 73 |
+
quarter_end = min(quarter_start + 4, len(sentences))
|
| 74 |
+
query2 = ' '.join(sentences[quarter_start:quarter_end])
|
| 75 |
+
queries.append(query2)
|
| 76 |
+
logger.debug(f" Query 2 length: {len(query2.split())} words")
|
| 77 |
+
|
| 78 |
+
# β
Query 3: MIDDLE - Around 50% of document
|
| 79 |
+
mid_start = max(quarter_end, len(sentences) // 2)
|
| 80 |
+
mid_end = min(mid_start + 4, len(sentences))
|
| 81 |
+
query3 = ' '.join(sentences[mid_start:mid_end])
|
| 82 |
+
queries.append(query3)
|
| 83 |
+
logger.debug(f" Query 3 length: {len(query3.split())} words")
|
| 84 |
+
|
| 85 |
+
# β
Query 4: THREE-QUARTER-POINT - Around 75% of document
|
| 86 |
+
three_quarter_start = max(mid_end, int(len(sentences) * 0.75))
|
| 87 |
+
three_quarter_end = min(three_quarter_start + 4, len(sentences))
|
| 88 |
+
query4 = ' '.join(sentences[three_quarter_start:three_quarter_end])
|
| 89 |
+
queries.append(query4)
|
| 90 |
+
logger.debug(f" Query 4 length: {len(query4.split())} words")
|
| 91 |
+
|
| 92 |
+
# β
Query 5: END - Last 3-4 sentences
|
| 93 |
+
end_start = max(three_quarter_end, len(sentences) - 4)
|
| 94 |
+
query5 = ' '.join(sentences[end_start:])
|
| 95 |
+
queries.append(query5)
|
| 96 |
+
logger.debug(f" Query 5 length: {len(query5.split())} words")
|
| 97 |
+
|
| 98 |
+
# β
Validate queries
|
| 99 |
+
final_queries = []
|
| 100 |
+
for q in queries:
|
| 101 |
+
q = q.strip()
|
| 102 |
+
if len(q.split()) >= 15: # Minimum 15 words for good search
|
| 103 |
+
final_queries.append(q)
|
| 104 |
+
|
| 105 |
+
logger.info(f" β
Generated {len(final_queries)} queries:")
|
| 106 |
+
for i, q in enumerate(final_queries, 1):
|
| 107 |
+
word_count = len(q.split())
|
| 108 |
+
preview = q[:80] + "..." if len(q) > 80 else q
|
| 109 |
+
logger.info(f" Query {i} ({word_count} words): {preview}")
|
| 110 |
+
|
| 111 |
+
return final_queries
|
| 112 |
+
|
| 113 |
+
class ScrapingTimeoutManager:
|
| 114 |
+
"""Manages web scraping with hard 3-minute overall timeout"""
|
| 115 |
+
|
| 116 |
+
def __init__(self, timeout_seconds: int = 180):
|
| 117 |
+
self.timeout = timeout_seconds
|
| 118 |
+
self.start_time = None
|
| 119 |
+
self.sources = []
|
| 120 |
+
self.lock = threading.Lock()
|
| 121 |
+
self.cancelled = False
|
| 122 |
+
|
| 123 |
+
def elapsed(self) -> float:
|
| 124 |
+
"""Get elapsed time in seconds"""
|
| 125 |
+
if self.start_time is None:
|
| 126 |
+
return 0.0
|
| 127 |
+
return (datetime.utcnow() - self.start_time).total_seconds()
|
| 128 |
+
|
| 129 |
+
def is_timeout(self) -> bool:
|
| 130 |
+
"""Check if 3-minute timeout exceeded"""
|
| 131 |
+
return self.elapsed() >= self.timeout
|
| 132 |
+
|
| 133 |
+
async def fetch_all_sources(self, queries: List[str], num_results: int = 10) -> List:
|
| 134 |
+
"""
|
| 135 |
+
Fetch sources for all 5 queries with hard 180-second overall timeout.
|
| 136 |
+
Immediately stops and starts matching when timeout reached.
|
| 137 |
+
"""
|
| 138 |
+
self.start_time = datetime.utcnow()
|
| 139 |
+
self.sources = []
|
| 140 |
+
|
| 141 |
+
logger.info(f"\nπ WEB SCRAPING PHASE")
|
| 142 |
+
logger.info(f" Max Duration: {self.timeout}s (3 minutes)")
|
| 143 |
+
logger.info(f" Queries: {len(queries)}")
|
| 144 |
+
logger.info(f" Starting: {self.start_time.strftime('%H:%M:%S')}")
|
| 145 |
+
|
| 146 |
+
# Process all queries in parallel with timeout
|
| 147 |
+
tasks = []
|
| 148 |
+
for query_idx, query in enumerate(queries, 1):
|
| 149 |
+
logger.info(f"\n Query {query_idx}/{len(queries)}: {query[:60]}...")
|
| 150 |
+
tasks.append(self._fetch_query(query, num_results))
|
| 151 |
+
|
| 152 |
+
try:
|
| 153 |
+
# Wait for all tasks with overall timeout
|
| 154 |
+
await asyncio.wait_for(
|
| 155 |
+
asyncio.gather(*tasks, return_exceptions=True),
|
| 156 |
+
timeout=self.timeout
|
| 157 |
+
)
|
| 158 |
+
except asyncio.TimeoutError:
|
| 159 |
+
logger.warning(f"\nπ HARD TIMEOUT REACHED after {self.elapsed():.1f}s")
|
| 160 |
+
logger.warning(f" Cancelling all pending queries")
|
| 161 |
+
self.cancelled = True
|
| 162 |
+
# Cancel remaining tasks
|
| 163 |
+
for task in tasks:
|
| 164 |
+
if isinstance(task, asyncio.Task):
|
| 165 |
+
task.cancel()
|
| 166 |
+
|
| 167 |
+
# Remove duplicates
|
| 168 |
+
seen_urls = set()
|
| 169 |
+
unique_sources = []
|
| 170 |
+
for source in self.sources:
|
| 171 |
+
url = source.get('url', '')
|
| 172 |
+
if url and url not in seen_urls:
|
| 173 |
+
seen_urls.add(url)
|
| 174 |
+
unique_sources.append(source)
|
| 175 |
+
|
| 176 |
+
elapsed = self.elapsed()
|
| 177 |
+
logger.info(f"\nβ
SCRAPING PHASE STOPPED")
|
| 178 |
+
logger.info(f" Total Duration: {elapsed:.1f}s ({int(elapsed)//60}m {int(elapsed)%60}s)")
|
| 179 |
+
logger.info(f" Unique Sources: {len(unique_sources)}")
|
| 180 |
+
logger.info(f" Status: {'π TIMEOUT' if self.is_timeout() else 'β
COMPLETED'}")
|
| 181 |
+
|
| 182 |
+
return unique_sources
|
| 183 |
+
|
| 184 |
+
async def _fetch_query(self, query: str, num_results: int = 10):
|
| 185 |
+
"""Fetch sources for a single query"""
|
| 186 |
+
try:
|
| 187 |
+
sources = await asyncio.to_thread(
|
| 188 |
+
fetch_sources_multi_query,
|
| 189 |
+
query,
|
| 190 |
+
num_results
|
| 191 |
+
)
|
| 192 |
+
|
| 193 |
+
with self.lock:
|
| 194 |
+
self.sources.extend(sources)
|
| 195 |
+
|
| 196 |
+
logger.info(f" β
Found {len(sources)} sources")
|
| 197 |
+
|
| 198 |
+
except asyncio.CancelledError:
|
| 199 |
+
logger.warning(f" βοΈ Query cancelled (timeout)")
|
| 200 |
+
except Exception as e:
|
| 201 |
+
logger.error(f" β Error: {e}")
|
| 202 |
+
|
| 203 |
+
@router.post("/lexical-analysis", response_model=TeacherLexicalBatchReport)
|
| 204 |
+
async def teacher_lexical_analysis(
|
| 205 |
+
files: List[UploadFile] = File(...),
|
| 206 |
+
current_user=Depends(verify_token),
|
| 207 |
+
):
|
| 208 |
+
if not files:
|
| 209 |
+
raise HTTPException(status_code=400, detail="No files uploaded")
|
| 210 |
+
|
| 211 |
+
t0 = datetime.utcnow()
|
| 212 |
+
doc_results: List[LexicalDocResult] = []
|
| 213 |
+
total_matches = 0
|
| 214 |
+
|
| 215 |
+
logger.info(f"\n{'='*80}")
|
| 216 |
+
logger.info(f"π LEXICAL ANALYSIS - {len(files)} file(s)")
|
| 217 |
+
logger.info(f"{'='*80}")
|
| 218 |
+
|
| 219 |
+
for idx, f in enumerate(files, start=1):
|
| 220 |
+
if not allowed_file(f.filename):
|
| 221 |
+
raise HTTPException(status_code=400, detail=f"Invalid file type: {f.filename}")
|
| 222 |
+
|
| 223 |
+
raw = await f.read()
|
| 224 |
+
try:
|
| 225 |
+
text = extract_text_from_file(raw, f.filename) or ""
|
| 226 |
+
except ValueError as ve:
|
| 227 |
+
# Catch over-word files
|
| 228 |
+
raise HTTPException(status_code=400, detail=str(ve))
|
| 229 |
+
|
| 230 |
+
sentences = get_meaningful_sentences(text)
|
| 231 |
+
|
| 232 |
+
logger.info(f"\nπ File {idx}: {f.filename}")
|
| 233 |
+
logger.info(f" Sentences: {len(sentences)}")
|
| 234 |
+
logger.info(f" Words: {len(text.split())}")
|
| 235 |
+
|
| 236 |
+
# β
Generate 5 lexical queries
|
| 237 |
+
queries = generate_five_queries(text)
|
| 238 |
+
|
| 239 |
+
# β
WEB SCRAPING WITH 3-MINUTE HARD TIMEOUT (OVERALL)
|
| 240 |
+
scraper = ScrapingTimeoutManager(timeout_seconds=SCRAPING_TIMEOUT)
|
| 241 |
+
sources = await scraper.fetch_all_sources(queries, num_results=5)
|
| 242 |
+
|
| 243 |
+
# β
RESET TIMEOUT - Scraping phase is done, matching has no time limit
|
| 244 |
+
from app.utils import web_utils
|
| 245 |
+
web_utils._scraping_deadline = None
|
| 246 |
+
web_utils._scraping_start_time = None
|
| 247 |
+
|
| 248 |
+
logger.info(f" Total unique sources: {len(sources)}")
|
| 249 |
+
|
| 250 |
+
if not sources:
|
| 251 |
+
logger.warning(f" β οΈ No sources found, skipping lexical matching")
|
| 252 |
+
doc_results.append(LexicalDocResult(
|
| 253 |
+
id=idx,
|
| 254 |
+
name=f.filename,
|
| 255 |
+
author=None,
|
| 256 |
+
similarity=0.0,
|
| 257 |
+
flagged=False,
|
| 258 |
+
wordCount=len(text.split()),
|
| 259 |
+
matches=[],
|
| 260 |
+
content=text
|
| 261 |
+
))
|
| 262 |
+
continue
|
| 263 |
+
|
| 264 |
+
matches: List[LexicalMatch] = []
|
| 265 |
+
highest = 0.0
|
| 266 |
+
source_matches_count = {}
|
| 267 |
+
|
| 268 |
+
# β
MATCHING PHASE (starts immediately after timeout)
|
| 269 |
+
logger.info(f"\nπ LEXICAL MATCHING PHASE")
|
| 270 |
+
logger.info(f" Comparing {len(sentences)} sentences against {len(sources)} sources...")
|
| 271 |
+
|
| 272 |
+
externals = [
|
| 273 |
+
{
|
| 274 |
+
"title": s.get("url", "Unknown"),
|
| 275 |
+
"text": s.get("content", ""),
|
| 276 |
+
"source_url": s.get("url", ""),
|
| 277 |
+
"type": "web",
|
| 278 |
+
}
|
| 279 |
+
for s in sources if s.get("content")
|
| 280 |
+
]
|
| 281 |
+
|
| 282 |
+
for ext in externals:
|
| 283 |
+
logger.info(f" π Source: {ext['source_url'][:60]}...")
|
| 284 |
+
source_matches_count[ext['source_url']] = 0
|
| 285 |
+
|
| 286 |
+
# Compare each sentence against ALL sources
|
| 287 |
+
for s in sentences:
|
| 288 |
+
best_overall_score = 0.0
|
| 289 |
+
best_overall_match = None
|
| 290 |
+
best_overall_src = None
|
| 291 |
+
|
| 292 |
+
for ext in externals:
|
| 293 |
+
# Try exact match first
|
| 294 |
+
sim = find_exact_matches(s, ext["text"])
|
| 295 |
+
if sim is not None and sim > best_overall_score:
|
| 296 |
+
best_overall_score = sim
|
| 297 |
+
best_overall_match = s
|
| 298 |
+
best_overall_src = ext
|
| 299 |
+
continue
|
| 300 |
+
|
| 301 |
+
# Try partial phrase match
|
| 302 |
+
pp = find_partial_phrase_match(s, ext["text"])
|
| 303 |
+
if pp:
|
| 304 |
+
phrase, score = pp
|
| 305 |
+
if score > best_overall_score:
|
| 306 |
+
best_overall_score = score
|
| 307 |
+
best_overall_match = phrase
|
| 308 |
+
best_overall_src = ext
|
| 309 |
+
|
| 310 |
+
# Add match if found and above threshold (50%)
|
| 311 |
+
if best_overall_match and best_overall_score > 0.0:
|
| 312 |
+
pct = round(best_overall_score * 100.0, 1)
|
| 313 |
+
|
| 314 |
+
if pct >= 50:
|
| 315 |
+
matches.append(LexicalMatch(
|
| 316 |
+
matched_text=best_overall_match,
|
| 317 |
+
similarity=pct,
|
| 318 |
+
source_type=best_overall_src["type"],
|
| 319 |
+
source_title=best_overall_src["title"],
|
| 320 |
+
source_url=best_overall_src["source_url"],
|
| 321 |
+
section=None,
|
| 322 |
+
context="Potential plagiarism detected",
|
| 323 |
+
))
|
| 324 |
+
source_matches_count[best_overall_src['source_url']] += 1
|
| 325 |
+
highest = max(highest, pct)
|
| 326 |
+
total_matches += 1
|
| 327 |
+
logger.debug(f" β
Match ({pct}%) with {best_overall_src['source_url'][:50]}")
|
| 328 |
+
|
| 329 |
+
# Better flagging logic considering multiple sources
|
| 330 |
+
num_sources_with_matches = sum(1 for c in source_matches_count.values() if c > 0)
|
| 331 |
+
avg_match_score = (sum(m.similarity for m in matches) / len(matches)) if matches else 0.0
|
| 332 |
+
|
| 333 |
+
# Flag if any of these conditions are met:
|
| 334 |
+
# 1. Single source with high similarity (>85%)
|
| 335 |
+
# 2. Content plagiarized from 2+ different sources
|
| 336 |
+
# 3. 3+ matches with average >70%
|
| 337 |
+
flagged = (
|
| 338 |
+
highest >= 85 or
|
| 339 |
+
num_sources_with_matches >= 2 or
|
| 340 |
+
(len(matches) >= 3 and avg_match_score >= 70)
|
| 341 |
+
)
|
| 342 |
+
|
| 343 |
+
logger.info(f" π Results:")
|
| 344 |
+
logger.info(f" Highest similarity: {highest:.1f}%")
|
| 345 |
+
logger.info(f" Total matches: {len(matches)}")
|
| 346 |
+
logger.info(f" Sources with matches: {num_sources_with_matches}")
|
| 347 |
+
logger.info(f" Average match score: {avg_match_score:.1f}%")
|
| 348 |
+
logger.info(f" Flagged: {flagged}")
|
| 349 |
+
|
| 350 |
+
doc_results.append(LexicalDocResult(
|
| 351 |
+
id=idx,
|
| 352 |
+
name=f.filename,
|
| 353 |
+
author=None,
|
| 354 |
+
            similarity=round(highest, 1),
            flagged=flagged,
            wordCount=len(text.split()),
            matches=matches,
            content=text  # Include full document for frontend
        ))

    highest_any = max((d.similarity for d in doc_results), default=0.0)
    avg = round(sum(d.similarity for d in doc_results) / len(doc_results), 1) if doc_results else 0.0
    flagged_count = sum(1 for d in doc_results if d.flagged)

    elapsed = (datetime.utcnow() - t0).total_seconds()
    mm = int(elapsed // 60)
    ss = int(elapsed % 60)
    processing = f"{mm}m {ss:02d}s"

    logger.info(f"\n{'='*80}")
    logger.info(f"✅ ANALYSIS COMPLETE")
    logger.info(f"{'='*80}")
    logger.info(f"   Documents: {len(doc_results)}")
    logger.info(f"   Flagged: {flagged_count}")
    logger.info(f"   Highest: {highest_any}%")
    logger.info(f"   Average: {avg}%")
    logger.info(f"   Total Matches: {total_matches}")
    logger.info(f"   Total Time: {processing}\n")

    result = TeacherLexicalBatchReport(
        id="teacher_lexical_batch",
        name="Teacher Lexical Analysis",
        uploadDate=datetime.utcnow(),
        processingTime=processing,
        documents=doc_results,
        summary=TeacherLexicalSummary(
            totalDocuments=len(doc_results),
            flaggedDocuments=flagged_count,
            highestSimilarity=highest_any,
            averageSimilarity=avg,
            totalMatches=total_matches,
        ),
    )

    # Save to MongoDB
    try:
        mongo_client = await get_mongo_client()
        db = mongo_client.sluethink
        reports_collection = db.reports

        # Extract unique sources from all matches
        all_sources = set()
        for doc in doc_results:
            for match in doc.matches:
                all_sources.add(match.source_url)

        # Prepare document for MongoDB
        report_doc = {
            "name": f"Lexical_Batch_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}",
            "analysisType": "lexical",
            "submittedBy": current_user.get("username", "System"),
            "uploadDate": datetime.utcnow().strftime("%Y-%m-%d"),
            "similarity": highest_any,
            "status": "completed",
            "flagged": flagged_count > 0,
            "fileCount": len(doc_results),
            "processingTime": processing,
            "avgSimilarity": avg,
            "sources": list(all_sources),
            "createdAt": datetime.utcnow(),
            "userId": current_user.get("sub") or current_user.get("user_id"),
            # Store full analysis details
            "documents": [
                {
                    "id": doc.id,
                    "name": doc.name,
                    "similarity": doc.similarity,
                    "flagged": doc.flagged,
                    "wordCount": doc.wordCount,
                    "matchCount": len(doc.matches),
                    "matches": [
                        {
                            "matched_text": m.matched_text,
                            "similarity": m.similarity,
                            "source_url": m.source_url,
                            "source_title": m.source_title,
                            "source_type": m.source_type,
                        }
                        for m in doc.matches
                    ]
                }
                for doc in doc_results
            ],
            "summary": {
                "totalDocuments": result.summary.totalDocuments,
                "flaggedDocuments": result.summary.flaggedDocuments,
                "highestSimilarity": result.summary.highestSimilarity,
                "averageSimilarity": result.summary.averageSimilarity,
                "totalMatches": result.summary.totalMatches,
            }
        }

        # Insert into MongoDB
        insert_result = await reports_collection.insert_one(report_doc)
        logger.info(f"💾 Report saved to MongoDB with ID: {insert_result.inserted_id}")

        # Update the result with the MongoDB ID
        result.id = str(insert_result.inserted_id)

        mongo_client.close()

    except Exception as e:
        logger.error(f"❌ Error saving to MongoDB: {str(e)}")

    logger.info(f"\n🧾 Returning report:")
    logger.info(f"   Total Docs: {result.summary.totalDocuments}")
    logger.info(f"   Flagged Docs: {result.summary.flaggedDocuments}")
    logger.info(f"   Avg Similarity: {result.summary.averageSimilarity}%")
    logger.info(f"   Highest Similarity: {result.summary.highestSimilarity}%")
    logger.info(f"   Total Matches: {result.summary.totalMatches}\n")

    return result
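Note: the batch roll-up above is just three aggregations over the per-document results. A minimal sketch of the same math on hypothetical scores (not from a real run):

doc_similarities = [12.5, 88.0, 41.3]   # per-document highest similarity, in percent
doc_flags = [False, True, False]        # per-document flagged status

highest_any = max(doc_similarities, default=0.0)   # 88.0
avg = round(sum(doc_similarities) / len(doc_similarities), 1) if doc_similarities else 0.0   # 47.3
flagged_count = sum(1 for f in doc_flags if f)     # 1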
app/routers/teacher/semantic_analysis.py
ADDED
@@ -0,0 +1,406 @@
from fastapi import APIRouter, UploadFile, File, HTTPException, Depends
from typing import List
from datetime import datetime
from fastapi.security import OAuth2PasswordBearer
from jose import JWTError, jwt
from motor.motor_asyncio import AsyncIOMotorClient
import logging
import asyncio
import threading
from concurrent.futures import ThreadPoolExecutor

from app.config import MONGODB_URI, ALGORITHM, SECRET_KEY
from app.schemas.teacher_schemas import (
    TeacherLexicalBatchReport, TeacherLexicalSummary,
    LexicalDocResult, LexicalMatch
)
from app.utils.file_utils import extract_text_from_file, allowed_file
from app.utils.semantic_utils import (
    generate_five_queries,
    find_semantic_matches,
)
from app.utils.web_utils import fetch_sources_multi_query
from app.utils.ai_detector import detect_ai_similarity

router = APIRouter(prefix="/teacher", tags=["teacher-semantic"])

SEMANTIC_THRESHOLD = 0.50
SCRAPING_TIMEOUT = 180  # 3 minutes total for all queries combined

oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/token")
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("semantic_analysis")


def verify_token(token: str = Depends(oauth2_scheme)):
    try:
        return jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
    except JWTError:
        raise HTTPException(status_code=401, detail="Invalid or expired token")


async def get_mongo_client():
    return AsyncIOMotorClient(MONGODB_URI)


class ScrapingTimeoutManager:
    """Manages web scraping with hard 3-minute overall timeout"""

    def __init__(self, timeout_seconds: int = 180):
        self.timeout = timeout_seconds
        self.start_time = None
        self.sources = []
        self.executor = ThreadPoolExecutor(max_workers=4)
        self.lock = threading.Lock()
        self.cancelled = False

    def elapsed(self) -> float:
        """Get elapsed time in seconds"""
        if self.start_time is None:
            return 0.0
        return (datetime.utcnow() - self.start_time).total_seconds()

    def is_timeout(self) -> bool:
        """Check if 3-minute timeout exceeded"""
        return self.elapsed() >= self.timeout

    async def fetch_all_queries(self, queries: List[str], num_results: int = 5) -> List:
        """
        Fetch sources for all queries with hard 180-second overall timeout.
        Immediately stops and starts matching when timeout reached.
        """
        self.start_time = datetime.utcnow()
        self.sources = []

        logger.info(f"\n🌐 WEB SCRAPING PHASE")
        logger.info(f"   Max Duration: {self.timeout}s (3 minutes)")
        logger.info(f"   Queries: {len(queries)}")
        logger.info(f"   Starting: {self.start_time.strftime('%H:%M:%S')}")

        # Process all queries in parallel with timeout
        tasks = []
        for query_idx, query in enumerate(queries, 1):
            logger.info(f"\n   Query {query_idx}/{len(queries)}: {query[:60]}...")
            # Wrap each coroutine in a Task so the cancellation loop below can reach it
            tasks.append(asyncio.create_task(self._fetch_query(query, num_results)))

        try:
            # Wait for all tasks with overall timeout
            await asyncio.wait_for(
                asyncio.gather(*tasks, return_exceptions=True),
                timeout=self.timeout
            )
        except asyncio.TimeoutError:
            logger.warning(f"\n⏰ HARD TIMEOUT REACHED after {self.elapsed():.1f}s")
            logger.warning(f"   Cancelling all pending queries")
            self.cancelled = True
            # Cancel remaining tasks
            for task in tasks:
                if isinstance(task, asyncio.Task):
                    task.cancel()

        # Remove duplicates
        seen_urls = set()
        unique_sources = []
        for source in self.sources:
            url = source.get('url', '')
            if url and url not in seen_urls:
                seen_urls.add(url)
                unique_sources.append(source)

        elapsed = self.elapsed()
        logger.info(f"\n✅ SCRAPING PHASE STOPPED")
        logger.info(f"   Total Duration: {elapsed:.1f}s ({int(elapsed)//60}m {int(elapsed)%60}s)")
        logger.info(f"   Unique Sources: {len(unique_sources)}")
        logger.info(f"   Status: {'⏰ TIMEOUT' if self.is_timeout() else '✅ COMPLETED'}")

        return unique_sources

    async def _fetch_query(self, query: str, num_results: int = 5):
        """Fetch sources for a single query"""
        try:
            sources = await asyncio.to_thread(
                fetch_sources_multi_query,
                query,
                num_results
            )

            with self.lock:
                self.sources.extend(sources)

            logger.info(f"   ✅ Found {len(sources)} sources")

        except asyncio.CancelledError:
            logger.warning(f"   ✂️ Query cancelled (timeout)")
        except Exception as e:
            logger.error(f"   ❌ Error: {e}")

    def cleanup(self):
        """Clean up executor"""
        try:
            self.executor.shutdown(wait=False)
        except:
            pass


@router.post("/semantic-analysis", response_model=TeacherLexicalBatchReport)
async def teacher_semantic_analysis(
    files: List[UploadFile] = File(...),
    current_user=Depends(verify_token),
):
    if not files:
        raise HTTPException(status_code=400, detail="No files uploaded")

    t0 = datetime.utcnow()
    doc_results: List[LexicalDocResult] = []
    total_matches = 0

    logger.info(f"\n{'='*80}")
    logger.info(f"🧠 SEMANTIC ANALYSIS - {len(files)} file(s)")
    logger.info(f"{'='*80}")

    for idx, f in enumerate(files, start=1):
        if not allowed_file(f.filename):
            raise HTTPException(status_code=400, detail=f"Invalid file type: {f.filename}")

        raw = await f.read()
        text = extract_text_from_file(raw, f.filename) or ""

        logger.info(f"\n📄 File {idx}: {f.filename}")
        logger.info(f"   Words: {len(text.split())}")

        # ✅ AI DETECTION
        logger.info(f"🤖 Running AI detection...")
        ai_similarity = detect_ai_similarity(text)
        logger.info(f"   AI Similarity: {ai_similarity}")

        # Generate semantic search queries
        queries = generate_five_queries(text)

        # ✅ WEB SCRAPING WITH 3-MINUTE HARD TIMEOUT (OVERALL)
        scraper = ScrapingTimeoutManager(timeout_seconds=SCRAPING_TIMEOUT)
        try:
            unique_sources = await scraper.fetch_all_queries(queries, num_results=5)
        finally:
            scraper.cleanup()

        logger.info(f"   Total unique sources: {len(unique_sources)}")

        if not unique_sources:
            logger.warning(f"   ⚠️ No sources found, skipping semantic matching")
            doc_results.append(LexicalDocResult(
                id=idx,
                name=f.filename,
                author=None,
                similarity=0.0,
                flagged=False,
                wordCount=len(text.split()),
                matches=[],
                content=text[:5000],
                ai_similarity=ai_similarity
            ))
            continue

        # ✅ MATCHING PHASE (starts immediately after timeout)
        logger.info(f"\n🔍 SEMANTIC MATCHING PHASE")
        logger.info(f"   Comparing against {len(unique_sources)} sources...")

        matches: List[LexicalMatch] = []
        highest = 0.0
        source_matches_count = {}

        # Prepare externals
        externals = [
            {
                "title": s.get("url", "Unknown"),
                "text": s.get("content", ""),
                "source_url": s.get("url", ""),
                "type": "web",
            }
            for s in unique_sources if s.get("content")
        ]

        for ext_idx, ext in enumerate(externals, 1):
            logger.info(f"   Source {ext_idx}/{len(externals)}: {ext['source_url'][:60]}...")
            source_matches_count[ext['source_url']] = 0

            try:
                # Semantic comparison
                semantic_matches = find_semantic_matches(
                    text,
                    ext["text"],
                    threshold=SEMANTIC_THRESHOLD
                )

                logger.info(f"      Found {len(semantic_matches)} semantic matches")

                for match in semantic_matches:
                    similarity_pct = round(match['similarity'] * 100, 1)

                    matches.append(LexicalMatch(
                        matched_text=match['doc_text'][:300],
                        similarity=similarity_pct,
                        source_type=ext["type"],
                        source_title=ext["title"],
                        source_url=ext["source_url"],
                        section=None,
                        context="Semantic similarity detected (possible paraphrasing)",
                    ))

                    source_matches_count[ext['source_url']] += 1
                    highest = max(highest, similarity_pct)
                    total_matches += 1

                    logger.debug(f"      Match: {similarity_pct}% - {match['doc_text'][:50]}...")

            except Exception as e:
                logger.error(f"      Error matching source: {e}")
                continue

        # Deduplicate matches
        logger.info(f"   🔄 Deduplicating {len(matches)} matches...")
        unique_matches_dict = {}

        for match in matches:
            key = match.matched_text.lower().strip()
            if key not in unique_matches_dict or match.similarity > unique_matches_dict[key].similarity:
                unique_matches_dict[key] = match

        matches = list(unique_matches_dict.values())
        logger.info(f"   ✅ Deduplicated to {len(matches)} unique matches")

        # Recalculate metrics
        highest = max((m.similarity for m in matches), default=0.0)
        source_matches_count = {}
        for match in matches:
            source_matches_count[match.source_url] = source_matches_count.get(match.source_url, 0) + 1

        # Flagging logic
        num_sources_with_matches = sum(1 for c in source_matches_count.values() if c > 0)
        avg_match_score = (sum(m.similarity for m in matches) / len(matches)) if matches else 0.0

        flagged = (
            highest >= 80 or
            num_sources_with_matches >= 2 or
            (len(matches) >= 2 and avg_match_score >= 70)
        )

        logger.info(f"   📊 Results:")
        logger.info(f"      Highest: {highest:.1f}%")
        logger.info(f"      Total matches: {len(matches)}")
        logger.info(f"      Sources with matches: {num_sources_with_matches}")
        logger.info(f"      Average: {avg_match_score:.1f}%")
        logger.info(f"      Flagged: {flagged}")

        doc_results.append(LexicalDocResult(
            id=idx,
            name=f.filename,
            author=None,
            similarity=round(highest, 1),
            flagged=flagged,
            wordCount=len(text.split()),
            matches=matches,
            content=text[:5000],
            ai_similarity=ai_similarity
        ))

    # Final summary
    highest_any = max((d.similarity for d in doc_results), default=0.0)
    avg = round(sum(d.similarity for d in doc_results) / len(doc_results), 1) if doc_results else 0.0
    flagged_count = sum(1 for d in doc_results if d.flagged)
    avg_ai_similarity = round(sum(d.ai_similarity for d in doc_results) / len(doc_results), 3) if doc_results else 0.0

    elapsed = (datetime.utcnow() - t0).total_seconds()
    mm = int(elapsed // 60)
    ss = int(elapsed % 60)
    processing = f"{mm}m {ss:02d}s"

    logger.info(f"\n{'='*80}")
    logger.info(f"✅ ANALYSIS COMPLETE")
    logger.info(f"{'='*80}")
    logger.info(f"   Documents: {len(doc_results)}")
    logger.info(f"   Flagged: {flagged_count}")
    logger.info(f"   Highest Semantic Similarity: {highest_any}%")
    logger.info(f"   Average Semantic Similarity: {avg}%")
    logger.info(f"   Average AI Similarity: {avg_ai_similarity}")
    logger.info(f"   Total Matches: {total_matches}")
    logger.info(f"   Total Time: {processing}\n")

    result = TeacherLexicalBatchReport(
        id="teacher_semantic_batch",
        name="Teacher Semantic Analysis",
        uploadDate=datetime.utcnow(),
        processingTime=processing,
        documents=doc_results,
        summary=TeacherLexicalSummary(
            totalDocuments=len(doc_results),
            flaggedDocuments=flagged_count,
            highestSimilarity=highest_any,
            averageSimilarity=avg,
            totalMatches=total_matches,
            averageAiSimilarity=avg_ai_similarity,
        ),
    )

    # Save to MongoDB
    try:
        mongo_client = await get_mongo_client()
        db = mongo_client.sluethink
        reports_collection = db.reports

        all_sources = set()
        for doc in doc_results:
            for match in doc.matches:
                all_sources.add(match.source_url)

        report_doc = {
            "name": f"Semantic_Batch_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}",
            "analysisType": "semantic",
            "submittedBy": current_user.get("username", "System"),
            "uploadDate": datetime.utcnow().strftime("%Y-%m-%d"),
            "similarity": highest_any,
            "aiSimilarity": avg_ai_similarity,
            "status": "completed",
            "flagged": flagged_count > 0,
            "fileCount": len(doc_results),
            "processingTime": processing,
            "avgSimilarity": avg,
            "sources": list(all_sources),
            "createdAt": datetime.utcnow(),
            "userId": current_user.get("sub") or current_user.get("user_id"),
            "documents": [
                {
                    "id": doc.id,
                    "name": doc.name,
                    "similarity": doc.similarity,
                    "aiSimilarity": doc.ai_similarity,
                    "flagged": doc.flagged,
                    "wordCount": doc.wordCount,
                    "matchCount": len(doc.matches),
                    "matches": [
                        {
                            "matched_text": m.matched_text,
                            "similarity": m.similarity,
                            "source_url": m.source_url,
                            "source_title": m.source_title,
                            "source_type": m.source_type,
                        }
                        for m in doc.matches
                    ]
                }
                for doc in doc_results
            ],
            "summary": {
                "totalDocuments": result.summary.totalDocuments,
                "flaggedDocuments": result.summary.flaggedDocuments,
                "highestSimilarity": result.summary.highestSimilarity,
                "averageSimilarity": result.summary.averageSimilarity,
                "averageAiSimilarity": result.summary.averageAiSimilarity,
                "totalMatches": result.summary.totalMatches,
            }
        }

        insert_result = await reports_collection.insert_one(report_doc)
        logger.info(f"💾 Saved to MongoDB: {insert_result.inserted_id}")
        result.id = str(insert_result.inserted_id)
        mongo_client.close()

    except Exception as e:
        logger.error(f"❌ MongoDB error: {str(e)}")

    return result
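Note: the hard-timeout behavior in ScrapingTimeoutManager comes down to one asyncio pattern: wrap the gathered fetches in asyncio.wait_for and treat TimeoutError as the cue to continue with whatever arrived in time. A stripped-down, runnable sketch of just that pattern (fake_fetch and the delays are stand-ins, not the real scrapers):

import asyncio

async def fake_fetch(delay: float, out: list):
    # Stand-in for one query's scrape; results accumulate as they arrive.
    await asyncio.sleep(delay)
    out.append(f"source after {delay}s")

async def fetch_with_hard_timeout(timeout: float = 1.0) -> list:
    out: list = []
    tasks = [asyncio.create_task(fake_fetch(d, out)) for d in (0.2, 0.5, 5.0)]
    try:
        await asyncio.wait_for(asyncio.gather(*tasks), timeout=timeout)
    except asyncio.TimeoutError:
        pass  # wait_for has already cancelled the gather; keep partial results
    return out

print(asyncio.run(fetch_with_hard_timeout()))  # the 5.0s fetch never lands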
app/schemas/__pycache__/plagiarism_schemas.cpython-312.pyc
ADDED
Binary file (1.21 kB)

app/schemas/__pycache__/report_schemas.cpython-312.pyc
ADDED
Binary file (1.07 kB)

app/schemas/__pycache__/schemas.cpython-312.pyc
ADDED
Binary file (2.16 kB)

app/schemas/__pycache__/sources_schemas.cpython-312.pyc
ADDED
Binary file (526 Bytes)

app/schemas/__pycache__/teacher_schemas.cpython-312.pyc
ADDED
Binary file (8.09 kB)
app/schemas/plagiarism_schemas.py
ADDED
@@ -0,0 +1,22 @@
from pydantic import BaseModel
from typing import List

class MatchDetail(BaseModel):
    matched_text: str
    similarity: float
    source_type: str  # "news" or "academic"
    source_title: str
    source_url: str


class SentenceResult(BaseModel):
    original_sentence: str
    normalized_sentence: str
    match_type: str  # "full_sentence", "partial_phrase", "no_match"
    matches: List[MatchDetail]


class PlagiarismResponse(BaseModel):
    checked_sentences: int
    checked_sources: int
    results: List[SentenceResult]
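Note: for reference, how these three models nest (values illustrative only):

detail = MatchDetail(
    matched_text="climate change accelerates ice loss",
    similarity=0.91,
    source_type="news",
    source_title="Example Times",
    source_url="https://example.com/article",
)
sentence = SentenceResult(
    original_sentence="Climate change accelerates ice loss.",
    normalized_sentence="climate change accelerates ice loss",
    match_type="full_sentence",
    matches=[detail],
)
report = PlagiarismResponse(checked_sentences=1, checked_sources=1, results=[sentence])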
app/schemas/report_schemas.py
ADDED
@@ -0,0 +1,21 @@
from pydantic import BaseModel
from typing import List
from datetime import datetime
from app.schemas.plagiarism_schemas import MatchDetail

class ReportSummary(BaseModel):
    id: str  # MongoDB's ObjectId as string
    name: str
    date: datetime
    similarity: float  # highest sentence-level similarity (0-100)
    sources: List[str]  # unique list of source titles used
    word_count: int
    time_spent: str  # e.g. "00:00"
    flagged: bool  # true if similarity > 70%


class ReportDetail(BaseModel):
    id: str
    name: str
    content: str
    plagiarism_data: List[MatchDetail]
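Note: a summary row as the field comments describe it, with hypothetical values (flagged follows the similarity > 70% convention noted above):

from datetime import datetime

summary = ReportSummary(
    id="64f1c0ffee0000000000abcd",  # hypothetical ObjectId string
    name="essay_draft.docx",
    date=datetime.utcnow(),
    similarity=82.4,
    sources=["Example Times", "Sample Journal"],
    word_count=480,
    time_spent="01:12",
    flagged=82.4 > 70,  # True
)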
app/schemas/sources_schemas.py
ADDED
@@ -0,0 +1,9 @@
from pydantic import BaseModel


class SourceData(BaseModel):
    id: str  # MongoDB ObjectId as string
    title: str
    text: str
    source_url: str
    type: str
app/schemas/teacher_schemas.py
ADDED
@@ -0,0 +1,185 @@
from pydantic import BaseModel, Field
from typing import List, Optional
from datetime import datetime

# ---- Shared ----

class DocumentInfo(BaseModel):
    id: int
    name: str
    author: Optional[str] = None

class OverlapDetail(BaseModel):
    # For lexical/internal
    fromDoc: str
    toDoc: str
    text: str
    similarity: float  # percent (0-100)
    sectionA: Optional[str] = None
    sectionB: Optional[str] = None
    context: Optional[str] = None

class ComparisonDetail(BaseModel):
    id: str  # "i-j"
    docA: str
    docB: str
    similarity: float  # percent (0-100)
    flagged: bool
    overlaps: List[OverlapDetail] = Field(default_factory=list)
    contentA: str = ""
    contentB: str = ""

class InternalReportSummary(BaseModel):
    totalDocuments: int
    totalComparisons: int
    flaggedComparisons: int
    highestSimilarity: float
    averageSimilarity: Optional[float] = None

class InternalReportDetail(BaseModel):
    id: str
    name: str
    analysisType: str = "internal"
    uploadDate: datetime
    processingTime: str
    status: str = "completed"
    documents: List[DocumentInfo]
    comparisons: List[ComparisonDetail]
    summary: InternalReportSummary


class LexicalMatch(BaseModel):
    matched_text: str
    similarity: float
    source_type: str
    source_title: str
    source_url: str
    section: Optional[str] = None
    context: Optional[str] = None

class LexicalDocResult(BaseModel):
    id: int
    name: str
    author: Optional[str] = None
    similarity: float  # overall percent
    flagged: bool
    wordCount: Optional[int] = None
    matches: List[LexicalMatch] = Field(default_factory=list)
    content: Optional[str] = None
    ai_similarity: float = 0.0

class TeacherLexicalSummary(BaseModel):
    totalDocuments: int
    flaggedDocuments: int
    highestSimilarity: float
    averageSimilarity: Optional[float] = None
    totalMatches: int
    averageAiSimilarity: float = 0.0

class TeacherLexicalBatchReport(BaseModel):
    id: str
    name: str
    analysisType: str = "lexical"
    uploadDate: datetime
    processingTime: str
    status: str = "completed"
    documents: List[LexicalDocResult]
    summary: TeacherLexicalSummary

# ---- Teacher Semantic (internal/external) ----

class SemanticOverlap(BaseModel):
    textA: str
    textB: str
    cosine: float  # 0-1
    cosine_pct: float  # 0-100
    sectionA: Optional[str] = None
    sectionB: Optional[str] = None
    confidence: str  # "high" | "medium" | "low"

class SemanticComparison(BaseModel):
    id: str
    docA: str
    docB: str
    similarity: float  # aggregated cosine percent
    flagged: bool
    overlaps: List[SemanticOverlap] = Field(default_factory=list)

class TeacherSemanticReport(BaseModel):
    id: str
    name: str
    analysisType: str = "semantic"
    mode: str  # "internal" | "external"
    uploadDate: datetime
    processingTime: str
    status: str = "completed"
    documents: List[DocumentInfo]
    comparisons: List[SemanticComparison]
    summary: InternalReportSummary
    narrative: Optional[str] = None


class CodeMatch(BaseModel):
    matched_code: str
    similarity: float
    source_type: str  # 'peer', 'github', 'stackoverflow', 'web'
    source_title: str
    source_url: Optional[str] = None
    match_type: str  # 'exact', 'structural', 'token_sequence'
    line_start: Optional[int] = None
    line_end: Optional[int] = None
    context: Optional[str] = None

class CodeFunction(BaseModel):
    name: str
    start_line: int
    end_line: int
    code: str
    complexity: int  # Cyclomatic complexity
    tokens: List[str]
    ast_hash: str

class CodeDocResult(BaseModel):
    id: int
    name: str
    author: Optional[str] = None
    similarity: float
    flagged: bool
    lineCount: int
    functionCount: int
    matches: List[CodeMatch]
    functions: List[CodeFunction]
    content: str  # Full code content
    language: str

class CodeAnalysisSummary(BaseModel):
    totalDocuments: int
    flaggedDocuments: int
    highestSimilarity: float
    averageSimilarity: float
    totalMatches: int
    peerMatches: int
    externalMatches: int

class TeacherCodeBatchReport(BaseModel):
    id: str
    name: str
    uploadDate: datetime
    processingTime: str
    documents: List[CodeDocResult]
    summary: CodeAnalysisSummary
    assignmentTopic: Optional[str] = None

class InternalMatch(BaseModel):
    """Match between two student submissions"""
    student1_id: int
    student2_id: int
    student1_name: str
    student2_name: str
    similarity: float
    match_type: str
    matched_functions: List[str]
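Note: a minimal hand-built lexical batch showing how the pieces fit together (all values hypothetical):

from datetime import datetime

doc = LexicalDocResult(
    id=1,
    name="essay.txt",
    similarity=76.5,
    flagged=True,
    wordCount=412,
    matches=[LexicalMatch(
        matched_text="a verbatim overlapping passage",
        similarity=76.5,
        source_type="web",
        source_title="https://example.com/page",
        source_url="https://example.com/page",
    )],
    ai_similarity=0.12,
)
report = TeacherLexicalBatchReport(
    id="demo",
    name="Demo Batch",
    uploadDate=datetime.utcnow(),
    processingTime="0m 03s",
    documents=[doc],
    summary=TeacherLexicalSummary(
        totalDocuments=1,
        flaggedDocuments=1,
        highestSimilarity=76.5,
        averageSimilarity=76.5,
        totalMatches=1,
        averageAiSimilarity=0.12,
    ),
)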
app/utils/__pycache__/ai_detector.cpython-312.pyc
ADDED
Binary file (3.22 kB)

app/utils/__pycache__/code_comparism.cpython-312.pyc
ADDED
Binary file (12 kB)

app/utils/__pycache__/code_detection.cpython-312.pyc
ADDED
Binary file (13.5 kB)

app/utils/__pycache__/code_plagiarism_integration.cpython-312.pyc
ADDED
Binary file (10.6 kB)

app/utils/__pycache__/code_utils.cpython-312.pyc
ADDED
Binary file (22.8 kB)

app/utils/__pycache__/file_utils.cpython-312.pyc
ADDED
Binary file (2.42 kB)

app/utils/__pycache__/lexical_utils.cpython-312.pyc
ADDED
Binary file (13.9 kB)

app/utils/__pycache__/semantic_utils.cpython-312.pyc
ADDED
Binary file (15 kB)

app/utils/__pycache__/text_utils.cpython-312.pyc
ADDED
Binary file (10.8 kB)

app/utils/__pycache__/web_utils.cpython-312.pyc
ADDED
Binary file (27.2 kB)
app/utils/ai_detector.py
ADDED
@@ -0,0 +1,80 @@
"""
AI detection using HuggingFace fakespot-ai/roberta-base-ai-text-detection-v1
Fast, accurate ML model. Returns AI similarity score (0-1).
"""
import logging
from huggingface_hub import InferenceClient
from app.config import HF_TOKEN

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ai_detector")


# Initialize HF client
try:
    hf_client = InferenceClient(api_key=HF_TOKEN)
    logger.info("✅ HuggingFace client initialized")
except Exception as e:
    logger.error(f"❌ HF_TOKEN not available: {e}")
    hf_client = None


def detect_ai_similarity(text: str) -> float:
    """
    Detect AI-generated text using HuggingFace model.
    Fast, accurate ML-based detection.

    Returns:
        float: 0.0-1.0 where 1.0 = definitely AI, 0.0 = definitely human
    """
    try:
        logger.info("Starting AI detection (HuggingFace model)...")

        if not text or len(text) < 20:
            logger.warning("Text too short for detection")
            return 0.0

        if hf_client is None:
            logger.error("HuggingFace client not initialized")
            return 0.0

        # Truncate to avoid token limits
        text = text[:2000]

        logger.info("Sending request to HuggingFace...")
        result = hf_client.text_classification(
            text,
            model="fakespot-ai/roberta-base-ai-text-detection-v1",
        )

        logger.info(f"API response: {result}")

        # Extract top result
        if not result or len(result) == 0:
            logger.error("No result from API")
            return 0.0

        top_result = result[0]
        label = top_result.get('label', '').upper()
        confidence = top_result.get('score', 0.5)

        logger.info(f"Classification: {label} (confidence: {confidence:.4f})")

        # Map to 0-1 score
        if label == 'AI':
            ai_score = confidence
        elif label == 'HUMAN':
            ai_score = 1.0 - confidence
        else:
            ai_score = 0.5

        ai_score = min(max(ai_score, 0.0), 1.0)  # Clamp to 0-1
        logger.info(f"✅ AI detection complete: {ai_score:.3f}")

        return round(ai_score, 3)

    except Exception as e:
        logger.error(f"❌ AI detection error: {str(e)}", exc_info=True)
        return 0.0
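Note: the label-to-score mapping is the core of this module and is symmetric around 0.5. The same step isolated, with made-up classifier outputs (not real API responses):

def map_to_ai_score(label: str, confidence: float) -> float:
    # Same mapping as detect_ai_similarity: 'AI' keeps the confidence,
    # 'HUMAN' inverts it, anything else is treated as undecided.
    label = label.upper()
    if label == 'AI':
        score = confidence
    elif label == 'HUMAN':
        score = 1.0 - confidence
    else:
        score = 0.5
    return round(min(max(score, 0.0), 1.0), 3)

print(map_to_ai_score('AI', 0.97))     # 0.97
print(map_to_ai_score('HUMAN', 0.97))  # 0.03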
app/utils/file_utils.py
ADDED
@@ -0,0 +1,36 @@
import os
import tempfile
from pdfminer.high_level import extract_text as extract_pdf_text
from docx import Document as DocxDocument
from app.config import ALLOWED_EXTENSIONS


def allowed_file(filename: str) -> bool:
    return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS


def extract_text_from_file(content_bytes: bytes, filename: str, max_words: int = 500) -> str:
    ext = filename.rsplit(".", 1)[1].lower()
    text = ""
    try:
        if ext == "txt":
            text = content_bytes.decode("utf-8", errors="ignore")
        elif ext == "pdf":
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                tmp.write(content_bytes)
                tmp.flush()
                text = extract_pdf_text(tmp.name)
            os.unlink(tmp.name)  # remove the temp file once parsed
        elif ext == "docx":
            with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp:
                tmp.write(content_bytes)
                tmp.flush()
                doc = DocxDocument(tmp.name)
                text = "\n".join([p.text for p in doc.paragraphs])
            os.unlink(tmp.name)  # remove the temp file once parsed
    except Exception:
        text = ""

    # Word count check
    word_count = len(text.split())
    if word_count > max_words:
        raise ValueError(f"File exceeds {max_words} words (found {word_count}).")

    return text
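Note: usage for the plain-text path, as a quick sketch (the ValueError fires when the 500-word default cap is exceeded):

sample = b"A short plain-text submission."
try:
    text = extract_text_from_file(sample, "submission.txt")
    print(len(text.split()), "words extracted")
except ValueError as e:
    print("Rejected:", e)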
app/utils/lexical_utils.py
ADDED
@@ -0,0 +1,293 @@
import re
from typing import List, Optional, Tuple, Set
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from rapidfuzz.distance import Levenshtein

from app.config import (
    MIN_WORDS_PER_SENTENCE,
    MIN_SENTENCE_LENGTH,
    SEQUENCE_THRESHOLD,
    EXACT_MATCH_SCORE,
)

nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download('punkt_tab')

lemmatizer = WordNetLemmatizer()

def normalize_text(text: str) -> str:
    if not text:
        return ""
    text = text.lower()
    text = text.replace("\u00ad", "")  # soft hyphen
    text = re.sub(r"-\s*\n\s*", "", text)
    text = text.replace("\n", " ")
    # Normalize curly quotes/dashes before stripping non-ASCII, otherwise
    # these replacements would never see the characters they target.
    text = (text.replace("\u2019", "'").replace("\u2018", "'")
            .replace("\u201c", '"').replace("\u201d", '"')
            .replace("\u2013", "-").replace("\u2014", "-"))
    text = re.sub(r"[^\x20-\x7E]+", " ", text)
    text = re.sub(r"[^\w\s-]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(tok) for tok in tokens]
    normalized = " ".join(lemmas)
    normalized = re.sub(r"\s+", " ", normalized).strip()
    return normalized

def get_meaningful_sentences(text: str) -> List[str]:
    """Extract meaningful sentences from text."""
    sentences = sent_tokenize(text or "")
    filtered = []
    for s in sentences:
        words = word_tokenize(s)
        if len(words) >= MIN_WORDS_PER_SENTENCE and len(s.strip()) >= MIN_SENTENCE_LENGTH:
            filtered.append(s.strip())
    return filtered

def extract_keywords(text: str, max_keywords: int = 5) -> List[str]:
    """Extract top keywords from text for search queries."""
    words = word_tokenize((text or "").lower())
    stop_words = set(stopwords.words("english"))
    filtered = [w for w in words if w.isalpha() and w not in stop_words and len(w) > 3]
    freq = nltk.FreqDist(filtered)
    return [word for word, _ in freq.most_common(max_keywords)]

def _word_shingles(norm_text: str, k: int = 7) -> List[str]:
    tokens = norm_text.split()
    if len(tokens) < k:
        return []
    return [" ".join(tokens[i:i+k]) for i in range(len(tokens) - k + 1)]

def _shingle_sets(a_norm: str, b_norm: str, k: int = 7) -> Tuple[Set[str], Set[str]]:
    return set(_word_shingles(a_norm, k)), set(_word_shingles(b_norm, k))

def _jaccard(a: Set[str], b: Set[str]) -> float:
    if not a and not b:
        return 0.0
    inter = len(a & b)
    union = len(a | b) or 1
    return inter / union

def _containment(a: Set[str], b: Set[str]) -> float:
    if not a:
        return 0.0
    inter = len(a & b)
    return inter / len(a)

def _winnowing_hashes(norm_text: str, k: int = 7, w: int = 4) -> List[Tuple[int, int]]:
    tokens = norm_text.split()
    if len(tokens) < k:
        return []
    shingles = [" ".join(tokens[i:i+k]) for i in range(len(tokens) - k + 1)]
    hashes = [(hash(s) & 0xFFFFFFFF, i) for i, s in enumerate(shingles)]

    if w <= 1 or len(hashes) <= w:
        return list(dict.fromkeys(hashes))

    fps: List[Tuple[int, int]] = []
    last_min_abs = -1
    for i in range(0, len(hashes) - w + 1):
        window = hashes[i:i+w]
        min_hash, min_idx = None, None
        for j, (h, _) in enumerate(window):
            if (min_hash is None) or (h < min_hash) or (h == min_hash and j > (min_idx or -1)):
                min_hash, min_idx = h, j
        abs_idx = i + (min_idx or 0)
        if abs_idx != last_min_abs:
            fps.append(hashes[abs_idx])
            last_min_abs = abs_idx
    return list(dict.fromkeys(fps))

def _winnowing_overlap(a_fp: List[Tuple[int, int]], b_fp: List[Tuple[int, int]]) -> float:
    # Compare fingerprint hashes only; positions differ across documents,
    # so matching on (hash, index) pairs would almost never overlap.
    a_set = {h for h, _ in a_fp}
    b_set = {h for h, _ in b_fp}
    if not a_set or not b_set:
        return 0.0
    shared = len(a_set & b_set)
    denom = min(len(a_set), len(b_set)) or 1
    return shared / denom

def _exact_substring(norm_sentence: str, norm_external: str) -> bool:
    if not norm_sentence or not norm_external:
        return False
    return norm_external.find(norm_sentence) != -1

def _levenshtein_sim(a: str, b: str) -> float:
    if not a or not b:
        return 0.0
    return Levenshtein.normalized_similarity(a, b)

def _lcs_length(a: str, b: str) -> int:
    if not a or not b:
        return 0
    prev = [0] * (len(b) + 1)
    best = 0
    for i in range(1, len(a) + 1):
        curr = [0]
        ai = a[i-1]
        for j in range(1, len(b) + 1):
            if ai == b[j-1]:
                v = prev[j-1] + 1
                curr.append(v)
                if v > best:
                    best = v
            else:
                curr.append(0)
        prev = curr
    return best

def _extract_phrases(norm_text: str, min_words: int = 3, max_words: int = 7) -> List[str]:
    """Extract all phrases of varying lengths for partial match detection."""
    tokens = norm_text.split()
    phrases = []
    for k in range(min_words, min(max_words + 1, len(tokens) + 1)):
        for i in range(len(tokens) - k + 1):
            phrases.append(" ".join(tokens[i:i+k]))
    return phrases

def _phrase_containment_match(norm_sentence: str, norm_external: str) -> Optional[Tuple[str, float]]:
    """Check if ANY phrase (3-7 words) from sentence appears in external text."""
    phrases = _extract_phrases(norm_sentence, min_words=3, max_words=7)

    best_phrase = None
    best_score = 0.0

    for phrase in phrases:
        if phrase in norm_external:
            score = len(phrase.split()) / len(norm_sentence.split())
            if score > best_score:
                best_score = score
                best_phrase = phrase

    if best_phrase and best_score >= 0.4:
        return best_phrase, round(best_score, 3)

    return None

def find_exact_matches(sentence: str, external_text: str) -> Optional[float]:
    """
    Full-sentence lexical match with multiple strategies:
    1) Exact substring match -> EXACT_MATCH_SCORE (100%)
    2) Winnowing fingerprints -> robust plagiarism detection
    3) Edit-distance (Levenshtein) -> catches paraphrased content
    4) LCS (longest common substring) -> catches missed overlaps
    """
    norm_s = normalize_text(sentence)
    norm_e = normalize_text(external_text)

    if len(norm_s) < MIN_SENTENCE_LENGTH:
        return None

    # 1) Exact full-sentence match
    if _exact_substring(norm_s, norm_e):
        return EXACT_MATCH_SCORE

    # 2) Winnowing overlap
    win_sim = _winnowing_overlap(
        _winnowing_hashes(norm_s, k=5, w=4),
        _winnowing_hashes(norm_e, k=5, w=4),
    )
    if win_sim >= SEQUENCE_THRESHOLD * 0.9:
        return round(win_sim, 3)

    # 3) Edit-distance fallback
    lev = _levenshtein_sim(norm_s, norm_e)
    if lev >= SEQUENCE_THRESHOLD:
        return round(lev, 3)

    # 4) LCS fallback
    lcs_len = _lcs_length(norm_s, norm_e)
    LCS_MIN_CHARS = 15
    if lcs_len >= LCS_MIN_CHARS:
        return round(lcs_len / max(1, len(norm_s)), 3)

    return None


def find_partial_phrase_match(sentence: str, external_text: str) -> Optional[Tuple[str, float]]:
    """
    Partial reuse detection via multiple strategies:
    1) Phrase containment (3-7 word phrases)
    2) Shingle-based Jaccard/Containment
    3) Edit distance as fallback
    """
    norm_s = normalize_text(sentence)
    norm_e = normalize_text(external_text)

    if not norm_s or not norm_e:
        return None

    # Strategy 1: Check if any 3-7 word phrase appears verbatim
    phrase_match = _phrase_containment_match(norm_s, norm_e)
    if phrase_match:
        return sentence, phrase_match[1]

    # Strategy 2: 7-word shingles
    A, B = _shingle_sets(norm_s, norm_e, k=7)
    if A and B:
        jac = _jaccard(A, B)
        con = _containment(A, B)
        score = max(jac, con)
        if score >= SEQUENCE_THRESHOLD * 0.8:
            return sentence, round(score, 3)

    # Strategy 3: Smaller shingles (5-word) for more matches
    A_small, B_small = _shingle_sets(norm_s, norm_e, k=5)
    if A_small and B_small:
        jac_small = _jaccard(A_small, B_small)
        con_small = _containment(A_small, B_small)
        score_small = max(jac_small, con_small)
        if score_small >= SEQUENCE_THRESHOLD * 0.75:
            return sentence, round(score_small, 3)

    # Strategy 4: Edit distance as last resort
    lev = _levenshtein_sim(norm_s, norm_e)
    if lev >= SEQUENCE_THRESHOLD * 0.85:
        return sentence, round(lev, 3)

    return None

def find_partial_phrase_match_for_internal(sentence: str, external_text: str) -> Optional[Tuple[str, float]]:
    """
    Wrapper around find_partial_phrase_match that extracts the actual matched phrase
    that appears in BOTH documents, not just the full sentence from the first doc.
    """
    result = find_partial_phrase_match(sentence, external_text)
    if not result:
        return None

    matched_text, score = result
    norm_s = normalize_text(sentence)
    norm_e = normalize_text(external_text)

    if not norm_s or not norm_e:
        return result

    # Find the longest common word sequence that appears in both
    words_s = norm_s.split()

    best_common = ""
    best_len = 0

    # Try all consecutive word sequences from sentence A
    for i in range(len(words_s)):
        for j in range(i + 1, len(words_s) + 1):
            phrase = ' '.join(words_s[i:j])
            # Check if this phrase exists in document B
            if phrase in norm_e:
                phrase_len = len(phrase)
                if phrase_len > best_len:
                    best_common = phrase
                    best_len = phrase_len

    # If we found a common phrase, return it
    if best_common:
        return best_common, score

    # Fallback: return original result
    return matched_text, score
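Note: a small worked example of the shingle metrics, using the helpers above (sentences invented; exact values depend on tokenization and lemmatization):

a = normalize_text("The quick brown fox jumps over the lazy dog near the river bank")
b = normalize_text("A quick brown fox jumps over the lazy dog near the river bank today")

A, B = _shingle_sets(a, b, k=5)
print(_jaccard(A, B))      # fraction of all 5-word shingles that are shared
print(_containment(A, B))  # fraction of A's shingles that also occur in B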
app/utils/semantic_utils.py
ADDED
@@ -0,0 +1,342 @@
| 1 |
+
"""
|
| 2 |
+
Semantic analysis with MiniLM embeddings for paraphrasing detection.
|
| 3 |
+
Detects even heavily paraphrased content from LLMs like ChatGPT.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import re
|
| 7 |
+
from typing import List, Dict
|
| 8 |
+
import logging
|
| 9 |
+
import numpy as np
|
| 10 |
+
|
| 11 |
+
logging.basicConfig(level=logging.INFO)
|
| 12 |
+
logger = logging.getLogger("semantic_utils")
|
| 13 |
+
|
| 14 |
+
# Lazy load model to avoid multiple loads
|
| 15 |
+
_encoder_model = None
|
| 16 |
+
|
| 17 |
+
def get_encoder():
|
| 18 |
+
"""Lazy load SentenceTransformer MiniLM model"""
|
| 19 |
+
global _encoder_model
|
| 20 |
+
if _encoder_model is None:
|
| 21 |
+
logger.info("π Loading SentenceTransformer MiniLM-L6-v2 model...")
|
| 22 |
+
try:
|
| 23 |
+
from sentence_transformers import SentenceTransformer
|
| 24 |
+
_encoder_model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 25 |
+
logger.info("β
Model loaded successfully")
|
| 26 |
+
except ImportError:
|
| 27 |
+
logger.error("β sentence-transformers not installed. Install with: pip install sentence-transformers")
|
| 28 |
+
raise
|
| 29 |
+
return _encoder_model
|
| 30 |
+
|
| 31 |
+
def split_into_sentences(text: str) -> List[str]:
|
| 32 |
+
"""Split text into sentences, filtering out junk"""
|
| 33 |
+
sentences = re.split(r'(?<=[.!?])\s+', text)
|
| 34 |
+
|
| 35 |
+
cleaned = []
|
| 36 |
+
for sent in sentences:
|
| 37 |
+
sent = sent.strip()
|
| 38 |
+
if len(sent) > 20 and len(sent.split()) >= 5:
|
| 39 |
+
cleaned.append(sent)
|
| 40 |
+
|
| 41 |
+
return cleaned
|
| 42 |
+
|
| 43 |
+
def extract_key_sentences(text: str, num_sentences: int = 5) -> List[str]:
|
| 44 |
+
"""
|
| 45 |
+
Extract the most important sentences from text based on:
|
| 46 |
+
- Position (first and last sentences often important)
|
| 47 |
+
- Length (15-30 words is optimal)
|
| 48 |
+
- Keyword frequency
|
| 49 |
+
"""
|
| 50 |
+
sentences = split_into_sentences(text)
|
| 51 |
+
if len(sentences) <= num_sentences:
|
| 52 |
+
return sentences
|
| 53 |
+
|
| 54 |
+
scored_sentences = []
|
| 55 |
+
|
| 56 |
+
for idx, sent in enumerate(sentences):
|
| 57 |
+
score = 0.0
|
| 58 |
+
word_count = len(sent.split())
|
| 59 |
+
|
| 60 |
+
# Position score
|
| 61 |
+
if idx < len(sentences) * 0.2 or idx > len(sentences) * 0.8:
|
| 62 |
+
score += 0.3
|
| 63 |
+
|
| 64 |
+
# Length score
|
| 65 |
+
if 15 <= word_count <= 30:
|
| 66 |
+
score += 0.3
|
| 67 |
+
elif 10 <= word_count <= 40:
|
| 68 |
+
score += 0.15
|
| 69 |
+
|
| 70 |
+
# Keyword diversity
|
| 71 |
+
words = set(sent.lower().split())
|
| 72 |
+
common_words = {'the', 'a', 'an', 'and', 'or', 'is', 'are', 'was', 'be', 'it', 'this', 'that'}
|
| 73 |
+
unique_words = len(words - common_words)
|
| 74 |
+
score += min(unique_words / 10, 0.4)
|
| 75 |
+
|
| 76 |
+
scored_sentences.append((score, sent))
|
| 77 |
+
|
| 78 |
+
scored_sentences.sort(reverse=True)
|
| 79 |
+
key_sents = [sent for _, sent in scored_sentences[:num_sentences]]
|
| 80 |
+
|
| 81 |
+
result = []
|
| 82 |
+
for sent in sentences:
|
| 83 |
+
if sent in key_sents:
|
| 84 |
+
result.append(sent)
|
| 85 |
+
|
| 86 |
+
return result
|
| 87 |
+
|
| 88 |
+
def generate_five_queries(text: str, max_words: int = 3000) -> List[str]:
    """
    Generate 5 high-quality semantic search queries from a document.

    Query 1: Beginning (main topic)
    Query 2: Early Middle
    Query 3: Center (supporting evidence)
    Query 4: Late Middle
    Query 5: End (conclusions)
    """
    logger.info(" 🔍 Generating 5 semantic queries from content...")

    words = text.split()
    if len(words) > max_words:
        text = ' '.join(words[:max_words])

    sentences = split_into_sentences(text)
    if len(sentences) < 5:
        # Fallback for very short text: 30-word windows at five evenly spaced offsets
        logger.warning(" ⚠️ Very short document, using basic queries")
        return [
            ' '.join(words[:30]),
            ' '.join(words[max(0, len(words)//5):max(0, len(words)//5)+30]),
            ' '.join(words[max(0, 2*len(words)//5):max(0, 2*len(words)//5)+30]),
            ' '.join(words[max(0, 3*len(words)//5):max(0, 3*len(words)//5)+30]),
            ' '.join(words[max(0, 4*len(words)//5):max(0, 4*len(words)//5)+30])
        ]

    queries = []
    total_sentences = len(sentences)

    # ✅ Query 1: BEGINNING - first 3-4 sentences
    beginning_end = min(4, total_sentences // 5)
    query1_sents = sentences[:beginning_end]
    query1 = ' '.join(query1_sents)
    queries.append(query1)
    logger.debug(f"   Query 1 (Beginning) length: {len(query1.split())} words")

    # ✅ Query 2: EARLY MIDDLE - within the first third
    early_start = beginning_end
    early_end = min(early_start + 4, total_sentences // 3)
    query2_sents = sentences[early_start:early_end]
    query2 = ' '.join(query2_sents)
    queries.append(query2)
    logger.debug(f"   Query 2 (Early Middle) length: {len(query2.split())} words")

    # ✅ Query 3: CENTER - middle section (3-4 sentences)
    mid_start = max(early_end, total_sentences // 3)
    mid_end = min(mid_start + 4, 2 * total_sentences // 3)
    query3_sents = sentences[mid_start:mid_end]
    query3 = ' '.join(query3_sents)
    queries.append(query3)
    logger.debug(f"   Query 3 (Center) length: {len(query3.split())} words")

    # ✅ Query 4: LATE MIDDLE - start of the final third
    late_start = mid_end
    late_end = min(late_start + 4, 2 * total_sentences // 3 + (total_sentences // 3))
    query4_sents = sentences[late_start:late_end]
    query4 = ' '.join(query4_sents)
    queries.append(query4)
    logger.debug(f"   Query 4 (Late Middle) length: {len(query4.split())} words")

    # ✅ Query 5: END - last 3-4 sentences
    end_start = max(late_end, total_sentences - 4)
    query5_sents = sentences[end_start:]
    query5 = ' '.join(query5_sents)
    queries.append(query5)
    logger.debug(f"   Query 5 (End) length: {len(query5.split())} words")

    # ✅ Clean and validate queries
    final_queries = []
    for q in queries:
        q = q.strip()
        if len(q.split()) >= 15:  # Minimum 15 words for a useful search query
            final_queries.append(q)

    logger.info(f" ✅ Generated {len(final_queries)} queries:")
    for i, q in enumerate(final_queries, 1):
        word_count = len(q.split())
        preview = q[:80] + "..." if len(q) > 80 else q
        logger.info(f"   Query {i} ({word_count} words): {preview}")

    return final_queries

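A minimal usage sketch; the input file name is hypothetical, and any document with at least five qualifying sentences behaves the same way:

    essay_text = open("submission.txt").read()  # hypothetical input file
    for i, q in enumerate(generate_five_queries(essay_text), 1):
        print(f"Q{i} ({len(q.split())} words): {q[:60]}...")
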
def find_semantic_matches(
    doc_text: str,
    source_text: str,
    threshold: float = 0.50
) -> List[Dict]:
    """
    Find semantically similar passages using MiniLM embeddings.
    Detects paraphrased content from LLMs.

    threshold: 0.65+ = high confidence, 0.50+ = catch paraphrasing, 0.35+ = catch weak matches
    """
    try:
        encoder = get_encoder()
    except ImportError:
        logger.warning("⚠️ SentenceTransformer not available, falling back to string matching")
        return _fallback_semantic_matches(doc_text, source_text, threshold)

    # Split into sentences
    doc_sentences = split_into_sentences(doc_text)
    source_sentences = split_into_sentences(source_text)

    if not doc_sentences or not source_sentences:
        return []

    logger.debug(f"   Encoding {len(doc_sentences)} doc sentences + {len(source_sentences)} source sentences...")

    # Encode all sentences
    try:
        doc_embeddings = encoder.encode(doc_sentences, convert_to_numpy=True, show_progress_bar=False)
        source_embeddings = encoder.encode(source_sentences, convert_to_numpy=True, show_progress_bar=False)
    except Exception as e:
        logger.error(f"   Encoding error: {e}, falling back to string matching")
        return _fallback_semantic_matches(doc_text, source_text, threshold)

    matches = []
    matched_source_indices = set()

    # Compare using cosine similarity; each source sentence is matched at most once
    for doc_idx, doc_emb in enumerate(doc_embeddings):
        best_similarity = 0.0
        best_source_idx = -1

        for source_idx, source_emb in enumerate(source_embeddings):
            if source_idx in matched_source_indices:
                continue

            # Cosine similarity
            similarity = np.dot(doc_emb, source_emb) / (
                np.linalg.norm(doc_emb) * np.linalg.norm(source_emb) + 1e-8
            )

            if similarity > best_similarity:
                best_similarity = similarity
                best_source_idx = source_idx

        # Record if above threshold
        if best_similarity >= threshold and best_source_idx >= 0:
            if best_source_idx not in matched_source_indices:
                matched_source_indices.add(best_source_idx)

                matches.append({
                    'doc_text': doc_sentences[doc_idx],
                    'source_text': source_sentences[best_source_idx],
                    'similarity': float(best_similarity),
                    'doc_index': doc_idx,
                    'source_index': best_source_idx
                })

    logger.debug(f"   Found {len(matches)} semantic matches (threshold: {threshold})")
    return matches

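The nested loop above performs len(doc) x len(source) Python-level dot products. Because the embeddings are NumPy arrays, the same all-pairs cosine scores can be computed with a single matrix product; this is an equivalent vectorized sketch, not part of the module:

    import numpy as np

    def cosine_matrix(doc_embeddings: np.ndarray, source_embeddings: np.ndarray) -> np.ndarray:
        """All-pairs cosine similarity: result[i, j] = cos(doc_i, source_j)."""
        a = doc_embeddings / (np.linalg.norm(doc_embeddings, axis=1, keepdims=True) + 1e-8)
        b = source_embeddings / (np.linalg.norm(source_embeddings, axis=1, keepdims=True) + 1e-8)
        return a @ b.T

    # sims.argmax(axis=1) then reproduces the inner loop's best match per doc sentence,
    # before the greedy "each source sentence used once" filtering is applied.
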
def _fallback_semantic_matches(doc_text: str, source_text: str, threshold: float) -> List[Dict]:
    """Fallback to string matching if embeddings are not available"""
    from difflib import SequenceMatcher

    doc_sentences = split_into_sentences(doc_text)
    source_sentences = split_into_sentences(source_text)

    if not doc_sentences or not source_sentences:
        return []

    matches = []
    matched_source_indices = set()

    for doc_idx, doc_sent in enumerate(doc_sentences):
        best_similarity = 0.0
        best_source_idx = -1

        for source_idx, source_sent in enumerate(source_sentences):
            if source_idx in matched_source_indices:
                continue

            ratio = SequenceMatcher(None, doc_sent.lower(), source_sent.lower()).ratio()

            if ratio > best_similarity:
                best_similarity = ratio
                best_source_idx = source_idx

        if best_similarity >= threshold and best_source_idx >= 0:
            if best_source_idx not in matched_source_indices:
                matched_source_indices.add(best_source_idx)

                matches.append({
                    'doc_text': doc_sentences[doc_idx],
                    'source_text': source_sentences[best_source_idx],
                    'similarity': float(best_similarity),
                    'doc_index': doc_idx,
                    'source_index': best_source_idx
                })

    return matches

def calculate_semantic_similarity(text1: str, text2: str) -> float:
    """Calculate semantic similarity between two texts"""
    try:
        encoder = get_encoder()
        embeddings = encoder.encode([text1, text2], convert_to_numpy=True, show_progress_bar=False)
        similarity = np.dot(embeddings[0], embeddings[1]) / (
            np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1]) + 1e-8
        )
        return float(similarity)
    except Exception:
        # Fall back to character-level string similarity if the encoder is unavailable
        from difflib import SequenceMatcher
        return SequenceMatcher(None, text1.lower(), text2.lower()).ratio()

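Exact scores depend on the loaded MiniLM model, but the relative ordering should hold; a small hedged check with invented sentences:

    paraphrase = calculate_semantic_similarity(
        "The experiment was conducted over a period of six weeks.",
        "Researchers ran the study for roughly a month and a half."
    )
    unrelated = calculate_semantic_similarity(
        "The experiment was conducted over a period of six weeks.",
        "Bananas are an excellent source of potassium."
    )
    assert paraphrase > unrelated  # the paraphrased pair should score higher
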
def compare_semantic_chunks(
    doc_text: str,
    source_text: str,
    chunk_size: int = 200,
    threshold: float = 0.65
) -> List[Dict]:
    """
    Compare document chunks against source chunks using semantic similarity.
    """
    def chunk_text(text, size):
        words = text.split()
        chunks = []
        for i in range(0, len(words), size):
            chunk = ' '.join(words[i:i+size])
            if len(chunk.split()) >= 20:
                chunks.append(chunk)
        return chunks

    doc_chunks = chunk_text(doc_text, chunk_size)
    source_chunks = chunk_text(source_text, chunk_size)

    if not doc_chunks or not source_chunks:
        return []

    try:
        encoder = get_encoder()
        doc_embeddings = encoder.encode(doc_chunks, convert_to_numpy=True, show_progress_bar=False)
        source_embeddings = encoder.encode(source_chunks, convert_to_numpy=True, show_progress_bar=False)

        matches = []
        # Track indices so each match reports the chunk pair that actually produced it
        for doc_idx, doc_emb in enumerate(doc_embeddings):
            for source_idx, source_emb in enumerate(source_embeddings):
                similarity = np.dot(doc_emb, source_emb) / (
                    np.linalg.norm(doc_emb) * np.linalg.norm(source_emb) + 1e-8
                )

                if similarity >= threshold:
                    matches.append({
                        'doc_text': doc_chunks[doc_idx][:200] + "...",
                        'source_text': source_chunks[source_idx][:200] + "...",
                        'similarity': float(similarity),
                    })

        return matches
    except Exception:
        return []
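Typical call site, with doc_text and source_text standing in for real inputs:

    hits = compare_semantic_chunks(doc_text, source_text, chunk_size=200, threshold=0.65)
    for h in sorted(hits, key=lambda m: -m['similarity'])[:3]:
        print(f"{h['similarity']:.2f}  {h['doc_text'][:60]}")
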
app/utils/web_utils.py
ADDED
@@ -0,0 +1,545 @@
from typing import Dict, List, Optional
import requests
from bs4 import BeautifulSoup
import time
from app.config import API_KEYS, SEARCH_ENGINE_IDS
import logging
import hashlib
from functools import lru_cache
from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError as FuturesTimeoutError
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from urllib.parse import urlparse
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
import re
import threading


from ..logger import logger
logger.info("Scraper started")

# Optional fallback: cloudscraper handles some Cloudflare-protected pages
try:
    import cloudscraper
    CLOUDSCRAPER_AVAILABLE = True
except Exception:
    CLOUDSCRAPER_AVAILABLE = False

# ---- OPTIMIZED Configuration ----
MAX_WORKERS = 8           # Increased for parallelism
REQUEST_TIMEOUT = 5       # Reduced from 6 - fail faster
PER_URL_TIMEOUT = 8       # Reduced from 10
POLITENESS_DELAY = 0.1    # Reduced from 0.25
GOOGLE_NUM_DEFAULT = 10
MIN_TEXT_LENGTH = 700
GOOGLE_API_TIMEOUT = 6
BRAVE_API_KEY = "BSAE_jMY2tpTa_jYwCkcaiddxmzLs7m"
BRAVE_ENDPOINT = "https://api.search.brave.com/res/v1/web/search"

_api_key_index = 0
_api_key_lock = threading.Lock()

def _get_next_api_credentials():
    """Get the next API key and search engine ID in round-robin fashion"""
    global _api_key_index
    with _api_key_lock:
        key = API_KEYS[_api_key_index]
        engine_id = SEARCH_ENGINE_IDS[_api_key_index]
        _api_key_index = (_api_key_index + 1) % len(API_KEYS)
        return key, engine_id

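A small sanity sketch of the rotation, assuming API_KEYS and SEARCH_ENGINE_IDS are equal-length parallel lists in app/config.py (the import above implies this):

    for _ in range(2 * len(API_KEYS)):
        key, cx = _get_next_api_credentials()
        print(key[-4:], cx)  # cycles through every key/engine pair twice, in order
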
# ✅ HARD TIMEOUT: 3 minutes (180 seconds) for ALL scraping
MULTI_QUERY_TIMEOUT = 180

# Blacklist slow/unscrapeable domains
BLACKLIST_DOMAINS = {
    'neurips.cc',
    'icml.cc',
    'jmlr.org',
    'researchgate.net',
    'arxiv.org',
    'springer.com',
    'nature.com',
    'nips.cc',
    'iccv2023.thecvf.com'
}

# Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("scraper")

# ---- Session ----
def _make_session() -> requests.Session:
    s = requests.Session()
    retries = Retry(
        total=1,  # Only 1 retry - fail fast
        backoff_factor=0.1,  # Minimal backoff
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "POST"],
        respect_retry_after_header=False  # Don't wait for Retry-After
    )
    s.mount("https://", HTTPAdapter(max_retries=retries, pool_connections=20, pool_maxsize=20))
    s.mount("http://", HTTPAdapter(max_retries=retries, pool_connections=20, pool_maxsize=20))
    s.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Connection": "keep-alive",
    })
    return s

_SESSION = _make_session()

# ---- Global timeout tracking ----
_scraping_start_time = None
_scraping_deadline = None

def _set_scraping_deadline():
    """Set the scraping deadline to 3 minutes from now"""
    global _scraping_start_time, _scraping_deadline
    _scraping_start_time = time.time()
    _scraping_deadline = _scraping_start_time + MULTI_QUERY_TIMEOUT

def _time_remaining() -> float:
    """Get remaining time in seconds"""
    if _scraping_deadline is None:
        return MULTI_QUERY_TIMEOUT
    remaining = _scraping_deadline - time.time()
    return max(0, remaining)

def _is_timeout_exceeded() -> bool:
    """Check if the timeout has been exceeded"""
    return _time_remaining() <= 0

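These three helpers are the whole budget mechanism: a run calls _set_scraping_deadline() once, then checks the flag between units of work. A sketch, with urls_to_scrape as a hypothetical list:

    _set_scraping_deadline()  # start of a run: 180-second budget
    for url in urls_to_scrape:  # hypothetical list of candidate URLs
        if _is_timeout_exceeded():
            break  # hard stop: remaining URLs are skipped
        scrape_page(url)
    print(f"{_time_remaining():.0f}s left in the budget")
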
# ---- Helpers ----
def _normalize_whitespace(s: str) -> str:
    return " ".join(s.split())

def _text_hash(text: str) -> str:
    return hashlib.sha256(text.encode("utf-8")).hexdigest()

def _clean_soup(soup: BeautifulSoup, prefer_main: bool = True, max_chars: Optional[int] = None) -> str:
    for junk in soup(["script", "style", "nav", "footer", "noscript", "header"]):
        junk.decompose()
    parts = []
    if prefer_main:
        main = soup.find(["main", "article", "section"])
        if main:
            elems = main.find_all(["p", "h1", "h2", "h3", "li"])
        else:
            elems = soup.find_all(["p", "h1", "h2", "h3"])
    else:
        elems = soup.find_all(["p", "li"])
    for el in elems:
        t = el.get_text(separator=" ", strip=True)
        if t and len(t) > 30:
            parts.append(t)
    text = _normalize_whitespace(" ".join(parts))
    if max_chars and len(text) > max_chars:
        return text[:max_chars]
    return text

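A self-contained example of what _clean_soup keeps and drops (the sample HTML is invented):

    html = """
    <html><body>
      <nav>Home | About</nav>
      <main><p>Semantic plagiarism detection compares meaning rather than surface wording,
      so it can flag paraphrased passages that lexical matching would miss.</p></main>
      <footer>Copyright 2024</footer>
    </body></html>
    """
    print(_clean_soup(BeautifulSoup(html, "html.parser")))
    # -> only the <main> paragraph survives; nav and footer are decomposed
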
# ---- Google Search ----
@lru_cache(maxsize=256)
def google_search(query: str, num_results: int = 10):
    api_key, search_engine_id = _get_next_api_credentials()

    if not api_key or not search_engine_id:
        logger.warning("Missing API_KEY or SEARCH_ENGINE_ID in config.py")
        return []

    url = "https://www.googleapis.com/customsearch/v1"
    params = {"key": api_key, "cx": search_engine_id, "q": query, "num": num_results}
    try:
        r = _SESSION.get(url, params=params, timeout=GOOGLE_API_TIMEOUT)
        r.raise_for_status()
        data = r.json()
        items = data.get("items", []) or []
        out = []
        for i in items:
            out.append({
                "link": i.get("link"),
                "title": i.get("title", ""),
                "snippet": i.get("snippet", "")
            })
        logger.info(f"google_search: got {len(out)} items for '{query[:60]}'")
        return out
    except Exception as e:
        logger.warning(f"google_search failed: {e}")
        return []

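One call against the Custom Search JSON API (requires valid keys in app/config.py). Note that @lru_cache memoizes by (query, num_results), so identical repeat calls return cached results and consume no quota:

    for item in google_search("semantic similarity plagiarism detection", num_results=5):
        print(item["title"][:50], "->", item["link"])
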
# ---- IMPROVED: Cloudscraper with early exit ----
def scrape_with_cloudscraper(url: str, timeout: int = 8):
    if not CLOUDSCRAPER_AVAILABLE:
        return ""
    try:
        logger.debug(f"cloudscraper: GET {url}")
        scraper = cloudscraper.create_scraper()
        r = scraper.get(url, timeout=timeout)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        text = _clean_soup(soup, prefer_main=True, max_chars=20000)
        if text and len(text) > MIN_TEXT_LENGTH:
            logger.info(f" ✅ Scraped {len(text)} chars (cloudscraper) for {url}")
            return text
        return ""
    except Exception as e:
        logger.debug(f"cloudscraper error for {url}: {e}")
        return ""

# ---- IMPROVED: Playwright with hard timeout ----
def scrape_with_playwright(url: str, timeout: int = 8):
    """Scrape JS-heavy pages with aggressive waiting and content extraction."""
    try:
        logger.debug(f"Playwright: GET {url}")
        with sync_playwright() as p:
            browser = p.chromium.launch(
                headless=True,
                args=[
                    "--disable-blink-features=AutomationControlled",
                    "--disable-dev-shm-usage",
                    "--no-sandbox",
                    "--disable-gpu"
                ]
            )
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            )
            context.set_default_timeout(timeout * 1000)
            page = context.new_page()

            try:
                # Navigate with longer timeout for JS-heavy sites
                page.goto(url, wait_until="networkidle", timeout=timeout * 1000)

                # Wait for common content containers to load
                try:
                    page.wait_for_selector("article, [role='article'], .post-content, .blog-content, main", timeout=3000)
                except Exception:
                    pass  # Selector may not exist, continue anyway

                # Extended wait for dynamic content
                page.wait_for_timeout(3000)  # 3 seconds for rendering

                # Scroll down to load lazy-loaded images/content
                page.evaluate("""
                    async () => {
                        await new Promise((resolve) => {
                            let totalHeight = 0;
                            const distance = 100;
                            const timer = setInterval(() => {
                                window.scrollBy(0, distance);
                                totalHeight += distance;
                                if (totalHeight >= document.body.scrollHeight) {
                                    clearInterval(timer);
                                    resolve();
                                }
                            }, 100);
                        });
                    }
                """)

                # Wait after scrolling
                page.wait_for_timeout(1500)

                # Scroll back to top
                page.evaluate("window.scrollTo(0, 0)")
                page.wait_for_timeout(500)

                html = page.content()
            finally:
                context.close()
                browser.close()

        soup = BeautifulSoup(html, "html.parser")
        text = _clean_soup(soup, prefer_main=True, max_chars=25000)

        if text and len(text) > MIN_TEXT_LENGTH:
            logger.info(f" ✅ Scraped {len(text)} chars (Playwright) for {url}")
            return text

        # If we got very little content, retry with a broader extraction pass
        logger.debug(f"Initial extraction got {len(text)} chars, trying aggressive extraction")
        text_aggressive = _clean_soup(soup, prefer_main=False, max_chars=25000)
        if text_aggressive and len(text_aggressive) > MIN_TEXT_LENGTH and len(text_aggressive) > len(text):
            logger.info(f" ✅ Scraped {len(text_aggressive)} chars (Playwright aggressive) for {url}")
            return text_aggressive

        logger.debug(f"Playwright extraction minimal for {url}: {len(text)} chars")
        return text if text else ""

    except Exception as e:
        # Exception also covers PlaywrightTimeoutError, so one handler suffices
        logger.debug(f"Playwright error for {url}: {e}")
        return ""

# ---- IMPROVED: Smart fallback strategy ----
def scrape_page(url: str, timeout: int = 5):
    """Scrape pages in order: requests → cloudscraper → Playwright."""
    # ✅ HARD CHECK: Exit immediately if timeout exceeded
    if _is_timeout_exceeded():
        logger.warning(f"Scraping timeout exceeded, skipping {url}")
        return ""

    try:
        domain = urlparse(url).netloc.lower()

        # Skip blacklisted domains entirely
        if any(bd in domain for bd in BLACKLIST_DOMAINS):
            logger.info(f"Skipping blacklisted domain: {domain}")
            return ""

        # --- 1️⃣ Requests (fastest) ---
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
        }
        try:
            logger.debug(f"requests: GET {url}")
            r = _SESSION.get(url, headers=headers, timeout=timeout, allow_redirects=True)
            r.raise_for_status()
            soup = BeautifulSoup(r.text, "html.parser")
            text = _clean_soup(soup, prefer_main=True, max_chars=20000)
            if text and len(text) > MIN_TEXT_LENGTH:
                logger.info(f" ✅ Scraped {len(text)} chars (requests) for {url}")
                return text
        except Exception as e:
            logger.debug(f"requests failed for {url}: {e}")

        # ✅ HARD CHECK: Exit if timeout exceeded between attempts
        if _is_timeout_exceeded():
            logger.warning(f"Scraping timeout exceeded, stopping fallback methods for {url}")
            return ""

        # --- 2️⃣ Cloudscraper (medium) ---
        if CLOUDSCRAPER_AVAILABLE:
            try:
                logger.debug(f"cloudscraper: GET {url}")
                scraper = cloudscraper.create_scraper()
                r = scraper.get(url, timeout=timeout)
                r.raise_for_status()
                soup = BeautifulSoup(r.text, "html.parser")
                text = _clean_soup(soup, prefer_main=True, max_chars=20000)
                if text and len(text) > MIN_TEXT_LENGTH:
                    logger.info(f" ✅ Scraped {len(text)} chars (cloudscraper) for {url}")
                    return text
            except Exception as e:
                logger.debug(f"cloudscraper failed for {url}: {e}")

        # ✅ HARD CHECK: Exit if timeout exceeded before Playwright
        if _is_timeout_exceeded():
            logger.warning(f"Scraping timeout exceeded, skipping Playwright for {url}")
            return ""

        # --- 3️⃣ Playwright (heaviest) ---
        try:
            logger.debug(f"Playwright: GET {url}")
            res = scrape_with_playwright(url, timeout=12)
            if res and len(res) > MIN_TEXT_LENGTH:
                return res
        except Exception as e:
            logger.debug(f"Playwright failed for {url}: {e}")

    except Exception as e:
        logger.debug(f"All scrapers failed for {url}: {e}")

    return ""

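Putting the tiers together (the URL is illustrative). Setting the deadline first is optional; without it the 3-minute budget never expires and all three tiers are always attempted:

    _set_scraping_deadline()
    text = scrape_page("https://example.com/article")
    if text:
        print(f"extracted {len(text)} chars")
    else:
        print("blacklisted, timed out, or all three scrapers failed")
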
# ---- IMPROVED: Parallel fetch with HARD OVERALL TIMEOUT ----
def fetch_sources(query: str, num_results: int = 10):
    """Fetch sources for a single query"""
    logger.info(f"Fetching sources for query: '{query[:60]}'")
    items = google_search(query, num_results=num_results)
    if not items:
        logger.warning("No URLs returned from Google Search")
        return []

    urls = [it["link"] for it in items if it.get("link")]
    snippets = {it["link"]: it.get("snippet", "") for it in items if it.get("link")}
    domain_last_hit: Dict[str, float] = {}  # best-effort politeness tracking, shared across workers

    def _scrape_task(u: str) -> Dict:
        try:
            dom = urlparse(u).netloc
            last = domain_last_hit.get(dom, 0.0)
            now = time.time()
            delta = now - last
            if delta < POLITENESS_DELAY:
                time.sleep(POLITENESS_DELAY - delta)
            domain_last_hit[dom] = time.time()
            logger.info(f"Scraping URL: {u}")

            text = scrape_page(u, timeout=REQUEST_TIMEOUT)
            if text:
                return {"url": u, "content": text, "method": "scraped", "len": len(text), "hash": _text_hash(text[:4000])}
            else:
                return {"url": u, "content": "", "method": "failed", "len": 0, "hash": ""}
        except Exception as e:
            logger.debug(f"Exception in _scrape_task for {u}: {e}")
            return {"url": u, "content": "", "method": "error", "len": 0, "hash": ""}

    results = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
        future_to_url = {ex.submit(_scrape_task, u): u for u in urls}
        for fut in as_completed(future_to_url):
            u = future_to_url[fut]
            try:
                r = fut.result(timeout=PER_URL_TIMEOUT)
                if r.get("content"):
                    results.append(r)
            except FuturesTimeoutError:
                logger.warning(f"Timeout scraping {u}")
            except Exception as e:
                logger.warning(f"Error scraping {u}: {e}")

    if not results:
        logger.info("No pages scraped; using snippets")
        for u in urls:
            snip = snippets.get(u, "")
            if snip:
                results.append({"url": u, "content": snip, "method": "google_snippet", "len": len(snip), "hash": _text_hash(snip)})
        return results

    # Deduplicate near-identical pages by content hash, keeping the longest version
    seen: Dict[str, Dict] = {}
    for r in results:
        h = r.get("hash") or _text_hash(r.get("content", ""))
        cur = seen.get(h)
        if not cur or (r.get("len", 0) > cur.get("len", 0)):
            seen[h] = r

    deduped = list(seen.values())

    # Fill from snippets if needed
    if len(deduped) < num_results:
        for u in urls:
            if any(x["url"] == u for x in deduped):
                continue
            snip = snippets.get(u, "")
            if snip:
                deduped.append({"url": u, "content": snip, "method": "google_snippet", "len": len(snip), "hash": _text_hash(snip)})
                if len(deduped) >= num_results:
                    break

    logger.info(f"Fetched {len(deduped)} sources for query")
    return deduped

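End-to-end collection for one query; "scraped" entries carry full page text, while "google_snippet" entries are fallbacks:

    sources = fetch_sources("transformer attention mechanism explained", num_results=10)
    for s in sources:
        print(s["method"], s["len"], s["url"][:60])
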
# ---- NEW: Multi-query fetch - accepts pre-generated queries ----
def fetch_sources_multi_query(query: str, num_results: int = 10) -> List[Dict[str, str]]:
    """
    Accept a single pre-generated query and fetch sources.
    NO internal query generation - just scrape this query.
    """
    _set_scraping_deadline()

    logger.info("Processing single query with overall 3-minute timeout")

    all_sources: Dict[str, Dict] = {}
    DOC_EXTENSIONS = [".pdf", ".doc", ".docx", ".odf", ".xls", ".xlsx", ".ppt", ".pptx"]
    lock = threading.Lock()

    def _process_url(u: str, items: List) -> Optional[Dict]:
        """Process a single URL with timeout check"""
        # ✅ HARD CHECK: Exit immediately if timeout exceeded
        if _is_timeout_exceeded():
            return None

        if u in all_sources:
            return None
        if any(ext in u.lower() for ext in DOC_EXTENSIONS):
            logger.info(f"Skipping document URL: {u}")
            return None

        logger.info(f"Scraping URL: {u}")

        try:
            # Try scraping the page
            text_content = scrape_page(u, timeout=REQUEST_TIMEOUT)
            if text_content and len(text_content) > MIN_TEXT_LENGTH:
                logger.info(f" ✅ Scraped {len(text_content)} chars for {u}")
                return {"url": u, "content": text_content, "source_url": u}

            # If scraping failed, try the search snippet as a fallback
            snippet = next((it.get("snippet", "") for it in items if it.get("link") == u), "")
            if snippet and len(snippet) > 50:
                logger.info(f" ⚠️ Using snippet ({len(snippet)} chars) for {u}")
                return {"url": u, "content": snippet, "source_url": u}

            logger.warning(f" ❌ No content extracted for {u}")
            return None

        except Exception as e:
            logger.debug(f"Error scraping {u}: {e}")
            return None

    # ✅ HARD CHECK: Exit if timeout exceeded
    if _is_timeout_exceeded():
        logger.warning("Timeout exceeded, skipping query")
        return []

    logger.info(f"Query: '{query[:60]}'")
    items = google_search(query, num_results=num_results)
    if not items:
        return []

    urls = [it["link"] for it in items if it.get("link")]

    # Process URLs for this query in parallel
    with ThreadPoolExecutor(max_workers=4) as url_ex:
        url_futures = {url_ex.submit(_process_url, u, items): u for u in urls}
        for fut in as_completed(url_futures):
            # ✅ HARD CHECK: Exit if timeout exceeded
            if _is_timeout_exceeded():
                logger.warning("Timeout exceeded, stopping URL processing")
                for f in url_futures:
                    f.cancel()
                break

            u = url_futures[fut]
            try:
                result = fut.result(timeout=PER_URL_TIMEOUT)
                if result:
                    with lock:
                        all_sources[result['url']] = result
                    logger.info(f" ✅ Added: {result['url'][:50]}")
            except FuturesTimeoutError:
                logger.warning(f"Timeout on URL: {u}")
            except Exception as e:
                logger.debug(f"Error scraping {u}: {e}")
            time.sleep(0.05)

    res = list(all_sources.values())
    elapsed = time.time() - _scraping_start_time if _scraping_start_time else 0
    logger.info(f"Sources for this query: {len(res)} (elapsed: {elapsed:.1f}s)")
    return res

def fetch_brave_sources(query: str, num_results: int = 5) -> list[dict]:
    """Fetch from Brave Search API."""
    headers = {
        "Accept": "application/json",
        # Brave's Web Search API expects the key in the X-Subscription-Token header
        "X-Subscription-Token": BRAVE_API_KEY
    }
    params = {"q": query, "count": num_results}

    try:
        resp = requests.get(BRAVE_ENDPOINT, headers=headers, params=params, timeout=6)
        resp.raise_for_status()
        data = resp.json()
        results = []
        # Web results live under data["web"]["results"]; each has title/url/description
        for item in data.get("web", {}).get("results", []):
            results.append({
                "title": item.get("title") or item.get("url") or "Unknown",
                "content": item.get("description") or "",
            })
        return results
    except Exception as e:
        logging.warning(f"Brave search failed for query '{query}': {e}")
        return []

def prepare_brave_query(text: str, min_len: int = 20, max_len: int = 200) -> Optional[str]:
    """Normalize whitespace and trim a passage to a Brave-friendly query length."""
    t = re.sub(r"\s+", " ", text).strip()
    if len(t) < min_len:
        return None
    if len(t) > max_len:
        t = t[:max_len].rsplit(" ", 1)[0]  # cut at a word boundary
    return t
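The two Brave helpers chain naturally: trim a suspicious passage into a query, then look up snippet-level hits (the sentence below is invented for illustration):

    q = prepare_brave_query("Neural networks approximate functions by composing simple nonlinear units.")
    if q:  # None means the passage was too short to search
        for hit in fetch_brave_sources(q, num_results=5):
            print(hit["title"][:60], "|", hit["content"][:80])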