topGdev committed
Commit a561338 · 1 Parent(s): 76cca3d

add ai similarity

Files changed (47)
  1. .DS_Store +0 -0
  2. app/.DS_Store +0 -0
  3. app/__pycache__/config.cpython-312.pyc +0 -0
  4. app/__pycache__/logger.cpython-312.pyc +0 -0
  5. app/__pycache__/main.cpython-312.pyc +0 -0
  6. app/config.py +46 -0
  7. app/dependencies/__pycache__/auth.cpython-312.pyc +0 -0
  8. app/dependencies/auth.py +41 -0
  9. app/logger.py +13 -0
  10. app/main.py +26 -0
  11. app/routers/.DS_Store +0 -0
  12. app/routers/__pycache__/health.cpython-312.pyc +0 -0
  13. app/routers/__pycache__/plagiarism.cpython-312.pyc +0 -0
  14. app/routers/student/__pycache__/lexical_analysis.cpython-312.pyc +0 -0
  15. app/routers/student/__pycache__/plagiarism.cpython-312.pyc +0 -0
  16. app/routers/student/lexical_analysis.py +225 -0
  17. app/routers/teacher/__pycache__/code_analysis.cpython-312.pyc +0 -0
  18. app/routers/teacher/__pycache__/internal_analysis.cpython-312.pyc +0 -0
  19. app/routers/teacher/__pycache__/lexical_analysis.cpython-312.pyc +0 -0
  20. app/routers/teacher/__pycache__/semantic_analysis.cpython-312.pyc +0 -0
  21. app/routers/teacher/internal_analysis.py +323 -0
  22. app/routers/teacher/lexical_analysis.py +472 -0
  23. app/routers/teacher/semantic_analysis.py +406 -0
  24. app/schemas/__pycache__/plagiarism_schemas.cpython-312.pyc +0 -0
  25. app/schemas/__pycache__/report_schemas.cpython-312.pyc +0 -0
  26. app/schemas/__pycache__/schemas.cpython-312.pyc +0 -0
  27. app/schemas/__pycache__/sources_schemas.cpython-312.pyc +0 -0
  28. app/schemas/__pycache__/teacher_schemas.cpython-312.pyc +0 -0
  29. app/schemas/plagiarism_schemas.py +22 -0
  30. app/schemas/report_schemas.py +21 -0
  31. app/schemas/sources_schemas.py +9 -0
  32. app/schemas/teacher_schemas.py +185 -0
  33. app/utils/__pycache__/ai_detector.cpython-312.pyc +0 -0
  34. app/utils/__pycache__/code_comparism.cpython-312.pyc +0 -0
  35. app/utils/__pycache__/code_detection.cpython-312.pyc +0 -0
  36. app/utils/__pycache__/code_plagiarism_integration.cpython-312.pyc +0 -0
  37. app/utils/__pycache__/code_utils.cpython-312.pyc +0 -0
  38. app/utils/__pycache__/file_utils.cpython-312.pyc +0 -0
  39. app/utils/__pycache__/lexical_utils.cpython-312.pyc +0 -0
  40. app/utils/__pycache__/semantic_utils.cpython-312.pyc +0 -0
  41. app/utils/__pycache__/text_utils.cpython-312.pyc +0 -0
  42. app/utils/__pycache__/web_utils.cpython-312.pyc +0 -0
  43. app/utils/ai_detector.py +80 -0
  44. app/utils/file_utils.py +36 -0
  45. app/utils/lexical_utils.py +293 -0
  46. app/utils/semantic_utils.py +342 -0
  47. app/utils/web_utils.py +545 -0
.DS_Store ADDED
Binary file (6.15 kB).

app/.DS_Store ADDED
Binary file (6.15 kB).

app/__pycache__/config.cpython-312.pyc ADDED
Binary file (1.99 kB).

app/__pycache__/logger.cpython-312.pyc ADDED
Binary file (587 Bytes).

app/__pycache__/main.cpython-312.pyc ADDED
Binary file (1.29 kB).
app/config.py ADDED
@@ -0,0 +1,46 @@
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ # ───── API Keys & URLs ─────
+ MONGODB_URI = os.getenv("MONGODB_URI", "mongodb://localhost:27017/sluethink")  # "mongodb+srv" URIs cannot carry a port, so the local default uses the plain scheme
+
+ # ───── Similarity thresholds ─────
+ MIN_WORDS_PER_SENTENCE = 4
+ MIN_SENTENCE_LENGTH = 30
+ SEQUENCE_THRESHOLD = 0.75
+ TFIDF_THRESHOLD = 0.80
+ SUB_PHRASE_TFIDF_MIN = 0.50
+ EXACT_MATCH_SCORE = 1.0
+
+ # ───── File support ─────
+ ALLOWED_EXTENSIONS = {"txt", "pdf", "docx"}
+ JWT_SECRET_KEY = os.getenv("JWT_SECRET_KEY", "your_nextauth_secret")
+ JWT_ALGORITHM = os.getenv("JWT_ALGORITHM", "HS256")
+ HF_TOKEN = os.getenv("HF_TOKEN", "")
+
+ API_KEYS = os.getenv("API_KEYS", "").split(",") if os.getenv("API_KEYS") else []
+ SEARCH_ENGINE_IDS = os.getenv("SEARCH_ENGINE_IDS", "").split(",") if os.getenv("SEARCH_ENGINE_IDS") else []
+
+ SECRET_KEY = os.getenv("SECRET_KEY", "change-me")
+ ALGORITHM = os.getenv("ALGORITHM", "HS256")
+
+ MIN_FUNCTION_LINES = 5
+ MIN_CODE_BLOCK_LINES = 3
+ STRUCTURAL_SIMILARITY_THRESHOLD = 0.75
+ TOKEN_SIMILARITY_THRESHOLD = 0.70
+ EXACT_MATCH_THRESHOLD = 0.90
+
+ MAX_QUERIES_PER_SUBMISSION = 3
+ RESULTS_PER_QUERY = 10
+ MAX_SOURCES_TO_ANALYZE = 8
+
+ REQUEST_TIMEOUT = 6
+ MAX_SCRAPE_WORKERS = 4
+ POLITENESS_DELAY = 0.2
+
+ ALLOWED_CODE_EXTENSIONS = {'.py', '.java', '.cpp', '.c', '.js', '.jsx', '.ts', '.tsx', '.cs', '.rb', '.go', '.php'}
+ MAX_FILE_SIZE_MB = 5
+
+ LOG_LEVEL = "INFO"
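
Every value above is read from the environment at import time (load_dotenv picks up a .env file), so the hard-coded defaults are only fallbacks. A minimal sketch of overriding the config before first import — the values shown are illustrative placeholders, not real credentials:

# Sketch: override config through the environment before app.config is imported.
import os

os.environ["MONGODB_URI"] = "mongodb://localhost:27017/sluethink"  # placeholder
os.environ["SECRET_KEY"] = "dev-only-secret"                       # placeholder

from app import config  # reads the environment at import time

assert config.SECRET_KEY == "dev-only-secret"
print(config.ALLOWED_EXTENSIONS)  # {'txt', 'pdf', 'docx'}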
app/dependencies/__pycache__/auth.cpython-312.pyc ADDED
Binary file (787 Bytes).

app/dependencies/auth.py ADDED
@@ -0,0 +1,41 @@
+ # app/dependencies/auth.py
+
+ from jose import JWTError, jwt
+ from fastapi import Depends, HTTPException
+ from fastapi.security import OAuth2PasswordBearer
+ from motor.motor_asyncio import AsyncIOMotorClient
+ from bson import ObjectId
+
+ from app.config import MONGODB_URI, JWT_SECRET_KEY, JWT_ALGORITHM
+ # from app.schemas.report_schemas import User
+
+ oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/auth/login")
+
+ async def get_mongo_client():
+     return AsyncIOMotorClient(MONGODB_URI)
+
+ # async def get_current_user(
+ #     token: str = Depends(oauth2_scheme),
+ #     mongo_client: AsyncIOMotorClient = Depends(get_mongo_client),
+ # ):
+ #     # 1) Decode the JWT
+ #     try:
+ #         payload = jwt.decode(token, JWT_SECRET_KEY, algorithms=[JWT_ALGORITHM])
+ #         user_id: str = payload.get("id")
+ #         if not user_id:
+ #             raise HTTPException(status_code=401, detail="Invalid token payload")
+ #     except JWTError:
+ #         raise HTTPException(status_code=401, detail="Could not validate credentials")
+
+ #     # 2) Retrieve the user document from MongoDB
+ #     db = mongo_client.get_default_database()
+ #     users_col = db["users"]
+ #     user_doc = await users_col.find_one({"_id": ObjectId(user_id)})
+ #     if not user_doc:
+ #         raise HTTPException(status_code=404, detail="User not found")
+
+
+ #     user_doc["id"] = str(user_doc["_id"])
+
+
+ #     return User(**user_doc)
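
The routers added below guard their endpoints with a verify_token dependency that decodes a bearer JWT signed with SECRET_KEY. For local testing, a compatible token can be minted with the same python-jose calls; a minimal sketch (the claim values are illustrative — the routers read "sub"/"user_id" for the saved report's userId and "username" for submittedBy):

# Sketch: mint a JWT that the routers' verify_token dependency will accept.
# Claims are illustrative; SECRET_KEY and ALGORITHM must match app.config.
from datetime import datetime, timedelta
from jose import jwt

from app.config import SECRET_KEY, ALGORITHM

claims = {
    "sub": "user_123",
    "username": "alice",
    "exp": datetime.utcnow() + timedelta(hours=1),
}
token = jwt.encode(claims, SECRET_KEY, algorithm=ALGORITHM)
print(token)  # send as: Authorization: Bearer <token>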
app/logger.py ADDED
@@ -0,0 +1,13 @@
+ # logger.py
+ import logging
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
+     handlers=[
+         logging.FileHandler("scraper.log"),
+         logging.StreamHandler()
+     ]
+ )
+
+ logger = logging.getLogger("scraper")
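
Importing this module once wires both a file handler (scraper.log) and a stream handler onto the root logger; later logging.basicConfig calls (as in app/main.py below) become no-ops. A minimal usage sketch:

# Sketch: reuse the preconfigured "scraper" logger from any module.
from app.logger import logger

logger.info("fetch started")       # written to scraper.log and stderr
logger.warning("retrying source")  # same handlers, no extra setup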
app/main.py ADDED
@@ -0,0 +1,26 @@
+ import logging
+ from fastapi import FastAPI
+ from app.routers.student.lexical_analysis import router as student_lexical_router
+ from app.routers.teacher.semantic_analysis import router as teacher_semantic_router
+ from app.routers.teacher.lexical_analysis import router as teacher_lexical_router
+ from app.routers.teacher.internal_analysis import router as teacher_internal_router
+
+ from fastapi.middleware.cors import CORSMiddleware
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ app = FastAPI()
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["http://localhost:3000", "https://plagiarism-detection-frontend.vercel.app", "*"],  # caution: mixing "*" with explicit origins while allow_credentials=True is a risky combination; the explicit origins alone are safer
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ app.include_router(student_lexical_router)
+ app.include_router(teacher_internal_router)
+ app.include_router(teacher_lexical_router)
+ app.include_router(teacher_semantic_router)
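
A standard ASGI server serves the app. A minimal local-development sketch (host, port, and reload flag are arbitrary choices, not part of the commit):

# Sketch: run the FastAPI app locally with uvicorn.
import uvicorn

if __name__ == "__main__":
    uvicorn.run("app.main:app", host="127.0.0.1", port=8000, reload=True)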
app/routers/.DS_Store ADDED
Binary file (6.15 kB).

app/routers/__pycache__/health.cpython-312.pyc ADDED
Binary file (151 Bytes).

app/routers/__pycache__/plagiarism.cpython-312.pyc ADDED
Binary file (8.67 kB).

app/routers/student/__pycache__/lexical_analysis.cpython-312.pyc ADDED
Binary file (9.74 kB).

app/routers/student/__pycache__/plagiarism.cpython-312.pyc ADDED
Binary file (8.68 kB).
app/routers/student/lexical_analysis.py ADDED
@@ -0,0 +1,225 @@
+ from fastapi import APIRouter, UploadFile, File, HTTPException, Depends
+ from typing import Optional
+ from datetime import datetime
+ from fastapi.security import OAuth2PasswordBearer
+ from jose import JWTError, jwt
+ from motor.motor_asyncio import AsyncIOMotorClient
+ import os
+
+ from app.config import MONGODB_URI, ALGORITHM, SECRET_KEY
+ from app.schemas.teacher_schemas import (
+     LexicalMatch
+ )
+ from app.utils.file_utils import extract_text_from_file, allowed_file
+ from app.utils.lexical_utils import (
+     get_meaningful_sentences, extract_keywords,
+     find_exact_matches, find_partial_phrase_match,
+ )
+ from app.utils.web_utils import fetch_sources, fetch_sources_multi_query
+
+ router = APIRouter(prefix="/student", tags=["student-lexical"])
+
+ LEXICAL_DOC_THRESHOLD = 0.85  # 85%
+ oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/token")
+
+
+
+ def verify_token(token: str = Depends(oauth2_scheme)):
+     try:
+         return jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
+     except JWTError:
+         raise HTTPException(status_code=401, detail="Invalid or expired token")
+
+ async def get_mongo_client():
+     return AsyncIOMotorClient(MONGODB_URI)
+
+ @router.post("/lexical-analysis")
+ async def student_lexical_analysis(
+     file: UploadFile = File(...),
+     current_user=Depends(verify_token),
+ ):
+     if not file:
+         raise HTTPException(status_code=400, detail="No file uploaded")
+
+     t0 = datetime.utcnow()
+     total_matches = 0
+
+     print("🔍 Starting student lexical analysis for uploaded file...")
+
+     # Process single file
+     if not allowed_file(file.filename):
+         raise HTTPException(status_code=400, detail=f"Invalid file type: {file.filename}")
+
+     raw = await file.read()
+     text = extract_text_from_file(raw, file.filename) or ""
+     sentences = get_meaningful_sentences(text)
+
+     print(f"\n📄 Processing file: {file.filename}")
+     print(f"   ➤ Extracted {len(sentences)} sentences")
+     print(f"   ➤ Approx word count: {len(text.split())}")
+
+     # Build search query from keywords
+     sources = fetch_sources_multi_query(text, num_results=10)
+     print(f"   ➤ Found {len(sources)} online sources from diverse queries")
+
+     if not sources:
+         raise HTTPException(status_code=404, detail=f"No sources found online for {file.filename}")
+
+     matches = []
+     highest = 0.0
+     source_matches_count = {}
+
+     externals = [
+         {
+             "title": s.get("url", "Unknown"),
+             "text": s.get("content", ""),
+             "source_url": s.get("url", ""),
+             "type": "web",
+         }
+         for s in sources if s.get("content")
+     ]
+
+     for ext in externals:
+         print(f"   🌐 Source: {ext['source_url'][:60]}...")
+         source_matches_count[ext['source_url']] = 0
+
+     # Compare each sentence against ALL sources
+     for s in sentences:
+         best_overall_score = 0.0
+         best_overall_match = None
+         best_overall_src = None
+
+         for ext in externals:
+             # Try exact match first
+             sim = find_exact_matches(s, ext["text"])
+             if sim is not None and sim > best_overall_score:
+                 best_overall_score = sim
+                 best_overall_match = s
+                 best_overall_src = ext
+                 continue
+
+             # Try partial phrase match
+             pp = find_partial_phrase_match(s, ext["text"])
+             if pp:
+                 phrase, score = pp
+                 if score > best_overall_score:
+                     best_overall_score = score
+                     best_overall_match = phrase
+                     best_overall_src = ext
+
+         # Add match if found and above threshold (50%)
+         if best_overall_match and best_overall_score > 0.0:
+             pct = round(best_overall_score * 100.0, 1)
+
+             if pct >= 50:
+                 matches.append({
+                     "matched_text": best_overall_match,
+                     "similarity": pct,
+                     "source_type": best_overall_src["type"],
+                     "source_title": best_overall_src["title"],
+                     "source_url": best_overall_src["source_url"],
+                     "context": "Potential plagiarism detected",
+                 })
+                 source_matches_count[best_overall_src['source_url']] += 1
+                 highest = max(highest, pct)
+                 total_matches += 1
+                 print(f"      ✅ Match ({pct}%) with {best_overall_src['source_url'][:50]}")
+
+     # Better flagging logic considering multiple sources
+     num_sources_with_matches = sum(1 for c in source_matches_count.values() if c > 0)
+     avg_match_score = (sum(m["similarity"] for m in matches) / len(matches)) if matches else 0.0
+
+     # Flag if any of these conditions are met:
+     # 1. Single source with high similarity (>85%)
+     # 2. Content plagiarized from 2+ different sources
+     # 3. 3+ matches with average >70%
+     flagged = (
+         highest >= 85 or
+         num_sources_with_matches >= 2 or
+         (len(matches) >= 3 and avg_match_score >= 70)
+     )
+
+     print(f"   ➤ Highest similarity: {highest:.1f}%")
+     print(f"   ➤ Total matches: {len(matches)}")
+     print(f"   ➤ Sources with matches: {num_sources_with_matches}")
+     print(f"   ➤ Average match score: {avg_match_score:.1f}%")
+     print(f"   ➤ Flagged: {flagged}")
+
+     elapsed = (datetime.utcnow() - t0).total_seconds()
+     mm = int(elapsed // 60)
+     ss = int(elapsed % 60)
+     processing_time = f"{mm}m {ss:02d}s"
+
+     print("\n✅ Analysis completed!")
+     print(f"   ➤ Flagged: {flagged}")
+     print(f"   ➤ Highest Similarity: {highest}%")
+     print(f"   ➤ Average Similarity: {avg_match_score:.1f}%")
+     print(f"   ➤ Processing Time: {processing_time}")
+
+     # Extract unique sources
+     all_sources = list(set(m["source_url"] for m in matches))
+
+     # Build response
+     result = {
+         "id": None,  # Will be set after MongoDB insert
+         "name": file.filename,
+         "content": text,
+         "matches": matches,
+         "similarity": round(highest, 1),
+         "flagged": flagged,
+         "wordCount": len(text.split()),
+         "processingTime": processing_time,
+         "totalMatches": total_matches,
+         "averageSimilarity": round(avg_match_score, 1),
+         "sources": all_sources,
+         "uploadDate": datetime.utcnow().isoformat(),
+     }
+
+     # Save to MongoDB
+     try:
+         mongo_client = await get_mongo_client()
+         db = mongo_client.sluethink
+         reports_collection = db.reports
+
+         # Prepare document for MongoDB
+         report_doc = {
+             "name": file.filename,
+             "analysisType": "lexical",
+             "submittedBy": current_user.get("username", "System"),
+             "uploadDate": datetime.utcnow().strftime("%Y-%m-%d"),
+             "similarity": highest,
+             "status": "completed",
+             "flagged": flagged,
+             "fileCount": 1,
+             "processingTime": processing_time,
+             "avgSimilarity": avg_match_score,
+             "sources": all_sources,
+             "createdAt": datetime.utcnow(),
+             "userId": current_user.get("sub") or current_user.get("user_id"),
+             "content": text,
+             "wordCount": len(text.split()),
+             "matches": matches,
+             "totalMatches": total_matches,
+         }
+
+         # Insert into MongoDB
+         insert_result = await reports_collection.insert_one(report_doc)
+         print(f"\n💾 Report saved to MongoDB with ID: {insert_result.inserted_id}")
+
+         # Update the result with the MongoDB ID
+         result["id"] = str(insert_result.inserted_id)
+
+         mongo_client.close()
+
+     except Exception as e:
+         print(f"\n❌ Error saving to MongoDB: {str(e)}")
+         # Don't fail the request if MongoDB save fails
+         result["id"] = "temp_id"
+
+     print(f"\n🧾 Returning report:\n"
+           f"   Flagged: {flagged}\n"
+           f"   Avg Similarity: {avg_match_score:.1f}%\n"
+           f"   Highest Similarity: {highest}%\n"
+           f"   Total Matches: {total_matches}")
+
+     return result
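
The endpoint takes one multipart file plus a bearer token. A minimal client sketch (the base URL, file name, and token variable are assumptions for illustration):

# Sketch: call the student lexical-analysis endpoint with requests.
import requests

BASE_URL = "http://127.0.0.1:8000"  # assumed local deployment
token = "..."                        # a JWT accepted by verify_token

with open("essay.docx", "rb") as fh:
    resp = requests.post(
        f"{BASE_URL}/student/lexical-analysis",
        headers={"Authorization": f"Bearer {token}"},
        files={"file": ("essay.docx", fh)},
    )
resp.raise_for_status()
report = resp.json()
print(report["similarity"], report["flagged"], report["totalMatches"])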
app/routers/teacher/__pycache__/code_analysis.cpython-312.pyc ADDED
Binary file (16.5 kB).

app/routers/teacher/__pycache__/internal_analysis.cpython-312.pyc ADDED
Binary file (14 kB).

app/routers/teacher/__pycache__/lexical_analysis.cpython-312.pyc ADDED
Binary file (24.5 kB).

app/routers/teacher/__pycache__/semantic_analysis.cpython-312.pyc ADDED
Binary file (21.9 kB).

app/routers/teacher/internal_analysis.py ADDED
@@ -0,0 +1,323 @@
+ from fastapi import APIRouter, UploadFile, File, HTTPException, Depends
+ from typing import List, Tuple, Set
+ from datetime import datetime
+ from fastapi.security import OAuth2PasswordBearer
+ from jose import JWTError, jwt
+ from motor.motor_asyncio import AsyncIOMotorClient
+
+ from app.schemas.teacher_schemas import (
+     DocumentInfo, OverlapDetail, ComparisonDetail,
+     InternalReportDetail, InternalReportSummary
+ )
+ from app.utils.file_utils import extract_text_from_file, allowed_file
+ from app.utils.lexical_utils import (
+     find_partial_phrase_match_for_internal,
+     get_meaningful_sentences,
+     find_exact_matches,
+     find_partial_phrase_match,
+ )
+ from app.config import MONGODB_URI, ALGORITHM, SECRET_KEY
+
+ router = APIRouter(prefix="/teacher", tags=["teacher-internal"])
+
+ LEXICAL_PAIR_THRESHOLD = 0.50  # 50% - pairs above this are flagged
+ OVERLAP_MIN_TOKENS = 12
+
+ # Thresholds for color coding:
+ HIGH_SIMILARITY_THRESHOLD = 0.85  # 85% - Red (very high)
+ MEDIUM_SIMILARITY_THRESHOLD = 0.70  # 70% - Yellow (medium)
+ LOW_SIMILARITY_THRESHOLD = 0.50
+ oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/token")
+
+
+ def verify_token(token: str = Depends(oauth2_scheme)):
+     try:
+         return jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
+     except JWTError:
+         raise HTTPException(status_code=401, detail="Invalid or expired token")
+
+
+ async def get_mongo_client():
+     return AsyncIOMotorClient(MONGODB_URI)
+
+
+ def _percent(x: float) -> float:
+     return round(float(x) * 100.0, 1)
+
+
+ def _ordered_pair_key(i: int, j: int) -> str:
+     a, b = (i, j) if i < j else (j, i)
+     return f"{a}-{b}"
+
+
+ def _aggregate_pair_score(overlaps: List[OverlapDetail]) -> float:
+     return max((o.similarity for o in overlaps), default=0.0)
+
+
+ def _create_overlap_key(name_a: str, name_b: str, text: str, similarity: float, context: str) -> str:
+     """Create a unique key for overlap deduplication - includes context to distinguish different match types"""
+     # Normalize text to handle whitespace variations
+     text_normalized = ' '.join(text.split())
+     return f"{name_a}|{name_b}|{text_normalized}|{similarity}|{context}"
+
+
+ def _extract_matched_text_from_sentence(sent_b: str, phrase: str) -> str:
+     """Extract the actual text from sent_b that matches the phrase"""
+     if not sent_b or not phrase:
+         return phrase
+
+     # Normalize both for comparison
+     phrase_normalized = ' '.join(phrase.split()).lower()
+     sent_normalized = ' '.join(sent_b.split()).lower()
+     sent_b_normalized = ' '.join(sent_b.split())  # Keep original casing
+
+     # If phrase exists in sentence, extract it as-is from the original
+     if phrase_normalized in sent_normalized:
+         start_idx = sent_normalized.find(phrase_normalized)
+         end_idx = start_idx + len(phrase_normalized)
+         return sent_b_normalized[start_idx:end_idx].strip()
+
+     # If not found exactly, try to find similar chunks
+     # Split into words and try to find the best match
+     phrase_words = phrase_normalized.split()
+     sent_words = sent_normalized.split()
+
+     # Look for the phrase words in the sentence
+     for i in range(len(sent_words) - len(phrase_words) + 1):
+         if sent_words[i:i+len(phrase_words)] == phrase_words:
+             return ' '.join(sent_b_normalized.split()[i:i+len(phrase_words)])
+
+     # Fallback: return the phrase as-is
+     return phrase
+
+
+ def _find_overlaps_for_pair(
+     name_a: str, sents_a: List[str],
+     name_b: str, sents_b: List[str],
+     seen_overlaps: Set[str]
+ ) -> List[OverlapDetail]:
+     """Find all overlaps between two documents' sentences"""
+     overlaps: List[OverlapDetail] = []
+
+     for sent_a in sents_a:
+         # Check exact matches
+         for sent_b in sents_b:
+             exact_score = find_exact_matches(sent_a, sent_b)
+             if exact_score is not None:
+                 sim_pct = _percent(exact_score)
+                 if sim_pct >= LEXICAL_PAIR_THRESHOLD * 100:
+                     context = "Exact/near-exact sentence overlap"
+                     overlap_key = _create_overlap_key(name_a, name_b, sent_a, sim_pct, context)
+                     if overlap_key not in seen_overlaps:
+                         seen_overlaps.add(overlap_key)
+                         overlaps.append(OverlapDetail(
+                             fromDoc=name_a,
+                             toDoc=name_b,
+                             text=sent_a,
+                             similarity=sim_pct,
+                             sectionA=sent_a,
+                             sectionB=sent_b,
+                             context=context,
+                         ))
+
+         # Check partial phrase matches
+         best_partial = None
+         best_score = 0.0
+         best_sent_b = None
+
+         for sent_b in sents_b:
+             partial_result = find_partial_phrase_match_for_internal(sent_a, sent_b)
+             if partial_result:
+                 phrase, score = partial_result
+                 print(f"DEBUG: Partial match - phrase: {phrase[:80]}, score: {score}")
+                 if score > best_score:
+                     best_score = score
+                     best_partial = phrase
+                     best_sent_b = sent_b
+
+         # Add the best partial match if it meets the threshold
+         if best_partial and best_sent_b and len(best_partial.split()) >= OVERLAP_MIN_TOKENS:
+             sim_pct = _percent(best_score)
+             if sim_pct >= LEXICAL_PAIR_THRESHOLD * 100:
+                 context = "High-overlap phrase (shingle/containment)"
+                 overlap_key = _create_overlap_key(name_a, name_b, best_partial, sim_pct, context)
+                 if overlap_key not in seen_overlaps:
+                     seen_overlaps.add(overlap_key)
+                     overlaps.append(OverlapDetail(
+                         fromDoc=name_a,
+                         toDoc=name_b,
+                         text=best_partial,
+                         similarity=sim_pct,
+                         sectionA=sent_a,
+                         sectionB=best_sent_b,
+                         context=context,
+                     ))
+
+     return overlaps
+
+ @router.post("/internal-analysis", response_model=InternalReportDetail)
+ async def internal_analysis(
+     files: List[UploadFile] = File(...),
+     token_payload: dict = Depends(verify_token),
+     mongo: AsyncIOMotorClient = Depends(get_mongo_client),
+ ):
+     if len(files) < 2:
+         raise HTTPException(status_code=400, detail="Upload at least 2 files")
+
+     t0 = datetime.utcnow()
+
+     # --- Load & sentence-split all docs ---
+     docs: List[Tuple[str, List[str]]] = []
+     doc_infos: List[DocumentInfo] = []
+     doc_texts = {}
+
+     for idx, f in enumerate(files, start=1):
+         if not allowed_file(f.filename):
+             raise HTTPException(status_code=400, detail=f"Invalid file type: {f.filename}")
+         raw = await f.read()
+         text = extract_text_from_file(raw, f.filename) or ""
+         sents = get_meaningful_sentences(text)
+         doc_infos.append(DocumentInfo(id=idx, name=f.filename, author=None))
+         docs.append((f.filename, sents))
+         doc_texts[f.filename] = text
+
+     # --- Pairwise comparisons ---
+     comparisons: List[ComparisonDetail] = []
+     seen_overlaps: Set[str] = set()
+
+     for i in range(len(docs)):
+         for j in range(i + 1, len(docs)):
+             name_a, sents_a = docs[i]
+             name_b, sents_b = docs[j]
+
+             # Find all overlaps for this pair
+             overlaps = _find_overlaps_for_pair(
+                 name_a, sents_a,
+                 name_b, sents_b,
+                 seen_overlaps
+             )
+
+             # Calculate pair score and flag if needed
+             pair_score = _aggregate_pair_score(overlaps)
+             flagged = pair_score >= LEXICAL_PAIR_THRESHOLD * 100
+
+             comp = ComparisonDetail(
+                 id=_ordered_pair_key(i + 1, j + 1),
+                 docA=name_a,
+                 docB=name_b,
+                 similarity=round(pair_score, 1),
+                 flagged=flagged,
+                 overlaps=overlaps,
+                 contentA=doc_texts[name_a],
+                 contentB=doc_texts[name_b],
+             )
+             if flagged:
+                 comparisons.append(comp)
+
+     # --- Compute per-document results ---
+     doc_results = []
+     total_matches = 0
+     flagged_count = 0
+
+     for d_idx, d in enumerate(doc_infos, start=1):
+         name = d.name
+         word_count = len(doc_texts[name].split())
+         matches = [o for c in comparisons for o in c.overlaps if o.fromDoc == name or o.toDoc == name]
+         highest_similarity = max((o.similarity for o in matches), default=0.0)
+         flagged = highest_similarity >= LEXICAL_PAIR_THRESHOLD * 100
+         if flagged:
+             flagged_count += 1
+         total_matches += len(matches)
+
+         doc_results.append({
+             "id": d.id,
+             "name": d.name,
+             "similarity": round(highest_similarity, 1),
+             "flagged": flagged,
+             "wordCount": word_count,
+             "matchCount": len(matches),
+             "matches": matches
+         })
+
+     highest_any = max(d['similarity'] for d in doc_results) if doc_results else 0.0
+     avg_similarity = round(sum(d['similarity'] for d in doc_results) / len(doc_results), 1) if doc_results else 0.0
+     elapsed = (datetime.utcnow() - t0).total_seconds()
+     processing = f"{int(elapsed // 60)}m {int(elapsed % 60):02d}s"
+
+     report = InternalReportDetail(
+         id="internal_report",
+         name="Internal Plagiarism Check",
+         uploadDate=datetime.utcnow(),
+         processingTime=processing,
+         documents=doc_infos,
+         comparisons=comparisons,
+         summary=InternalReportSummary(
+             totalDocuments=len(doc_results),
+             totalComparisons=(len(docs) * (len(docs) - 1)) // 2,
+             flaggedComparisons=flagged_count,
+             highestSimilarity=round(highest_any, 1),
+             averageSimilarity=avg_similarity,
+         ),
+     )
+
+     # --- Save to MongoDB ---
+     try:
+         db = mongo.sluethink
+         reports_collection = db.reports
+
+         all_sources = set()
+         for comp in comparisons:
+             for o in comp.overlaps:
+                 all_sources.add(o.toDoc)
+
+         report_doc = {
+             "name": f"Internal_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}",
+             "analysisType": "internal",
+             "submittedBy": token_payload.get("name", "System"),
+             "uploadDate": datetime.utcnow().strftime("%Y-%m-%d"),
+             "similarity": highest_any,
+             "status": "completed",
+             "flagged": flagged_count > 0,
+             "fileCount": len(doc_results),
+             "processingTime": processing,
+             "avgSimilarity": avg_similarity,
+             "totalMatches": total_matches,
+             "sources": list(all_sources),
+             "createdAt": datetime.utcnow(),
+             "userId": token_payload.get("sub") or token_payload.get("user_id"),
+             "documents": [
+                 {
+                     "id": d['id'],
+                     "name": d['name'],
+                     "similarity": d['similarity'],
+                     "flagged": d['flagged'],
+                     "wordCount": d['wordCount'],
+                     "matchCount": d['matchCount'],
+                     "matches": [
+                         {
+                             "matched_text": m.text,
+                             "similarity": m.similarity,
+                             "source_url": m.toDoc,
+                             "source_title": m.toDoc,
+                             "source_type": "internal",
+                         } for m in d['matches']
+                     ]
+                 } for d in doc_results
+             ],
+             "summary": {
+                 "totalDocuments": len(doc_results),
+                 "flaggedDocuments": flagged_count,
+                 "highestSimilarity": highest_any,
+                 "averageSimilarity": avg_similarity,
+                 "totalMatches": total_matches,
+             }
+         }
+
+         insert_result = await reports_collection.insert_one(report_doc)
+         print(f"💾 Report saved to MongoDB with ID: {insert_result.inserted_id}")
+         report.id = str(insert_result.inserted_id)
+
+     except Exception as e:
+         print(f"❌ Error saving to MongoDB: {str(e)}")
+
+     return report
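
Because the endpoint compares every document against every other one, a request must carry at least two files under the same multipart field name, and the number of comparisons grows as n(n-1)/2. A minimal client sketch (base URL, file names, and token are assumptions):

# Sketch: submit several files to the pairwise internal-analysis endpoint.
import requests

BASE_URL = "http://127.0.0.1:8000"  # assumed local deployment
token = "..."                        # a JWT accepted by verify_token

names = ["a.txt", "b.txt", "c.txt"]  # 3 docs -> 3 pairwise comparisons
files = [("files", (n, open(n, "rb"))) for n in names]
try:
    resp = requests.post(
        f"{BASE_URL}/teacher/internal-analysis",
        headers={"Authorization": f"Bearer {token}"},
        files=files,
    )
finally:
    for _, (_, fh) in files:
        fh.close()
resp.raise_for_status()
print(resp.json()["summary"])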
app/routers/teacher/lexical_analysis.py ADDED
@@ -0,0 +1,472 @@
+ from fastapi import APIRouter, UploadFile, File, HTTPException, Depends
+ from typing import List
+ from datetime import datetime
+ from fastapi.security import OAuth2PasswordBearer
+ from jose import JWTError, jwt
+ from motor.motor_asyncio import AsyncIOMotorClient
+ import logging
+ import asyncio
+ import threading
+
+ from app.config import MONGODB_URI, ALGORITHM, SECRET_KEY
+ from app.schemas.teacher_schemas import (
+     TeacherLexicalBatchReport, TeacherLexicalSummary,
+     LexicalDocResult, LexicalMatch
+ )
+ from app.utils.file_utils import extract_text_from_file, allowed_file
+ from app.utils.lexical_utils import (
+     get_meaningful_sentences, extract_keywords,
+     find_exact_matches, find_partial_phrase_match,
+ )
+ from app.utils.web_utils import fetch_sources_multi_query
+
+ router = APIRouter(prefix="/teacher", tags=["teacher-lexical"])
+
+ LEXICAL_DOC_THRESHOLD = 0.85  # 85%
+
+ # ✅ HARD TIMEOUT: 3 minutes (180 seconds) for all queries combined
+ SCRAPING_TIMEOUT = 180
+
+ oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/token")
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger("lexical_analysis")
+
+ def verify_token(token: str = Depends(oauth2_scheme)):
+     try:
+         return jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
+     except JWTError:
+         raise HTTPException(status_code=401, detail="Invalid or expired token")
+
+ async def get_mongo_client():
+     return AsyncIOMotorClient(MONGODB_URI)
+
+ def generate_five_queries(text: str) -> List[str]:
+     """
+     Generate 5 high-quality search queries from a document.
+     Covers: beginning, 1/4, middle, 3/4, end.
+     """
+     from app.utils.lexical_utils import get_meaningful_sentences
+
+     logger.info("  🔍 Generating 5 lexical queries from content...")
+
+     sentences = get_meaningful_sentences(text)
+     if len(sentences) < 5:
+         logger.warning("  ⚠️ Not enough sentences, using fewer queries")
+         # Fallback for short documents
+         words = text.split()
+         return [
+             ' '.join(words[:30]) if len(words) > 0 else text,
+             ' '.join(words[max(0, len(words)//4):max(0, len(words)//4)+30]) if len(words) > 30 else text,
+             ' '.join(words[max(0, len(words)//2):max(0, len(words)//2)+30]) if len(words) > 30 else text,
+         ]
+
+     queries = []
+
+     # ✅ Query 1: BEGINNING - First 3-4 sentences
+     beginning_end = min(4, len(sentences))
+     query1 = ' '.join(sentences[:beginning_end])
+     queries.append(query1)
+     logger.debug(f"  Query 1 length: {len(query1.split())} words")
+
+     # ✅ Query 2: QUARTER-POINT - Around 25% of the document
+     quarter_start = max(beginning_end, len(sentences) // 4)
+     quarter_end = min(quarter_start + 4, len(sentences))
+     query2 = ' '.join(sentences[quarter_start:quarter_end])
+     queries.append(query2)
+     logger.debug(f"  Query 2 length: {len(query2.split())} words")
+
+     # ✅ Query 3: MIDDLE - Around 50% of the document
+     mid_start = max(quarter_end, len(sentences) // 2)
+     mid_end = min(mid_start + 4, len(sentences))
+     query3 = ' '.join(sentences[mid_start:mid_end])
+     queries.append(query3)
+     logger.debug(f"  Query 3 length: {len(query3.split())} words")
+
+     # ✅ Query 4: THREE-QUARTER-POINT - Around 75% of the document
+     three_quarter_start = max(mid_end, int(len(sentences) * 0.75))
+     three_quarter_end = min(three_quarter_start + 4, len(sentences))
+     query4 = ' '.join(sentences[three_quarter_start:three_quarter_end])
+     queries.append(query4)
+     logger.debug(f"  Query 4 length: {len(query4.split())} words")
+
+     # ✅ Query 5: END - Last 3-4 sentences
+     end_start = max(three_quarter_end, len(sentences) - 4)
+     query5 = ' '.join(sentences[end_start:])
+     queries.append(query5)
+     logger.debug(f"  Query 5 length: {len(query5.split())} words")
+
+     # ✅ Validate queries
+     final_queries = []
+     for q in queries:
+         q = q.strip()
+         if len(q.split()) >= 15:  # Minimum 15 words for a good search
+             final_queries.append(q)
+
+     logger.info(f"  ✅ Generated {len(final_queries)} queries:")
+     for i, q in enumerate(final_queries, 1):
+         word_count = len(q.split())
+         preview = q[:80] + "..." if len(q) > 80 else q
+         logger.info(f"    Query {i} ({word_count} words): {preview}")
+
+     return final_queries
+
+ class ScrapingTimeoutManager:
+     """Manages web scraping with a hard 3-minute overall timeout"""
+
+     def __init__(self, timeout_seconds: int = 180):
+         self.timeout = timeout_seconds
+         self.start_time = None
+         self.sources = []
+         self.lock = threading.Lock()
+         self.cancelled = False
+
+     def elapsed(self) -> float:
+         """Get elapsed time in seconds"""
+         if self.start_time is None:
+             return 0.0
+         return (datetime.utcnow() - self.start_time).total_seconds()
+
+     def is_timeout(self) -> bool:
+         """Check whether the 3-minute timeout was exceeded"""
+         return self.elapsed() >= self.timeout
+
+     async def fetch_all_sources(self, queries: List[str], num_results: int = 10) -> List:
+         """
+         Fetch sources for all 5 queries with a hard 180-second overall timeout.
+         Immediately stops and starts matching when the timeout is reached.
+         """
+         self.start_time = datetime.utcnow()
+         self.sources = []
+
+         logger.info(f"\n🔎 WEB SCRAPING PHASE")
+         logger.info(f"  Max Duration: {self.timeout}s (3 minutes)")
+         logger.info(f"  Queries: {len(queries)}")
+         logger.info(f"  Starting: {self.start_time.strftime('%H:%M:%S')}")
+
+         # Process all queries in parallel with timeout
+         tasks = []
+         for query_idx, query in enumerate(queries, 1):
+             logger.info(f"\n  Query {query_idx}/{len(queries)}: {query[:60]}...")
+             tasks.append(asyncio.ensure_future(self._fetch_query(query, num_results)))  # real Tasks, so the cancel loop below works
+
+         try:
+             # Wait for all tasks with overall timeout
+             await asyncio.wait_for(
+                 asyncio.gather(*tasks, return_exceptions=True),
+                 timeout=self.timeout
+             )
+         except asyncio.TimeoutError:
+             logger.warning(f"\n🛑 HARD TIMEOUT REACHED after {self.elapsed():.1f}s")
+             logger.warning(f"  Cancelling all pending queries")
+             self.cancelled = True
+             # Cancel remaining tasks
+             for task in tasks:
+                 if isinstance(task, asyncio.Task):
+                     task.cancel()
+
+         # Remove duplicates
+         seen_urls = set()
+         unique_sources = []
+         for source in self.sources:
+             url = source.get('url', '')
+             if url and url not in seen_urls:
+                 seen_urls.add(url)
+                 unique_sources.append(source)
+
+         elapsed = self.elapsed()
+         logger.info(f"\n✅ SCRAPING PHASE STOPPED")
+         logger.info(f"  Total Duration: {elapsed:.1f}s ({int(elapsed)//60}m {int(elapsed)%60}s)")
+         logger.info(f"  Unique Sources: {len(unique_sources)}")
+         logger.info(f"  Status: {'🛑 TIMEOUT' if self.is_timeout() else '✅ COMPLETED'}")
+
+         return unique_sources
+
+     async def _fetch_query(self, query: str, num_results: int = 10):
+         """Fetch sources for a single query"""
+         try:
+             sources = await asyncio.to_thread(
+                 fetch_sources_multi_query,
+                 query,
+                 num_results
+             )
+
+             with self.lock:
+                 self.sources.extend(sources)
+
+             logger.info(f"    ✅ Found {len(sources)} sources")
+
+         except asyncio.CancelledError:
+             logger.warning(f"    ⏭️ Query cancelled (timeout)")
+         except Exception as e:
+             logger.error(f"    ❌ Error: {e}")
+
+ @router.post("/lexical-analysis", response_model=TeacherLexicalBatchReport)
+ async def teacher_lexical_analysis(
+     files: List[UploadFile] = File(...),
+     current_user=Depends(verify_token),
+ ):
+     if not files:
+         raise HTTPException(status_code=400, detail="No files uploaded")
+
+     t0 = datetime.utcnow()
+     doc_results: List[LexicalDocResult] = []
+     total_matches = 0
+
+     logger.info(f"\n{'='*80}")
+     logger.info(f"🔍 LEXICAL ANALYSIS - {len(files)} file(s)")
+     logger.info(f"{'='*80}")
+
+     for idx, f in enumerate(files, start=1):
+         if not allowed_file(f.filename):
+             raise HTTPException(status_code=400, detail=f"Invalid file type: {f.filename}")
+
+         raw = await f.read()
+         try:
+             text = extract_text_from_file(raw, f.filename) or ""
+         except ValueError as ve:
+             # Catch oversized files
+             raise HTTPException(status_code=400, detail=str(ve))
+
+         sentences = get_meaningful_sentences(text)
+
+         logger.info(f"\n📄 File {idx}: {f.filename}")
+         logger.info(f"  Sentences: {len(sentences)}")
+         logger.info(f"  Words: {len(text.split())}")
+
+         # ✅ Generate 5 lexical queries
+         queries = generate_five_queries(text)
+
+         # ✅ WEB SCRAPING WITH 3-MINUTE HARD TIMEOUT (OVERALL)
+         scraper = ScrapingTimeoutManager(timeout_seconds=SCRAPING_TIMEOUT)
+         sources = await scraper.fetch_all_sources(queries, num_results=5)
+
+         # ✅ RESET TIMEOUT - Scraping phase is done, matching has no time limit
+         from app.utils import web_utils
+         web_utils._scraping_deadline = None
+         web_utils._scraping_start_time = None
+
+         logger.info(f"  Total unique sources: {len(sources)}")
+
+         if not sources:
+             logger.warning(f"  ⚠️ No sources found, skipping lexical matching")
+             doc_results.append(LexicalDocResult(
+                 id=idx,
+                 name=f.filename,
+                 author=None,
+                 similarity=0.0,
+                 flagged=False,
+                 wordCount=len(text.split()),
+                 matches=[],
+                 content=text
+             ))
+             continue
+
+         matches: List[LexicalMatch] = []
+         highest = 0.0
+         source_matches_count = {}
+
+         # ✅ MATCHING PHASE (starts immediately after timeout)
+         logger.info(f"\n📊 LEXICAL MATCHING PHASE")
+         logger.info(f"  Comparing {len(sentences)} sentences against {len(sources)} sources...")
+
+         externals = [
+             {
+                 "title": s.get("url", "Unknown"),
+                 "text": s.get("content", ""),
+                 "source_url": s.get("url", ""),
+                 "type": "web",
+             }
+             for s in sources if s.get("content")
+         ]
+
+         for ext in externals:
+             logger.info(f"  🌐 Source: {ext['source_url'][:60]}...")
+             source_matches_count[ext['source_url']] = 0
+
+         # Compare each sentence against ALL sources
+         for s in sentences:
+             best_overall_score = 0.0
+             best_overall_match = None
+             best_overall_src = None
+
+             for ext in externals:
+                 # Try exact match first
+                 sim = find_exact_matches(s, ext["text"])
+                 if sim is not None and sim > best_overall_score:
+                     best_overall_score = sim
+                     best_overall_match = s
+                     best_overall_src = ext
+                     continue
+
+                 # Try partial phrase match
+                 pp = find_partial_phrase_match(s, ext["text"])
+                 if pp:
+                     phrase, score = pp
+                     if score > best_overall_score:
+                         best_overall_score = score
+                         best_overall_match = phrase
+                         best_overall_src = ext
+
+             # Add match if found and above threshold (50%)
+             if best_overall_match and best_overall_score > 0.0:
+                 pct = round(best_overall_score * 100.0, 1)
+
+                 if pct >= 50:
+                     matches.append(LexicalMatch(
+                         matched_text=best_overall_match,
+                         similarity=pct,
+                         source_type=best_overall_src["type"],
+                         source_title=best_overall_src["title"],
+                         source_url=best_overall_src["source_url"],
+                         section=None,
+                         context="Potential plagiarism detected",
+                     ))
+                     source_matches_count[best_overall_src['source_url']] += 1
+                     highest = max(highest, pct)
+                     total_matches += 1
+                     logger.debug(f"    ✅ Match ({pct}%) with {best_overall_src['source_url'][:50]}")
+
+         # Better flagging logic considering multiple sources
+         num_sources_with_matches = sum(1 for c in source_matches_count.values() if c > 0)
+         avg_match_score = (sum(m.similarity for m in matches) / len(matches)) if matches else 0.0
+
+         # Flag if any of these conditions are met:
+         # 1. Single source with high similarity (>85%)
+         # 2. Content plagiarized from 2+ different sources
+         # 3. 3+ matches with average >70%
+         flagged = (
+             highest >= 85 or
+             num_sources_with_matches >= 2 or
+             (len(matches) >= 3 and avg_match_score >= 70)
+         )
+
+         logger.info(f"  📈 Results:")
+         logger.info(f"    Highest similarity: {highest:.1f}%")
+         logger.info(f"    Total matches: {len(matches)}")
+         logger.info(f"    Sources with matches: {num_sources_with_matches}")
+         logger.info(f"    Average match score: {avg_match_score:.1f}%")
+         logger.info(f"    Flagged: {flagged}")
+
+         doc_results.append(LexicalDocResult(
+             id=idx,
+             name=f.filename,
+             author=None,
+             similarity=round(highest, 1),
+             flagged=flagged,
+             wordCount=len(text.split()),
+             matches=matches,
+             content=text  # Include full document for frontend
+         ))
+
+     highest_any = max((d.similarity for d in doc_results), default=0.0)
+     avg = round(sum(d.similarity for d in doc_results) / len(doc_results), 1) if doc_results else 0.0
+     flagged_count = sum(1 for d in doc_results if d.flagged)
+
+     elapsed = (datetime.utcnow() - t0).total_seconds()
+     mm = int(elapsed // 60)
+     ss = int(elapsed % 60)
+     processing = f"{mm}m {ss:02d}s"
+
+     logger.info(f"\n{'='*80}")
+     logger.info(f"✅ ANALYSIS COMPLETE")
+     logger.info(f"{'='*80}")
+     logger.info(f"  Documents: {len(doc_results)}")
+     logger.info(f"  Flagged: {flagged_count}")
+     logger.info(f"  Highest: {highest_any}%")
+     logger.info(f"  Average: {avg}%")
+     logger.info(f"  Total Matches: {total_matches}")
+     logger.info(f"  Total Time: {processing}\n")
+
+     result = TeacherLexicalBatchReport(
+         id="teacher_lexical_batch",
+         name="Teacher Lexical Analysis",
+         uploadDate=datetime.utcnow(),
+         processingTime=processing,
+         documents=doc_results,
+         summary=TeacherLexicalSummary(
+             totalDocuments=len(doc_results),
+             flaggedDocuments=flagged_count,
+             highestSimilarity=highest_any,
+             averageSimilarity=avg,
+             totalMatches=total_matches,
+         ),
+     )
+
+     # Save to MongoDB
+     try:
+         mongo_client = await get_mongo_client()
+         db = mongo_client.sluethink
+         reports_collection = db.reports
+
+         # Extract unique sources from all matches
+         all_sources = set()
+         for doc in doc_results:
+             for match in doc.matches:
+                 all_sources.add(match.source_url)
+
+         # Prepare document for MongoDB
+         report_doc = {
+             "name": f"Lexical_Batch_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}",
+             "analysisType": "lexical",
+             "submittedBy": current_user.get("username", "System"),
+             "uploadDate": datetime.utcnow().strftime("%Y-%m-%d"),
+             "similarity": highest_any,
+             "status": "completed",
+             "flagged": flagged_count > 0,
+             "fileCount": len(doc_results),
+             "processingTime": processing,
+             "avgSimilarity": avg,
+             "sources": list(all_sources),
+             "createdAt": datetime.utcnow(),
+             "userId": current_user.get("sub") or current_user.get("user_id"),
+             # Store full analysis details
+             "documents": [
+                 {
+                     "id": doc.id,
+                     "name": doc.name,
+                     "similarity": doc.similarity,
+                     "flagged": doc.flagged,
+                     "wordCount": doc.wordCount,
+                     "matchCount": len(doc.matches),
+                     "matches": [
+                         {
+                             "matched_text": m.matched_text,
+                             "similarity": m.similarity,
+                             "source_url": m.source_url,
+                             "source_title": m.source_title,
+                             "source_type": m.source_type,
+                         }
+                         for m in doc.matches
+                     ]
+                 }
+                 for doc in doc_results
+             ],
+             "summary": {
+                 "totalDocuments": result.summary.totalDocuments,
+                 "flaggedDocuments": result.summary.flaggedDocuments,
+                 "highestSimilarity": result.summary.highestSimilarity,
+                 "averageSimilarity": result.summary.averageSimilarity,
+                 "totalMatches": result.summary.totalMatches,
+             }
+         }
+
+         # Insert into MongoDB
+         insert_result = await reports_collection.insert_one(report_doc)
+         logger.info(f"💾 Report saved to MongoDB with ID: {insert_result.inserted_id}")
+
+         # Update the result with the MongoDB ID
+         result.id = str(insert_result.inserted_id)
+
+         mongo_client.close()
+
+     except Exception as e:
+         logger.error(f"❌ Error saving to MongoDB: {str(e)}")
+
+     logger.info(f"\n🧾 Returning report:")
+     logger.info(f"  Total Docs: {result.summary.totalDocuments}")
+     logger.info(f"  Flagged Docs: {result.summary.flaggedDocuments}")
+     logger.info(f"  Avg Similarity: {result.summary.averageSimilarity}%")
+     logger.info(f"  Highest Similarity: {result.summary.highestSimilarity}%")
+     logger.info(f"  Total Matches: {result.summary.totalMatches}\n")
+
+     return result
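
The ScrapingTimeoutManager above bounds the whole scraping phase rather than each query: the blocking fetches run concurrently via asyncio.to_thread, asyncio.wait_for enforces one shared deadline, and whatever landed in self.sources before the deadline is kept. The pattern in isolation, as a minimal sketch (slow_fetch is a stand-in for fetch_sources_multi_query):

# Sketch: one hard deadline over several blocking fetches; partial
# results gathered before the timeout are kept rather than discarded.
import asyncio, time

results = []

def slow_fetch(q: str, delay: float) -> str:
    # Stand-in for a blocking scraper call such as fetch_sources_multi_query.
    time.sleep(delay)
    return f"result for {q}"

async def fetch(q: str, delay: float) -> None:
    results.append(await asyncio.to_thread(slow_fetch, q, delay))

async def main() -> None:
    jobs = [("q1", 0.5), ("q2", 1.0), ("q3", 5.0)]
    tasks = [asyncio.ensure_future(fetch(q, d)) for q, d in jobs]
    try:
        await asyncio.wait_for(asyncio.gather(*tasks), timeout=2.0)
    except asyncio.TimeoutError:
        for t in tasks:
            t.cancel()  # the awaiting coroutine stops; the worker thread drains on its own
    print(results)      # only the fetches that beat the 2-second deadline

asyncio.run(main())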
app/routers/teacher/semantic_analysis.py ADDED
@@ -0,0 +1,406 @@
+ from fastapi import APIRouter, UploadFile, File, HTTPException, Depends
+ from typing import List
+ from datetime import datetime
+ from fastapi.security import OAuth2PasswordBearer
+ from jose import JWTError, jwt
+ from motor.motor_asyncio import AsyncIOMotorClient
+ import logging
+ import asyncio
+ import threading
+ from concurrent.futures import ThreadPoolExecutor
+
+ from app.config import MONGODB_URI, ALGORITHM, SECRET_KEY
+ from app.schemas.teacher_schemas import (
+     TeacherLexicalBatchReport, TeacherLexicalSummary,
+     LexicalDocResult, LexicalMatch
+ )
+ from app.utils.file_utils import extract_text_from_file, allowed_file
+ from app.utils.semantic_utils import (
+     generate_five_queries,
+     find_semantic_matches,
+ )
+ from app.utils.web_utils import fetch_sources_multi_query
+ from app.utils.ai_detector import detect_ai_similarity
+
+ router = APIRouter(prefix="/teacher", tags=["teacher-semantic"])
+
+ SEMANTIC_THRESHOLD = 0.50
+ SCRAPING_TIMEOUT = 180  # 3 minutes total for all queries combined
+
+ oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/token")
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger("semantic_analysis")
+
+ def verify_token(token: str = Depends(oauth2_scheme)):
+     try:
+         return jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
+     except JWTError:
+         raise HTTPException(status_code=401, detail="Invalid or expired token")
+
+ async def get_mongo_client():
+     return AsyncIOMotorClient(MONGODB_URI)
+
+ class ScrapingTimeoutManager:
+     """Manages web scraping with a hard 3-minute overall timeout"""
+
+     def __init__(self, timeout_seconds: int = 180):
+         self.timeout = timeout_seconds
+         self.start_time = None
+         self.sources = []
+         self.executor = ThreadPoolExecutor(max_workers=4)
+         self.lock = threading.Lock()
+         self.cancelled = False
+
+     def elapsed(self) -> float:
+         """Get elapsed time in seconds"""
+         if self.start_time is None:
+             return 0.0
+         return (datetime.utcnow() - self.start_time).total_seconds()
+
+     def is_timeout(self) -> bool:
+         """Check whether the 3-minute timeout was exceeded"""
+         return self.elapsed() >= self.timeout
+
+     async def fetch_all_queries(self, queries: List[str], num_results: int = 5) -> List:
+         """
+         Fetch sources for all queries with a hard 180-second overall timeout.
+         Immediately stops and starts matching when the timeout is reached.
+         """
+         self.start_time = datetime.utcnow()
+         self.sources = []
+
+         logger.info(f"\n🔎 WEB SCRAPING PHASE")
+         logger.info(f"  Max Duration: {self.timeout}s (3 minutes)")
+         logger.info(f"  Queries: {len(queries)}")
+         logger.info(f"  Starting: {self.start_time.strftime('%H:%M:%S')}")
+
+         # Process all queries in parallel with timeout
+         tasks = []
+         for query_idx, query in enumerate(queries, 1):
+             logger.info(f"\n  Query {query_idx}/{len(queries)}: {query[:60]}...")
+             tasks.append(asyncio.ensure_future(self._fetch_query(query, num_results)))  # real Tasks, so the cancel loop below works
+
+         try:
+             # Wait for all tasks with overall timeout
+             await asyncio.wait_for(
+                 asyncio.gather(*tasks, return_exceptions=True),
+                 timeout=self.timeout
+             )
+         except asyncio.TimeoutError:
+             logger.warning(f"\n🛑 HARD TIMEOUT REACHED after {self.elapsed():.1f}s")
+             logger.warning(f"  Cancelling all pending queries")
+             self.cancelled = True
+             # Cancel remaining tasks
+             for task in tasks:
+                 if isinstance(task, asyncio.Task):
+                     task.cancel()
+
+         # Remove duplicates
+         seen_urls = set()
+         unique_sources = []
+         for source in self.sources:
+             url = source.get('url', '')
+             if url and url not in seen_urls:
+                 seen_urls.add(url)
+                 unique_sources.append(source)
+
+         elapsed = self.elapsed()
+         logger.info(f"\n✅ SCRAPING PHASE STOPPED")
+         logger.info(f"  Total Duration: {elapsed:.1f}s ({int(elapsed)//60}m {int(elapsed)%60}s)")
+         logger.info(f"  Unique Sources: {len(unique_sources)}")
+         logger.info(f"  Status: {'🛑 TIMEOUT' if self.is_timeout() else '✅ COMPLETED'}")
+
+         return unique_sources
+
+     async def _fetch_query(self, query: str, num_results: int = 5):
+         """Fetch sources for a single query"""
+         try:
+             sources = await asyncio.to_thread(
+                 fetch_sources_multi_query,
+                 query,
+                 num_results
+             )
+
+             with self.lock:
+                 self.sources.extend(sources)
+
+             logger.info(f"    ✅ Found {len(sources)} sources")
+
+         except asyncio.CancelledError:
+             logger.warning(f"    ⏭️ Query cancelled (timeout)")
+         except Exception as e:
+             logger.error(f"    ❌ Error: {e}")
+
+     def cleanup(self):
+         """Clean up executor"""
+         try:
+             self.executor.shutdown(wait=False)
+         except Exception:
+             pass
+
+ @router.post("/semantic-analysis", response_model=TeacherLexicalBatchReport)
+ async def teacher_semantic_analysis(
+     files: List[UploadFile] = File(...),
+     current_user=Depends(verify_token),
+ ):
+     if not files:
+         raise HTTPException(status_code=400, detail="No files uploaded")
+
+     t0 = datetime.utcnow()
+     doc_results: List[LexicalDocResult] = []
+     total_matches = 0
+
+     logger.info(f"\n{'='*80}")
+     logger.info(f"🧠 SEMANTIC ANALYSIS - {len(files)} file(s)")
+     logger.info(f"{'='*80}")
+
+     for idx, f in enumerate(files, start=1):
+         if not allowed_file(f.filename):
+             raise HTTPException(status_code=400, detail=f"Invalid file type: {f.filename}")
+
+         raw = await f.read()
+         text = extract_text_from_file(raw, f.filename) or ""
+
+         logger.info(f"\n📄 File {idx}: {f.filename}")
+         logger.info(f"  Words: {len(text.split())}")
+
+         # ✅ AI DETECTION
+         logger.info(f"🤖 Running AI detection...")
+         ai_similarity = detect_ai_similarity(text)
+         logger.info(f"  AI Similarity: {ai_similarity}")
+
+         # Generate semantic queries (five document regions)
+         queries = generate_five_queries(text)
+
+         # ✅ WEB SCRAPING WITH 3-MINUTE HARD TIMEOUT (OVERALL)
+         scraper = ScrapingTimeoutManager(timeout_seconds=SCRAPING_TIMEOUT)
+         try:
+             unique_sources = await scraper.fetch_all_queries(queries, num_results=5)
+         finally:
+             scraper.cleanup()
+
+         logger.info(f"  Total unique sources: {len(unique_sources)}")
+
+         if not unique_sources:
+             logger.warning(f"  ⚠️ No sources found, skipping semantic matching")
+             doc_results.append(LexicalDocResult(
+                 id=idx,
+                 name=f.filename,
+                 author=None,
+                 similarity=0.0,
+                 flagged=False,
+                 wordCount=len(text.split()),
+                 matches=[],
+                 content=text[:5000],
+                 ai_similarity=ai_similarity
+             ))
+             continue
+
+         # ✅ MATCHING PHASE (starts immediately after timeout)
+         logger.info(f"\n📊 SEMANTIC MATCHING PHASE")
+         logger.info(f"  Comparing against {len(unique_sources)} sources...")
+
+         matches: List[LexicalMatch] = []
+         highest = 0.0
+         source_matches_count = {}
+
+         # Prepare externals
+         externals = [
+             {
+                 "title": s.get("url", "Unknown"),
+                 "text": s.get("content", ""),
+                 "source_url": s.get("url", ""),
+                 "type": "web",
+             }
+             for s in unique_sources if s.get("content")
+         ]
+
+         for ext_idx, ext in enumerate(externals, 1):
+             logger.info(f"  Source {ext_idx}/{len(externals)}: {ext['source_url'][:60]}...")
+             source_matches_count[ext['source_url']] = 0
+
+             try:
+                 # Semantic comparison
+                 semantic_matches = find_semantic_matches(
+                     text,
+                     ext["text"],
+                     threshold=SEMANTIC_THRESHOLD
+                 )
+
+                 logger.info(f"    Found {len(semantic_matches)} semantic matches")
+
+                 for match in semantic_matches:
+                     similarity_pct = round(match['similarity'] * 100, 1)
+
+                     matches.append(LexicalMatch(
+                         matched_text=match['doc_text'][:300],
+                         similarity=similarity_pct,
+                         source_type=ext["type"],
+                         source_title=ext["title"],
+                         source_url=ext["source_url"],
+                         section=None,
+                         context="Semantic similarity detected (possible paraphrasing)",
+                     ))
+
+                     source_matches_count[ext['source_url']] += 1
+                     highest = max(highest, similarity_pct)
+                     total_matches += 1
+
+                     logger.debug(f"    Match: {similarity_pct}% - {match['doc_text'][:50]}...")
+
+             except Exception as e:
+                 logger.error(f"    Error matching source: {e}")
+                 continue
+
+         # Deduplicate matches
+         logger.info(f"  🔄 Deduplicating {len(matches)} matches...")
+         unique_matches_dict = {}
+
+         for match in matches:
+             key = match.matched_text.lower().strip()
+             if key not in unique_matches_dict or match.similarity > unique_matches_dict[key].similarity:
+                 unique_matches_dict[key] = match
+
+         matches = list(unique_matches_dict.values())
+         logger.info(f"  ✅ Deduplicated to {len(matches)} unique matches")
+
+         # Recalculate metrics
+         highest = max((m.similarity for m in matches), default=0.0)
+         source_matches_count = {}
+         for match in matches:
+             source_matches_count[match.source_url] = source_matches_count.get(match.source_url, 0) + 1
+
+         # Flagging logic
+         num_sources_with_matches = sum(1 for c in source_matches_count.values() if c > 0)
+         avg_match_score = (sum(m.similarity for m in matches) / len(matches)) if matches else 0.0
+
+         flagged = (
+             highest >= 80 or
+             num_sources_with_matches >= 2 or
+             (len(matches) >= 2 and avg_match_score >= 70)
+         )
+
+         logger.info(f"  📈 Results:")
+         logger.info(f"    Highest: {highest:.1f}%")
+         logger.info(f"    Total matches: {len(matches)}")
+         logger.info(f"    Sources with matches: {num_sources_with_matches}")
+         logger.info(f"    Average: {avg_match_score:.1f}%")
+         logger.info(f"    Flagged: {flagged}")
+
+         doc_results.append(LexicalDocResult(
+             id=idx,
+             name=f.filename,
+             author=None,
+             similarity=round(highest, 1),
+             flagged=flagged,
+             wordCount=len(text.split()),
+             matches=matches,
+             content=text[:5000],
+             ai_similarity=ai_similarity
+         ))
+
+     # Final summary
+     highest_any = max((d.similarity for d in doc_results), default=0.0)
+     avg = round(sum(d.similarity for d in doc_results) / len(doc_results), 1) if doc_results else 0.0
+     flagged_count = sum(1 for d in doc_results if d.flagged)
+     avg_ai_similarity = round(sum(d.ai_similarity for d in doc_results) / len(doc_results), 3) if doc_results else 0.0
+
+     elapsed = (datetime.utcnow() - t0).total_seconds()
+     mm = int(elapsed // 60)
+     ss = int(elapsed % 60)
+     processing = f"{mm}m {ss:02d}s"
+
+     logger.info(f"\n{'='*80}")
+     logger.info(f"✅ ANALYSIS COMPLETE")
+     logger.info(f"{'='*80}")
+     logger.info(f"  Documents: {len(doc_results)}")
+     logger.info(f"  Flagged: {flagged_count}")
+     logger.info(f"  Highest Semantic Similarity: {highest_any}%")
+     logger.info(f"  Average Semantic Similarity: {avg}%")
+     logger.info(f"  Average AI Similarity: {avg_ai_similarity}")
+     logger.info(f"  Total Matches: {total_matches}")
+     logger.info(f"  Total Time: {processing}\n")
+
+     result = TeacherLexicalBatchReport(
+         id="teacher_semantic_batch",
+         name="Teacher Semantic Analysis",
+         uploadDate=datetime.utcnow(),
+         processingTime=processing,
+         documents=doc_results,
+         summary=TeacherLexicalSummary(
+             totalDocuments=len(doc_results),
+             flaggedDocuments=flagged_count,
+             highestSimilarity=highest_any,
+             averageSimilarity=avg,
+             totalMatches=total_matches,
+             averageAiSimilarity=avg_ai_similarity,
+         ),
+     )
+
+     # Save to MongoDB
+     try:
+         mongo_client = await get_mongo_client()
+         db = mongo_client.sluethink
+         reports_collection = db.reports
+
+         all_sources = set()
+         for doc in doc_results:
+             for match in doc.matches:
+                 all_sources.add(match.source_url)
+
+         report_doc = {
+             "name": f"Semantic_Batch_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}",
+             "analysisType": "semantic",
+             "submittedBy": current_user.get("username", "System"),
+             "uploadDate": datetime.utcnow().strftime("%Y-%m-%d"),
+             "similarity": highest_any,
+             "aiSimilarity": avg_ai_similarity,
+             "status": "completed",
+             "flagged": flagged_count > 0,
+             "fileCount": len(doc_results),
+             "processingTime": processing,
+             "avgSimilarity": avg,
+             "sources": list(all_sources),
+             "createdAt": datetime.utcnow(),
+             "userId": current_user.get("sub") or current_user.get("user_id"),
366
+ "documents": [
367
+ {
368
+ "id": doc.id,
369
+ "name": doc.name,
370
+ "similarity": doc.similarity,
371
+ "aiSimilarity": doc.ai_similarity,
372
+ "flagged": doc.flagged,
373
+ "wordCount": doc.wordCount,
374
+ "matchCount": len(doc.matches),
375
+ "matches": [
376
+ {
377
+ "matched_text": m.matched_text,
378
+ "similarity": m.similarity,
379
+ "source_url": m.source_url,
380
+ "source_title": m.source_title,
381
+ "source_type": m.source_type,
382
+ }
383
+ for m in doc.matches
384
+ ]
385
+ }
386
+ for doc in doc_results
387
+ ],
388
+ "summary": {
389
+ "totalDocuments": result.summary.totalDocuments,
390
+ "flaggedDocuments": result.summary.flaggedDocuments,
391
+ "highestSimilarity": result.summary.highestSimilarity,
392
+ "averageSimilarity": result.summary.averageSimilarity,
393
+ "averageAiSimilarity": result.summary.averageAiSimilarity,
394
+ "totalMatches": result.summary.totalMatches,
395
+ }
396
+ }
397
+
398
+ insert_result = await reports_collection.insert_one(report_doc)
399
+ logger.info(f"πŸ’Ύ Saved to MongoDB: {insert_result.inserted_id}")
400
+ result.id = str(insert_result.inserted_id)
401
+ mongo_client.close()
402
+
403
+ except Exception as e:
404
+ logger.error(f"❌ MongoDB error: {str(e)}")
405
+
406
+ return result
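For reference, a minimal client-side sketch of exercising this batch route; the URL path, port, and bearer token below are placeholders, since the route decorator and auth dependency are defined earlier in this router:

    import requests  # standalone client sketch, not part of the service

    files = [
        ("files", ("essay1.docx", open("essay1.docx", "rb"))),
        ("files", ("essay2.pdf", open("essay2.pdf", "rb"))),
    ]
    resp = requests.post(
        "http://localhost:8000/teacher/semantic-analysis",  # assumed path
        files=files,
        headers={"Authorization": "Bearer <token>"},  # assumed auth scheme
        timeout=600,  # scraping alone is capped at 3 minutes per document
    )
    report = resp.json()
    print(report["summary"]["highestSimilarity"], report["summary"]["averageAiSimilarity"])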
app/schemas/__pycache__/plagiarism_schemas.cpython-312.pyc ADDED
Binary file (1.21 kB). View file
 
app/schemas/__pycache__/report_schemas.cpython-312.pyc ADDED
Binary file (1.07 kB). View file
 
app/schemas/__pycache__/schemas.cpython-312.pyc ADDED
Binary file (2.16 kB). View file
 
app/schemas/__pycache__/sources_schemas.cpython-312.pyc ADDED
Binary file (526 Bytes). View file
 
app/schemas/__pycache__/teacher_schemas.cpython-312.pyc ADDED
Binary file (8.09 kB). View file
 
app/schemas/plagiarism_schemas.py ADDED
@@ -0,0 +1,22 @@
1
+ from pydantic import BaseModel
2
+ from typing import List
3
+
4
+ class MatchDetail(BaseModel):
5
+ matched_text: str
6
+ similarity: float
7
+ source_type: str # "news" or "academic"
8
+ source_title: str
9
+ source_url: str
10
+
11
+
12
+ class SentenceResult(BaseModel):
13
+ original_sentence: str
14
+ normalized_sentence: str
15
+ match_type: str # "full_sentence", "partial_phrase", "no_match"
16
+ matches: List[MatchDetail]
17
+
18
+
19
+ class PlagiarismResponse(BaseModel):
20
+ checked_sentences: int
21
+ checked_sources: int
22
+ results: List[SentenceResult]
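These models validate plain dicts directly; a small round-trip sketch (the sample values are invented):

    payload = {
        "checked_sentences": 1,
        "checked_sources": 1,
        "results": [{
            "original_sentence": "The mitochondrion is the powerhouse of the cell.",
            "normalized_sentence": "the mitochondrion is the powerhouse of the cell",
            "match_type": "full_sentence",
            "matches": [{
                "matched_text": "the mitochondrion is the powerhouse of the cell",
                "similarity": 1.0,
                "source_type": "academic",
                "source_title": "Cell Biology Notes",
                "source_url": "https://example.com/notes",
            }],
        }],
    }
    response = PlagiarismResponse(**payload)  # nested MatchDetail dicts are coerced automatically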
app/schemas/report_schemas.py ADDED
@@ -0,0 +1,21 @@
1
+ from pydantic import BaseModel
2
+ from typing import List
3
+ from datetime import datetime
4
+ from app.schemas.plagiarism_schemas import MatchDetail
5
+
6
+ class ReportSummary(BaseModel):
7
+ id: str # MongoDB’s ObjectId as string
8
+ name: str
9
+ date: datetime
10
+ similarity: float # highest sentence‐level similarity (0–100)
11
+ sources: List[str] # unique list of source titles used
12
+ word_count: int
13
+ time_spent: str # e.g. "00:00"
14
+ flagged: bool # true if similarity > 70%
15
+
16
+
17
+ class ReportDetail(BaseModel):
18
+ id: str
19
+ name: str
20
+ content: str
21
+ plagiarism_data: List[MatchDetail]
app/schemas/sources_schemas.py ADDED
@@ -0,0 +1,9 @@
1
+ from pydantic import BaseModel
2
+
3
+
4
+ class SourceData(BaseModel):
5
+ id: str # MongoDB ObjectId as string
6
+ title: str
7
+ text: str
8
+ source_url: str
9
+ type: str
app/schemas/teacher_schemas.py ADDED
@@ -0,0 +1,185 @@
1
+ from pydantic import BaseModel, Field
2
+ from typing import List, Optional
3
+ from datetime import datetime
4
+
5
+ # ---- Shared ----
6
+
7
+ class DocumentInfo(BaseModel):
8
+ id: int
9
+ name: str
10
+ author: Optional[str] = None
11
+
12
+ class OverlapDetail(BaseModel):
13
+ # For lexical/internal
14
+ fromDoc: str
15
+ toDoc: str
16
+ text: str
17
+ similarity: float # percent (0–100)
18
+ sectionA: Optional[str] = None
19
+ sectionB: Optional[str] = None
20
+ context: Optional[str] = None
21
+
22
+ class ComparisonDetail(BaseModel):
23
+ id: str # "i-j"
24
+ docA: str
25
+ docB: str
26
+ similarity: float # percent (0–100)
27
+ flagged: bool
28
+ overlaps: List[OverlapDetail] = Field(default_factory=list)
29
+ contentA: str = ""
30
+ contentB: str = ""
31
+
32
+ class InternalReportSummary(BaseModel):
33
+ totalDocuments: int
34
+ totalComparisons: int
35
+ flaggedComparisons: int
36
+ highestSimilarity: float
37
+ averageSimilarity: Optional[float] = None
38
+
39
+ class InternalReportDetail(BaseModel):
40
+ id: str
41
+ name: str
42
+ analysisType: str = "internal"
43
+ uploadDate: datetime
44
+ processingTime: str
45
+ status: str = "completed"
46
+ documents: List[DocumentInfo]
47
+ comparisons: List[ComparisonDetail]
48
+ summary: InternalReportSummary
49
+
50
+
51
+
52
+ class LexicalMatch(BaseModel):
53
+ matched_text: str
54
+ similarity: float
55
+ source_type: str
56
+ source_title: str
57
+ source_url: str
58
+ section: Optional[str] = None
59
+ context: Optional[str] = None
60
+
61
+ class LexicalDocResult(BaseModel):
62
+ id: int
63
+ name: str
64
+ author: Optional[str] = None
65
+ similarity: float # overall percent
66
+ flagged: bool
67
+ wordCount: Optional[int] = None
68
+ matches: List[LexicalMatch] = Field(default_factory=list)
69
+ content: Optional[str] = None
70
+ ai_similarity: float = 0.0
71
+
72
+ class TeacherLexicalSummary(BaseModel):
73
+ totalDocuments: int
74
+ flaggedDocuments: int
75
+ highestSimilarity: float
76
+ averageSimilarity: Optional[float] = None
77
+ totalMatches: int
78
+ averageAiSimilarity: float = 0.0
79
+
80
+ class TeacherLexicalBatchReport(BaseModel):
81
+ id: str
82
+ name: str
83
+ analysisType: str = "lexical"
84
+ uploadDate: datetime
85
+ processingTime: str
86
+ status: str = "completed"
87
+ documents: List[LexicalDocResult]
88
+ summary: TeacherLexicalSummary
89
+
90
+ # ---- Teacher Semantic (internal/external) ----
91
+
92
+ class SemanticOverlap(BaseModel):
93
+ textA: str
94
+ textB: str
95
+ cosine: float # 0–1
96
+ cosine_pct: float # 0–100
97
+ sectionA: Optional[str] = None
98
+ sectionB: Optional[str] = None
99
+ confidence: str # "high" | "medium" | "low"
100
+
101
+ class SemanticComparison(BaseModel):
102
+ id: str
103
+ docA: str
104
+ docB: str
105
+ similarity: float # aggregated cosine percent
106
+ flagged: bool
107
+ overlaps: List[SemanticOverlap] = Field(default_factory=list)
108
+
109
+ class TeacherSemanticReport(BaseModel):
110
+ id: str
111
+ name: str
112
+ analysisType: str = "semantic"
113
+ mode: str # "internal" | "external"
114
+ uploadDate: datetime
115
+ processingTime: str
116
+ status: str = "completed"
117
+ documents: List[DocumentInfo]
118
+ comparisons: List[SemanticComparison]
119
+ summary: InternalReportSummary
120
+ narrative: Optional[str] = None
121
+
122
+ # ---- Teacher Code Analysis ----
123
+
124
+
125
+
126
+ class CodeMatch(BaseModel):
127
+ matched_code: str
128
+ similarity: float
129
+ source_type: str # 'peer', 'github', 'stackoverflow', 'web'
130
+ source_title: str
131
+ source_url: Optional[str] = None
132
+ match_type: str # 'exact', 'structural', 'token_sequence'
133
+ line_start: Optional[int] = None
134
+ line_end: Optional[int] = None
135
+ context: Optional[str] = None
136
+
137
+ class CodeFunction(BaseModel):
138
+ name: str
139
+ start_line: int
140
+ end_line: int
141
+ code: str
142
+ complexity: int # Cyclomatic complexity
143
+ tokens: List[str]
144
+ ast_hash: str
145
+
146
+ class CodeDocResult(BaseModel):
147
+ id: int
148
+ name: str
149
+ author: Optional[str] = None
150
+ similarity: float
151
+ flagged: bool
152
+ lineCount: int
153
+ functionCount: int
154
+ matches: List[CodeMatch]
155
+ functions: List[CodeFunction]
156
+ content: str # Full code content
157
+ language: str
158
+
159
+ class CodeAnalysisSummary(BaseModel):
160
+ totalDocuments: int
161
+ flaggedDocuments: int
162
+ highestSimilarity: float
163
+ averageSimilarity: float
164
+ totalMatches: int
165
+ peerMatches: int
166
+ externalMatches: int
167
+
168
+ class TeacherCodeBatchReport(BaseModel):
169
+ id: str
170
+ name: str
171
+ uploadDate: datetime
172
+ processingTime: str
173
+ documents: List[CodeDocResult]
174
+ summary: CodeAnalysisSummary
175
+ assignmentTopic: Optional[str] = None
176
+
177
+ class InternalMatch(BaseModel):
178
+ """Match between two student submissions"""
179
+ student1_id: int
180
+ student2_id: int
181
+ student1_name: str
182
+ student2_name: str
183
+ similarity: float
184
+ match_type: str
185
+ matched_functions: List[str]
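A short sketch of how these models are typically built and then flattened for storage; the values are invented, and model_dump() is the Pydantic v2 spelling (use .dict() on v1):

    match = LexicalMatch(
        matched_text="Photosynthesis converts light energy into chemical energy.",
        similarity=87.5,
        source_type="web",
        source_title="example.com",
        source_url="https://example.com/photosynthesis",
    )
    doc = LexicalDocResult(
        id=1, name="essay1.docx", similarity=87.5, flagged=True,
        wordCount=430, ai_similarity=0.31, matches=[match],
    )
    mongo_ready = doc.model_dump()  # plain dict, safe to pass to insert_one()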
app/utils/__pycache__/ai_detector.cpython-312.pyc ADDED
Binary file (3.22 kB). View file
 
app/utils/__pycache__/code_comparism.cpython-312.pyc ADDED
Binary file (12 kB). View file
 
app/utils/__pycache__/code_detection.cpython-312.pyc ADDED
Binary file (13.5 kB). View file
 
app/utils/__pycache__/code_plagiarism_integration.cpython-312.pyc ADDED
Binary file (10.6 kB). View file
 
app/utils/__pycache__/code_utils.cpython-312.pyc ADDED
Binary file (22.8 kB). View file
 
app/utils/__pycache__/file_utils.cpython-312.pyc ADDED
Binary file (2.42 kB). View file
 
app/utils/__pycache__/lexical_utils.cpython-312.pyc ADDED
Binary file (13.9 kB). View file
 
app/utils/__pycache__/semantic_utils.cpython-312.pyc ADDED
Binary file (15 kB). View file
 
app/utils/__pycache__/text_utils.cpython-312.pyc ADDED
Binary file (10.8 kB). View file
 
app/utils/__pycache__/web_utils.cpython-312.pyc ADDED
Binary file (27.2 kB). View file
 
app/utils/ai_detector.py ADDED
@@ -0,0 +1,80 @@
1
+ """
2
+ AI detection using HuggingFace fakespot-ai/roberta-base-ai-text-detection-v1
3
+ Fast, accurate ML model. Returns AI similarity score (0-1).
4
+ """
5
+ import logging
6
+ import os
7
+ from huggingface_hub import InferenceClient
8
+ from app.config import HF_TOKEN
9
+ logging.basicConfig(level=logging.INFO)
10
+ logger = logging.getLogger("ai_detector")
11
+
12
+
13
+ # Initialize HF client
14
+ try:
15
+ hf_client = InferenceClient(api_key=HF_TOKEN)
16
+ logger.info("βœ“ HuggingFace client initialized")
17
+ except Exception as e:
18
+ logger.error(f"❌ HF_TOKEN not available: {e}")
19
+ hf_client = None
20
+
21
+
22
+ def detect_ai_similarity(text: str) -> float:
23
+ """
24
+ Detect AI-generated text using HuggingFace model.
25
+ Fast, accurate ML-based detection.
26
+
27
+ Returns:
28
+ float: 0.0-1.0 where 1.0 = definitely AI, 0.0 = definitely human
29
+ """
30
+ try:
31
+ logger.info("Starting AI detection (HuggingFace model)...")
32
+
33
+ if not text or len(text) < 20:
34
+ logger.warning("Text too short for detection")
35
+ return 0.0
36
+
37
+ if hf_client is None:
38
+ logger.error("HuggingFace client not initialized")
39
+ return 0.0
40
+
41
+ # Truncate to avoid token limits
42
+ text = text[:2000]
43
+
44
+ logger.info("Sending request to HuggingFace...")
45
+ result = hf_client.text_classification(
46
+ text,
47
+ model="fakespot-ai/roberta-base-ai-text-detection-v1",
48
+ )
49
+
50
+ logger.info(f"API response: {result}")
51
+
52
+ # Extract top result
53
+ if not result or len(result) == 0:
54
+ logger.error("No result from API")
55
+ return 0.0
56
+
57
+ top_result = result[0]
58
+ label = top_result.get('label', '').upper()
59
+ confidence = top_result.get('score', 0.5)
60
+
61
+ logger.info(f"Classification: {label} (confidence: {confidence:.4f})")
62
+
63
+ # Map to 0-1 score
64
+ if label == 'AI':
65
+ ai_score = confidence
66
+ elif label == 'HUMAN':
67
+ ai_score = 1.0 - confidence
68
+ else:
69
+ ai_score = 0.5
70
+
71
+ ai_score = min(max(ai_score, 0.0), 1.0) # Clamp to 0-1
72
+ logger.info(f"βœ… AI detection complete: {ai_score:.3f}")
73
+
74
+ return round(ai_score, 3)
75
+
76
+ except Exception as e:
77
+ logger.error(f"❌ AI detection error: {str(e)}", exc_info=True)
78
+ return 0.0
79
+
80
+
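A usage sketch; the 0.5 cutoff below is an illustrative choice, not something this module defines:

    score = detect_ai_similarity("Some essay text long enough to classify...")
    if score >= 0.5:  # e.g. treat >= 0.5 as "likely AI-assisted"
        print(f"Likely AI-generated (score {score:.3f})")
    else:
        print(f"Likely human-written (score {score:.3f})")

Because the function fails closed to 0.0 on any error, a zero score can mean either "human" or "detector unavailable"; callers that need to tell these apart should inspect the logs or wrap the call themselves.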
app/utils/file_utils.py ADDED
@@ -0,0 +1,36 @@
1
+ import tempfile
2
+ from pdfminer.high_level import extract_text as extract_pdf_text
3
+ from docx import Document as DocxDocument
4
+ from app.config import ALLOWED_EXTENSIONS
5
+
6
+
7
+ def allowed_file(filename: str) -> bool:
8
+ return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS
9
+
10
+
11
+ def extract_text_from_file(content_bytes: bytes, filename: str, max_words: int = 500) -> str:
12
+ ext = filename.rsplit(".", 1)[1].lower()
13
+ text = ""
14
+ try:
15
+ if ext == "txt":
16
+ text = content_bytes.decode("utf-8", errors="ignore")
17
+ elif ext == "pdf":
18
+ with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
19
+ tmp.write(content_bytes)
20
+ tmp.flush()
21
+ text = extract_pdf_text(tmp.name)
22
+ elif ext == "docx":
23
+ with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as tmp:
24
+ tmp.write(content_bytes)
25
+ tmp.flush()
26
+ doc = DocxDocument(tmp.name)
27
+ text = "\n".join([p.text for p in doc.paragraphs])
28
+ except Exception:
29
+ text = ""
30
+
31
+ # Word count check
32
+ word_count = len(text.split())
33
+ if word_count > max_words:
34
+ raise ValueError(f"File exceeds {max_words} words (found {word_count}).")
35
+
36
+ return text
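A minimal sketch of calling the extractor directly (the file name is invented). Note that the function raises ValueError rather than truncating once the word cap is exceeded, so callers should be ready to catch it:

    with open("essay.docx", "rb") as fh:
        raw = fh.read()
    try:
        text = extract_text_from_file(raw, "essay.docx", max_words=500)
    except ValueError as exc:
        print(f"Rejected: {exc}")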
app/utils/lexical_utils.py ADDED
@@ -0,0 +1,293 @@
1
+ import re
2
+ from typing import List, Optional, Tuple, Set
3
+ import nltk
4
+ from nltk.tokenize import sent_tokenize, word_tokenize
5
+ from nltk.corpus import stopwords
6
+ from nltk.stem import WordNetLemmatizer
7
+ from rapidfuzz.distance import Levenshtein
8
+
9
+ from app.config import (
10
+ MIN_WORDS_PER_SENTENCE,
11
+ MIN_SENTENCE_LENGTH,
12
+ SEQUENCE_THRESHOLD,
13
+ EXACT_MATCH_SCORE,
14
+ )
15
+
16
+ nltk.download("punkt", quiet=True)
17
+ nltk.download("stopwords", quiet=True)
18
+ nltk.download("wordnet", quiet=True)
19
+ nltk.download("punkt_tab", quiet=True)
20
+
21
+ lemmatizer = WordNetLemmatizer()
22
+
23
+ def normalize_text(text: str) -> str:
24
+ if not text:
25
+ return ""
26
+ text = text.lower()
27
+ text = text.replace("\u00ad", "")
28
+ text = re.sub(r"-\s*\n\s*", "", text)
29
+ text = text.replace("\n", " ")
30
+ text = (text.replace("\u2018", "'").replace("\u2019", "'")
31
+ .replace("\u201c", '"').replace("\u201d", '"')
32
+ .replace("\u2014", "-").replace("\u2013", "-"))
33
+ text = re.sub(r"[^\x20-\x7E]+", " ", text)  # strip remaining non-ASCII after quote/dash normalization
34
+ text = re.sub(r"[^\w\s-]", " ", text)
35
+ text = re.sub(r"\s+", " ", text).strip()
36
+ tokens = word_tokenize(text)
37
+ lemmas = [lemmatizer.lemmatize(tok) for tok in tokens]
38
+ normalized = " ".join(lemmas)
39
+ normalized = re.sub(r"\s+", " ", normalized).strip()
40
+ return normalized
41
+
42
+ def get_meaningful_sentences(text: str) -> List[str]:
43
+ """Extract meaningful sentences from text."""
44
+ sentences = sent_tokenize(text or "")
45
+ filtered = []
46
+ for s in sentences:
47
+ words = word_tokenize(s)
48
+ if len(words) >= MIN_WORDS_PER_SENTENCE and len(s.strip()) >= MIN_SENTENCE_LENGTH:
49
+ filtered.append(s.strip())
50
+ return filtered
51
+
52
+ def extract_keywords(text: str, max_keywords: int = 5) -> List[str]:
53
+ """Extract top keywords from text for search queries."""
54
+ words = word_tokenize((text or "").lower())
55
+ stop_words = set(stopwords.words("english"))
56
+ filtered = [w for w in words if w.isalpha() and w not in stop_words and len(w) > 3]
57
+ freq = nltk.FreqDist(filtered)
58
+ return [word for word, _ in freq.most_common(max_keywords)]
59
+
60
+ def _word_shingles(norm_text: str, k: int = 7) -> List[str]:
61
+ tokens = norm_text.split()
62
+ if len(tokens) < k:
63
+ return []
64
+ return [" ".join(tokens[i:i+k]) for i in range(len(tokens) - k + 1)]
65
+
66
+ def _shingle_sets(a_norm: str, b_norm: str, k: int = 7) -> Tuple[Set[str], Set[str]]:
67
+ return set(_word_shingles(a_norm, k)), set(_word_shingles(b_norm, k))
68
+
69
+ def _jaccard(a: Set[str], b: Set[str]) -> float:
70
+ if not a and not b:
71
+ return 0.0
72
+ inter = len(a & b)
73
+ union = len(a | b) or 1
74
+ return inter / union
75
+
76
+ def _containment(a: Set[str], b: Set[str]) -> float:
77
+ if not a:
78
+ return 0.0
79
+ inter = len(a & b)
80
+ return inter / len(a)
81
+
82
+ def _winnowing_hashes(norm_text: str, k: int = 7, w: int = 4) -> List[Tuple[int, int]]:
83
+ tokens = norm_text.split()
84
+ if len(tokens) < k:
85
+ return []
86
+ shingles = [" ".join(tokens[i:i+k]) for i in range(len(tokens) - k + 1)]
87
+ hashes = [(hash(s) & 0xFFFFFFFF, i) for i, s in enumerate(shingles)]
88
+
89
+ if w <= 1 or len(hashes) <= w:
90
+ return list(dict.fromkeys(hashes))
91
+
92
+ fps: List[Tuple[int, int]] = []
93
+ last_min_abs = -1
94
+ for i in range(0, len(hashes) - w + 1):
95
+ window = hashes[i:i+w]
96
+ min_hash, min_idx = None, None
97
+ for j, (h, _) in enumerate(window):
98
+ if (min_hash is None) or (h < min_hash) or (h == min_hash and j > (min_idx or -1)):
99
+ min_hash, min_idx = h, j
100
+ abs_idx = i + (min_idx or 0)
101
+ if abs_idx != last_min_abs:
102
+ fps.append(hashes[abs_idx])
103
+ last_min_abs = abs_idx
104
+ return list(dict.fromkeys(fps))
105
+
106
+ def _winnowing_overlap(a_fp: List[Tuple[int, int]], b_fp: List[Tuple[int, int]]) -> float:
107
+ a_set, b_set = set(a_fp), set(b_fp)
108
+ if not a_set or not b_set:
109
+ return 0.0
110
+ shared = len(a_set & b_set)
111
+ denom = min(len(a_set), len(b_set)) or 1
112
+ return shared / denom
113
+
114
+ def _exact_substring(norm_sentence: str, norm_external: str) -> bool:
115
+ if not norm_sentence or not norm_external:
116
+ return False
117
+ return norm_external.find(norm_sentence) != -1
118
+
119
+ def _levenshtein_sim(a: str, b: str) -> float:
120
+ if not a or not b:
121
+ return 0.0
122
+ return Levenshtein.normalized_similarity(a, b)
123
+
124
+ def _lcs_length(a: str, b: str) -> int:
125
+ if not a or not b:
126
+ return 0
127
+ prev = [0] * (len(b) + 1)
128
+ best = 0
129
+ for i in range(1, len(a) + 1):
130
+ curr = [0]
131
+ ai = a[i-1]
132
+ for j in range(1, len(b) + 1):
133
+ if ai == b[j-1]:
134
+ v = prev[j-1] + 1
135
+ curr.append(v)
136
+ if v > best:
137
+ best = v
138
+ else:
139
+ curr.append(0)
140
+ prev = curr
141
+ return best
142
+
143
+ def _extract_phrases(norm_text: str, min_words: int = 3, max_words: int = 7) -> List[str]:
144
+ """Extract all phrases of varying lengths for partial match detection."""
145
+ tokens = norm_text.split()
146
+ phrases = []
147
+ for k in range(min_words, min(max_words + 1, len(tokens) + 1)):
148
+ for i in range(len(tokens) - k + 1):
149
+ phrases.append(" ".join(tokens[i:i+k]))
150
+ return phrases
151
+
152
+ def _phrase_containment_match(norm_sentence: str, norm_external: str) -> Optional[Tuple[str, float]]:
153
+ """Check if ANY phrase (3-7 words) from sentence appears in external text."""
154
+ phrases = _extract_phrases(norm_sentence, min_words=3, max_words=7)
155
+
156
+ best_phrase = None
157
+ best_score = 0.0
158
+
159
+ for phrase in phrases:
160
+ if phrase in norm_external:
161
+ score = len(phrase.split()) / len(norm_sentence.split())
162
+ if score > best_score:
163
+ best_score = score
164
+ best_phrase = phrase
165
+
166
+ if best_phrase and best_score >= 0.4:
167
+ return best_phrase, round(best_score, 3)
168
+
169
+ return None
170
+
171
+ def find_exact_matches(sentence: str, external_text: str) -> Optional[float]:
172
+ """
173
+ Full-sentence lexical match with multiple strategies:
174
+ 1) Exact substring match β†’ EXACT_MATCH_SCORE (100%)
175
+ 2) Winnowing fingerprints β†’ robust plagiarism detection
176
+ 3) Edit-distance (Levenshtein) β†’ catches paraphrased content
177
+ 4) LCS (longest common substring) β†’ catches missed overlaps
178
+ """
179
+ norm_s = normalize_text(sentence)
180
+ norm_e = normalize_text(external_text)
181
+
182
+ if len(norm_s) < MIN_SENTENCE_LENGTH:
183
+ return None
184
+
185
+ # 1) Exact full-sentence match
186
+ if _exact_substring(norm_s, norm_e):
187
+ return EXACT_MATCH_SCORE
188
+
189
+ # 2) Winnowing overlap
190
+ win_sim = _winnowing_overlap(
191
+ _winnowing_hashes(norm_s, k=5, w=4),
192
+ _winnowing_hashes(norm_e, k=5, w=4),
193
+ )
194
+ if win_sim >= SEQUENCE_THRESHOLD * 0.9:
195
+ return round(win_sim, 3)
196
+
197
+ # 3) Edit-distance fallback
198
+ lev = _levenshtein_sim(norm_s, norm_e)
199
+ if lev >= SEQUENCE_THRESHOLD:
200
+ return round(lev, 3)
201
+
202
+ # 4) LCS fallback
203
+ lcs_len = _lcs_length(norm_s, norm_e)
204
+ LCS_MIN_CHARS = 15
205
+ if lcs_len >= LCS_MIN_CHARS:
206
+ return round(lcs_len / max(1, len(norm_s)), 3)
207
+
208
+ return None
209
+
210
+
211
+ def find_partial_phrase_match(sentence: str, external_text: str) -> Optional[Tuple[str, float]]:
212
+ """
213
+ Partial reuse detection via multiple strategies:
214
+ 1) Phrase containment (3-7 word phrases)
215
+ 2) Shingle-based Jaccard/Containment
216
+ 3) Edit distance as fallback
217
+ """
218
+ norm_s = normalize_text(sentence)
219
+ norm_e = normalize_text(external_text)
220
+
221
+ if not norm_s or not norm_e:
222
+ return None
223
+
224
+ # Strategy 1: Check if any 3-7 word phrase appears verbatim
225
+ phrase_match = _phrase_containment_match(norm_s, norm_e)
226
+ if phrase_match:
227
+ return sentence, phrase_match[1]
228
+
229
+ # Strategy 2: 7-word shingles
230
+ A, B = _shingle_sets(norm_s, norm_e, k=7)
231
+ if A and B:
232
+ jac = _jaccard(A, B)
233
+ con = _containment(A, B)
234
+ score = max(jac, con)
235
+ if score >= SEQUENCE_THRESHOLD * 0.8:
236
+ return sentence, round(score, 3)
237
+
238
+ # Strategy 3: Smaller shingles (5-word) for more matches
239
+ A_small, B_small = _shingle_sets(norm_s, norm_e, k=5)
240
+ if A_small and B_small:
241
+ jac_small = _jaccard(A_small, B_small)
242
+ con_small = _containment(A_small, B_small)
243
+ score_small = max(jac_small, con_small)
244
+ if score_small >= SEQUENCE_THRESHOLD * 0.75:
245
+ return sentence, round(score_small, 3)
246
+
247
+ # Strategy 4: Edit distance as last resort
248
+ lev = _levenshtein_sim(norm_s, norm_e)
249
+ if lev >= SEQUENCE_THRESHOLD * 0.85:
250
+ return sentence, round(lev, 3)
251
+
252
+ return None
253
+
254
+ def find_partial_phrase_match_for_internal(sentence: str, external_text: str) -> Optional[Tuple[str, float]]:
255
+ """
256
+ Wrapper around find_partial_phrase_match that extracts the actual matched phrase
257
+ that appears in BOTH documents, not just the full sentence from the first doc.
258
+ """
259
+ result = find_partial_phrase_match(sentence, external_text)
260
+ if not result:
261
+ return None
262
+
263
+ matched_text, score = result
264
+ norm_s = normalize_text(sentence)
265
+ norm_e = normalize_text(external_text)
266
+
267
+ if not norm_s or not norm_e:
268
+ return result
269
+
270
+ # Find the longest common substring that appears in both
271
+ words_s = norm_s.split()
272
+ words_e = norm_e.split()
273
+
274
+ best_common = ""
275
+ best_len = 0
276
+
277
+ # Try all consecutive word sequences from sentence A
278
+ for i in range(len(words_s)):
279
+ for j in range(i + 1, len(words_s) + 1):
280
+ phrase = ' '.join(words_s[i:j])
281
+ # Check if this phrase exists in document B
282
+ if phrase in norm_e:
283
+ phrase_len = len(phrase)
284
+ if phrase_len > best_len:
285
+ best_common = phrase
286
+ best_len = phrase_len
287
+
288
+ # If we found a common phrase, return it
289
+ if best_common:
290
+ return best_common, score
291
+
292
+ # Fallback: return original result
293
+ return matched_text, score
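To make the shingle-based scoring concrete, a small sketch using the module's own helpers on two overlapping sentences; k=3 is chosen here only because the sentences are short, whereas the pipeline itself uses 5-7 word shingles:

    a = normalize_text("Plagiarism detection compares overlapping word sequences between documents.")
    b = normalize_text("Detection of plagiarism compares overlapping word sequences between two documents.")
    A, B = _shingle_sets(a, b, k=3)
    print(_jaccard(A, B))      # symmetric overlap of the two shingle sets
    print(_containment(A, B))  # asymmetric: fraction of A's shingles found in B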
app/utils/semantic_utils.py ADDED
@@ -0,0 +1,342 @@
1
+ """
2
+ Semantic analysis with MiniLM embeddings for paraphrasing detection.
3
+ Detects even heavily paraphrased content from LLMs like ChatGPT.
4
+ """
5
+
6
+ import re
7
+ from typing import List, Dict
8
+ import logging
9
+ import numpy as np
10
+
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger("semantic_utils")
13
+
14
+ # Lazy load model to avoid multiple loads
15
+ _encoder_model = None
16
+
17
+ def get_encoder():
18
+ """Lazy load SentenceTransformer MiniLM model"""
19
+ global _encoder_model
20
+ if _encoder_model is None:
21
+ logger.info("πŸ”„ Loading SentenceTransformer MiniLM-L6-v2 model...")
22
+ try:
23
+ from sentence_transformers import SentenceTransformer
24
+ _encoder_model = SentenceTransformer('all-MiniLM-L6-v2')
25
+ logger.info("βœ… Model loaded successfully")
26
+ except ImportError:
27
+ logger.error("❌ sentence-transformers not installed. Install with: pip install sentence-transformers")
28
+ raise
29
+ return _encoder_model
30
+
31
+ def split_into_sentences(text: str) -> List[str]:
32
+ """Split text into sentences, filtering out junk"""
33
+ sentences = re.split(r'(?<=[.!?])\s+', text)
34
+
35
+ cleaned = []
36
+ for sent in sentences:
37
+ sent = sent.strip()
38
+ if len(sent) > 20 and len(sent.split()) >= 5:
39
+ cleaned.append(sent)
40
+
41
+ return cleaned
42
+
43
+ def extract_key_sentences(text: str, num_sentences: int = 5) -> List[str]:
44
+ """
45
+ Extract the most important sentences from text based on:
46
+ - Position (first and last sentences often important)
47
+ - Length (15-30 words is optimal)
48
+ - Keyword frequency
49
+ """
50
+ sentences = split_into_sentences(text)
51
+ if len(sentences) <= num_sentences:
52
+ return sentences
53
+
54
+ scored_sentences = []
55
+
56
+ for idx, sent in enumerate(sentences):
57
+ score = 0.0
58
+ word_count = len(sent.split())
59
+
60
+ # Position score
61
+ if idx < len(sentences) * 0.2 or idx > len(sentences) * 0.8:
62
+ score += 0.3
63
+
64
+ # Length score
65
+ if 15 <= word_count <= 30:
66
+ score += 0.3
67
+ elif 10 <= word_count <= 40:
68
+ score += 0.15
69
+
70
+ # Keyword diversity
71
+ words = set(sent.lower().split())
72
+ common_words = {'the', 'a', 'an', 'and', 'or', 'is', 'are', 'was', 'be', 'it', 'this', 'that'}
73
+ unique_words = len(words - common_words)
74
+ score += min(unique_words / 10, 0.4)
75
+
76
+ scored_sentences.append((score, sent))
77
+
78
+ scored_sentences.sort(reverse=True)
79
+ key_sents = [sent for _, sent in scored_sentences[:num_sentences]]
80
+
81
+ result = []
82
+ for sent in sentences:
83
+ if sent in key_sents:
84
+ result.append(sent)
85
+
86
+ return result
87
+
88
+ def generate_five_queries(text: str, max_words: int = 3000) -> List[str]:
89
+ """
90
+ Generate 5 high-quality semantic search queries from document.
91
+
92
+ Query 1: Beginning (main topic)
93
+ Query 2: Early Middle
94
+ Query 3: Center (supporting evidence)
95
+ Query 4: Late Middle
96
+ Query 5: End (conclusions)
97
+ """
98
+ logger.info(" πŸ” Generating 5 semantic queries from content...")
99
+
100
+ words = text.split()
101
+ if len(words) > max_words:
102
+ text = ' '.join(words[:max_words])
103
+
104
+ sentences = split_into_sentences(text)
105
+ if len(sentences) < 5:
106
+ # Fallback for very short text
107
+ logger.warning(" ⚠️ Very short document, using basic queries")
108
+ return [
109
+ ' '.join(words[:30]),
110
+ ' '.join(words[max(0, len(words)//5):max(0, len(words)//5)+30]),
111
+ ' '.join(words[max(0, 2*len(words)//5):max(0, 2*len(words)//5)+30]),
112
+ ' '.join(words[max(0, 3*len(words)//5):max(0, 3*len(words)//5)+30]),
113
+ ' '.join(words[max(0, 4*len(words)//5):max(0, 4*len(words)//5)+30])
114
+ ]
115
+
116
+ queries = []
117
+ total_sentences = len(sentences)
118
+
119
+ # βœ… Query 1: BEGINNING - First 3-4 sentences
120
+ beginning_end = min(4, total_sentences // 5)
121
+ query1_sents = sentences[:beginning_end]
122
+ query1 = ' '.join(query1_sents)
123
+ queries.append(query1)
124
+ logger.debug(f" Query 1 (Beginning) length: {len(query1.split())} words")
125
+
126
+ # βœ… Query 2: EARLY MIDDLE - First third
127
+ early_start = beginning_end
128
+ early_end = min(early_start + 4, total_sentences // 3)
129
+ query2_sents = sentences[early_start:early_end]
130
+ query2 = ' '.join(query2_sents)
131
+ queries.append(query2)
132
+ logger.debug(f" Query 2 (Early Middle) length: {len(query2.split())} words")
133
+
134
+ # βœ… Query 3: CENTER - Middle section (3-4 sentences)
135
+ mid_start = max(early_end, total_sentences // 3)
136
+ mid_end = min(mid_start + 4, 2 * total_sentences // 3)
137
+ query3_sents = sentences[mid_start:mid_end]
138
+ query3 = ' '.join(query3_sents)
139
+ queries.append(query3)
140
+ logger.debug(f" Query 3 (Center) length: {len(query3.split())} words")
141
+
142
+ # βœ… Query 4: LATE MIDDLE - Second two-thirds
143
+ late_start = mid_end
144
+ late_end = min(late_start + 4, 2 * total_sentences // 3 + (total_sentences // 3))
145
+ query4_sents = sentences[late_start:late_end]
146
+ query4 = ' '.join(query4_sents)
147
+ queries.append(query4)
148
+ logger.debug(f" Query 4 (Late Middle) length: {len(query4.split())} words")
149
+
150
+ # βœ… Query 5: END - Last 3-4 sentences
151
+ end_start = max(late_end, total_sentences - 4)
152
+ query5_sents = sentences[end_start:]
153
+ query5 = ' '.join(query5_sents)
154
+ queries.append(query5)
155
+ logger.debug(f" Query 5 (End) length: {len(query5.split())} words")
156
+
157
+ # βœ… Clean and validate queries
158
+ final_queries = []
159
+ for q in queries:
160
+ q = q.strip()
161
+ if len(q.split()) >= 15: # Minimum 15 words for good search
162
+ final_queries.append(q)
163
+
164
+ logger.info(f" βœ… Generated {len(final_queries)} queries:")
165
+ for i, q in enumerate(final_queries, 1):
166
+ word_count = len(q.split())
167
+ preview = q[:80] + "..." if len(q) > 80 else q
168
+ logger.info(f" Query {i} ({word_count} words): {preview}")
169
+
170
+ return final_queries
171
+
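A quick sketch of what the slicing above yields on a real document (the file name is invented; any text with enough sentences works):

    with open("essay.txt", encoding="utf-8") as fh:
        queries = generate_five_queries(fh.read())
    for q in queries:
        print(len(q.split()), "|", q[:60])  # each query is a 3-4 sentence window of >= 15 words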
172
+ def find_semantic_matches(
173
+ doc_text: str,
174
+ source_text: str,
175
+ threshold: float = 0.50
176
+ ) -> List[Dict]:
177
+ """
178
+ Find semantically similar passages using MiniLM embeddings.
179
+ Detects paraphrased content from LLMs.
180
+
181
+ threshold: 0.65+ = high confidence, 0.50+ = catch paraphrasing, 0.35+ = catch weak matches
182
+ """
183
+ try:
184
+ encoder = get_encoder()
185
+ except ImportError:
186
+ logger.warning("⚠️ SentenceTransformer not available, falling back to string matching")
187
+ return _fallback_semantic_matches(doc_text, source_text, threshold)
188
+
189
+ # Split into sentences
190
+ doc_sentences = split_into_sentences(doc_text)
191
+ source_sentences = split_into_sentences(source_text)
192
+
193
+ if not doc_sentences or not source_sentences:
194
+ return []
195
+
196
+ logger.debug(f" Encoding {len(doc_sentences)} doc sentences + {len(source_sentences)} source sentences...")
197
+
198
+ # Encode all sentences
199
+ try:
200
+ doc_embeddings = encoder.encode(doc_sentences, convert_to_numpy=True, show_progress_bar=False)
201
+ source_embeddings = encoder.encode(source_sentences, convert_to_numpy=True, show_progress_bar=False)
202
+ except Exception as e:
203
+ logger.error(f" Encoding error: {e}, falling back to string matching")
204
+ return _fallback_semantic_matches(doc_text, source_text, threshold)
205
+
206
+ matches = []
207
+ matched_source_indices = set()
208
+
209
+ # Compare using cosine similarity
210
+ for doc_idx, doc_emb in enumerate(doc_embeddings):
211
+ best_similarity = 0.0
212
+ best_source_idx = -1
213
+
214
+ for source_idx, source_emb in enumerate(source_embeddings):
215
+ if source_idx in matched_source_indices:
216
+ continue
217
+
218
+ # Cosine similarity
219
+ similarity = np.dot(doc_emb, source_emb) / (
220
+ np.linalg.norm(doc_emb) * np.linalg.norm(source_emb) + 1e-8
221
+ )
222
+
223
+ if similarity > best_similarity:
224
+ best_similarity = similarity
225
+ best_source_idx = source_idx
226
+
227
+ # Record if above threshold
228
+ if best_similarity >= threshold and best_source_idx >= 0:
229
+ if best_source_idx not in matched_source_indices:
230
+ matched_source_indices.add(best_source_idx)
231
+
232
+ matches.append({
233
+ 'doc_text': doc_sentences[doc_idx],
234
+ 'source_text': source_sentences[best_source_idx],
235
+ 'similarity': float(best_similarity),
236
+ 'doc_index': doc_idx,
237
+ 'source_index': best_source_idx
238
+ })
239
+
240
+ logger.debug(f" Found {len(matches)} semantic matches (threshold: {threshold})")
241
+ return matches
242
+
243
+ def _fallback_semantic_matches(doc_text: str, source_text: str, threshold: float) -> List[Dict]:
244
+ """Fallback to string matching if embeddings not available"""
245
+ from difflib import SequenceMatcher
246
+
247
+ doc_sentences = split_into_sentences(doc_text)
248
+ source_sentences = split_into_sentences(source_text)
249
+
250
+ if not doc_sentences or not source_sentences:
251
+ return []
252
+
253
+ matches = []
254
+ matched_source_indices = set()
255
+
256
+ for doc_idx, doc_sent in enumerate(doc_sentences):
257
+ best_similarity = 0.0
258
+ best_source_idx = -1
259
+
260
+ for source_idx, source_sent in enumerate(source_sentences):
261
+ if source_idx in matched_source_indices:
262
+ continue
263
+
264
+ ratio = SequenceMatcher(None, doc_sent.lower(), source_sent.lower()).ratio()
265
+
266
+ if ratio > best_similarity:
267
+ best_similarity = ratio
268
+ best_source_idx = source_idx
269
+
270
+ if best_similarity >= threshold and best_source_idx >= 0:
271
+ if best_source_idx not in matched_source_indices:
272
+ matched_source_indices.add(best_source_idx)
273
+
274
+ matches.append({
275
+ 'doc_text': doc_sentences[doc_idx],
276
+ 'source_text': source_sentences[best_source_idx],
277
+ 'similarity': float(best_similarity),
278
+ 'doc_index': doc_idx,
279
+ 'source_index': best_source_idx
280
+ })
281
+
282
+ return matches
283
+
284
+ def calculate_semantic_similarity(text1: str, text2: str) -> float:
285
+ """Calculate semantic similarity between two texts"""
286
+ try:
287
+ encoder = get_encoder()
288
+ embeddings = encoder.encode([text1, text2], convert_to_numpy=True, show_progress_bar=False)
289
+ similarity = np.dot(embeddings[0], embeddings[1]) / (
290
+ np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1]) + 1e-8
291
+ )
292
+ return float(similarity)
293
+ except Exception:
294
+ from difflib import SequenceMatcher
295
+ return SequenceMatcher(None, text1.lower(), text2.lower()).ratio()
296
+
297
+ def compare_semantic_chunks(
298
+ doc_text: str,
299
+ source_text: str,
300
+ chunk_size: int = 200,
301
+ threshold: float = 0.65
302
+ ) -> List[Dict]:
303
+ """
304
+ Compare document chunks against source chunks using semantic similarity.
305
+ """
306
+ def chunk_text(text, size):
307
+ words = text.split()
308
+ chunks = []
309
+ for i in range(0, len(words), size):
310
+ chunk = ' '.join(words[i:i+size])
311
+ if len(chunk.split()) >= 20:
312
+ chunks.append(chunk)
313
+ return chunks
314
+
315
+ doc_chunks = chunk_text(doc_text, chunk_size)
316
+ source_chunks = chunk_text(source_text, chunk_size)
317
+
318
+ if not doc_chunks or not source_chunks:
319
+ return []
320
+
321
+ try:
322
+ encoder = get_encoder()
323
+ doc_embeddings = encoder.encode(doc_chunks, convert_to_numpy=True, show_progress_bar=False)
324
+ source_embeddings = encoder.encode(source_chunks, convert_to_numpy=True, show_progress_bar=False)
325
+
326
+ matches = []
327
+ for i, doc_emb in enumerate(doc_embeddings):
328
+ for j, source_emb in enumerate(source_embeddings):
329
+ similarity = np.dot(doc_emb, source_emb) / (
330
+ np.linalg.norm(doc_emb) * np.linalg.norm(source_emb) + 1e-8
331
+ )
332
+
333
+ if similarity >= threshold:
334
+ matches.append({
335
+ 'doc_text': doc_chunks[i][:200] + "...",
336
+ 'source_text': source_chunks[j][:200] + "...",
337
+ 'similarity': float(similarity),
338
+ })
339
+
340
+ return matches
341
+ except Exception:
342
+ return []
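A minimal sketch of the sentence-level matcher on two paraphrased snippets; the score depends on the embedding model, so the exact value is illustrative:

    doc = "Solar panels convert sunlight into electricity using photovoltaic cells."
    src = "Photovoltaic cells inside solar panels turn incoming sunlight into electrical power."
    for m in find_semantic_matches(doc, src, threshold=0.50):
        print(round(m["similarity"], 2), "|", m["doc_text"], "<->", m["source_text"])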
app/utils/web_utils.py ADDED
@@ -0,0 +1,545 @@
1
+ import os
+ from typing import Dict, List, Optional
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import time
5
+ from app.config import API_KEYS, SEARCH_ENGINE_IDS
6
+ import logging
7
+ import hashlib
8
+ from functools import lru_cache
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError as FuturesTimeoutError
10
+ from requests.adapters import HTTPAdapter
11
+ from urllib3.util.retry import Retry
12
+ from urllib.parse import urlparse
13
+ from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
14
+ import re
15
+ import threading
16
+
17
+
18
+ from ..logger import logger
19
+ logger.info("Scraper started")
20
+
21
+ # Optional fallback
22
+ try:
23
+ import cloudscraper
24
+ CLOUDSCRAPER_AVAILABLE = True
25
+ except Exception:
26
+ CLOUDSCRAPER_AVAILABLE = False
27
+
28
+ # ---- OPTIMIZED Configuration ----
29
+ MAX_WORKERS = 8 # Increased for parallelism
30
+ REQUEST_TIMEOUT = 5 # Reduced from 6 - fail faster
31
+ PER_URL_TIMEOUT = 8 # Reduced from 10
32
+ POLITENESS_DELAY = 0.1 # Reduced from 0.25
33
+ GOOGLE_NUM_DEFAULT = 10
34
+ MIN_TEXT_LENGTH = 700
35
+ GOOGLE_API_TIMEOUT = 6
36
+ BRAVE_API_KEY = os.getenv("BRAVE_API_KEY", "")  # read from the environment; never commit live API keys
37
+ BRAVE_ENDPOINT = "https://api.search.brave.com/res/v1/web/search"
38
+
39
+ _api_key_index = 0
40
+ _api_key_lock = threading.Lock()
41
+
42
+ def _get_next_api_credentials():
43
+ """Get next API key and search engine ID in round-robin fashion"""
44
+ global _api_key_index
45
+ with _api_key_lock:
46
+ key = API_KEYS[_api_key_index]
47
+ engine_id = SEARCH_ENGINE_IDS[_api_key_index]
48
+ _api_key_index = (_api_key_index + 1) % len(API_KEYS)
49
+ return key, engine_id
50
+
51
+ # βœ… HARD TIMEOUT: 3 minutes (180 seconds) for ALL scraping
52
+ MULTI_QUERY_TIMEOUT = 180
53
+
54
+ # Blacklist slow/unscrapeable domains
55
+ BLACKLIST_DOMAINS = {
56
+ 'neurips.cc',
57
+ 'icml.cc',
58
+ 'jmlr.org',
59
+ 'researchgate.net',
60
+ 'arxiv.org',
61
+ 'springer.com',
62
+ 'nature.com',
63
+ 'nips.cc',
64
+ 'iccv2023.thecvf.com'
65
+ }
66
+
67
+ # Logging
68
+ logging.basicConfig(level=logging.INFO)
69
+ logger = logging.getLogger("scraper")
70
+
71
+ # ---- Session ----
72
+ def _make_session() -> requests.Session:
73
+ s = requests.Session()
74
+ retries = Retry(
75
+ total=1, # Only 1 retry - fail fast
76
+ backoff_factor=0.1, # Minimal backoff
77
+ status_forcelist=[429, 500, 502, 503, 504],
78
+ allowed_methods=["GET", "POST"],
79
+ respect_retry_after_header=False # Don't wait for Retry-After
80
+ )
81
+ s.mount("https://", HTTPAdapter(max_retries=retries, pool_connections=20, pool_maxsize=20))
82
+ s.mount("http://", HTTPAdapter(max_retries=retries, pool_connections=20, pool_maxsize=20))
83
+ s.headers.update({
84
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
85
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
86
+ "Accept-Language": "en-US,en;q=0.5",
87
+ "Connection": "keep-alive",
88
+ })
89
+ return s
90
+
91
+ _SESSION = _make_session()
92
+
93
+ # ---- Global timeout tracking ----
94
+ _scraping_start_time = None
95
+ _scraping_deadline = None
96
+
97
+ def _set_scraping_deadline():
98
+ """Set the scraping deadline to 3 minutes from now"""
99
+ global _scraping_start_time, _scraping_deadline
100
+ _scraping_start_time = time.time()
101
+ _scraping_deadline = _scraping_start_time + MULTI_QUERY_TIMEOUT
102
+
103
+ def _time_remaining() -> float:
104
+ """Get remaining time in seconds"""
105
+ if _scraping_deadline is None:
106
+ return MULTI_QUERY_TIMEOUT
107
+ remaining = _scraping_deadline - time.time()
108
+ return max(0, remaining)
109
+
110
+ def _is_timeout_exceeded() -> bool:
111
+ """Check if timeout has been exceeded"""
112
+ return _time_remaining() <= 0
113
+
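These helpers implement a simple shared-deadline pattern: arm the budget once per batch, then have every worker poll it cooperatively. A sketch of the intended call order (the loop below is hypothetical):

    _set_scraping_deadline()        # arm the 180s budget
    for url in urls:                # 'urls' is whatever the caller gathered
        if _is_timeout_exceeded():  # workers bail out instead of being killed
            break
        scrape_page(url)
    logger.info(f"{_time_remaining():.0f}s of scraping budget left")

One caveat worth noting: the deadline lives in module-level globals, so concurrent batches in the same process share (and overwrite) each other's budget.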
114
+ # ---- Helpers ----
115
+ def _normalize_whitespace(s: str) -> str:
116
+ return " ".join(s.split())
117
+
118
+ def _text_hash(text: str) -> str:
119
+ return hashlib.sha256(text.encode("utf-8")).hexdigest()
120
+
121
+ def _clean_soup(soup: BeautifulSoup, prefer_main: bool = True, max_chars: Optional[int] = None) -> str:
122
+ for junk in soup(["script", "style", "nav", "footer", "noscript", "header"]):
123
+ junk.decompose()
124
+ parts = []
125
+ if prefer_main:
126
+ main = soup.find(["main", "article", "section"])
127
+ if main:
128
+ elems = main.find_all(["p", "h1", "h2", "h3", "li"])
129
+ else:
130
+ elems = soup.find_all(["p", "h1", "h2", "h3"])
131
+ else:
132
+ elems = soup.find_all(["p", "li"])
133
+ for el in elems:
134
+ t = el.get_text(separator=" ", strip=True)
135
+ if t and len(t) > 30:
136
+ parts.append(t)
137
+ text = _normalize_whitespace(" ".join(parts))
138
+ if max_chars and len(text) > max_chars:
139
+ return text[:max_chars]
140
+ return text
141
+
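A small sketch of what _clean_soup keeps and drops (the HTML is invented):

    from bs4 import BeautifulSoup

    html = (
        "<html><head><script>track()</script></head><body>"
        "<nav>Home | About</nav>"
        "<main><p>This paragraph is comfortably longer than the thirty-character floor, so it survives.</p>"
        "<p>short</p></main>"
        "<footer>(c) 2024</footer></body></html>"
    )
    print(_clean_soup(BeautifulSoup(html, "html.parser")))
    # -> only the long <main> paragraph; script, nav, footer and short fragments are stripped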
142
+ # ---- Google Search ----
143
+ @lru_cache(maxsize=256)
144
+ def google_search(query: str, num_results: int = 10):
145
+ api_key, search_engine_id = _get_next_api_credentials()
146
+
147
+ if not api_key or not search_engine_id:
148
+ logger.warning("Missing API_KEY or SEARCH_ENGINE_ID in config.py")
149
+ return []
150
+
151
+ url = "https://www.googleapis.com/customsearch/v1"
152
+ params = {"key": api_key, "cx": search_engine_id, "q": query, "num": num_results}
153
+ try:
154
+ r = _SESSION.get(url, params=params, timeout=GOOGLE_API_TIMEOUT)
155
+ r.raise_for_status()
156
+ data = r.json()
157
+ items = data.get("items", []) or []
158
+ out = []
159
+ for i in items:
160
+ out.append({
161
+ "link": i.get("link"),
162
+ "title": i.get("title", ""),
163
+ "snippet": i.get("snippet", "")
164
+ })
165
+ logger.info(f"google_search: got {len(out)} items for '{query[:60]}'")
166
+ return out
167
+ except Exception as e:
168
+ logger.warning(f"google_search failed: {e}")
169
+ return []
170
+
171
+ # ---- IMPROVED: Cloudscraper with early exit ----
172
+ def scrape_with_cloudscraper(url: str, timeout: int = 8):
173
+ if not CLOUDSCRAPER_AVAILABLE:
174
+ return ""
175
+ try:
176
+ logger.debug(f"cloudscraper: GET {url}")
177
+ scraper = cloudscraper.create_scraper()
178
+ r = scraper.get(url, timeout=timeout)
179
+ r.raise_for_status()
180
+ soup = BeautifulSoup(r.text, "html.parser")
181
+ text = _clean_soup(soup, prefer_main=True, max_chars=20000)
182
+ if text and len(text) > MIN_TEXT_LENGTH:
183
+ logger.info(f" βœ… Scraped {len(text)} chars (cloudscraper) for {url}")
184
+ return text
185
+ return ""
186
+ except Exception as e:
187
+ logger.debug(f"cloudscraper error for {url}: {e}")
188
+ return ""
189
+
190
+ # ---- IMPROVED: Playwright with hard timeout ----
191
+ def scrape_with_playwright(url: str, timeout: int = 8):
192
+ """Scrape JS-heavy pages with aggressive waiting and content extraction."""
193
+ try:
194
+ logger.debug(f"Playwright: GET {url}")
195
+ with sync_playwright() as p:
196
+ browser = p.chromium.launch(
197
+ headless=True,
198
+ args=[
199
+ "--disable-blink-features=AutomationControlled",
200
+ "--disable-dev-shm-usage",
201
+ "--no-sandbox",
202
+ "--disable-gpu"
203
+ ]
204
+ )
205
+ context = browser.new_context(
206
+ user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
207
+ )
208
+ context.set_default_timeout(timeout * 1000)
209
+ page = context.new_page()
210
+
211
+ try:
212
+ # Navigate with longer timeout for JS-heavy sites
213
+ page.goto(url, wait_until="networkidle", timeout=timeout * 1000)
214
+
215
+ # Wait for common content containers to load
216
+ try:
217
+ page.wait_for_selector("article, [role='article'], .post-content, .blog-content, main", timeout=3000)
218
+ except Exception:
219
+ pass # Selector may not exist, continue anyway
220
+
221
+ # Extended wait for dynamic content
222
+ page.wait_for_timeout(3000) # 3 seconds for rendering
223
+
224
+ # Scroll down to load lazy-loaded images/content
225
+ page.evaluate("""
226
+ async () => {
227
+ await new Promise((resolve) => {
228
+ let totalHeight = 0;
229
+ const distance = 100;
230
+ const timer = setInterval(() => {
231
+ window.scrollBy(0, distance);
232
+ totalHeight += distance;
233
+ if (totalHeight >= document.body.scrollHeight) {
234
+ clearInterval(timer);
235
+ resolve();
236
+ }
237
+ }, 100);
238
+ });
239
+ }
240
+ """)
241
+
242
+ # Wait after scrolling
243
+ page.wait_for_timeout(1500)
244
+
245
+ # Scroll back to top
246
+ page.evaluate("window.scrollTo(0, 0)")
247
+ page.wait_for_timeout(500)
248
+
249
+ html = page.content()
250
+ finally:
251
+ context.close()
252
+ browser.close()
253
+
254
+ soup = BeautifulSoup(html, "html.parser")
255
+ text = _clean_soup(soup, prefer_main=True, max_chars=25000)
256
+
257
+ if text and len(text) > MIN_TEXT_LENGTH:
258
+ logger.info(f" βœ… Scraped {len(text)} chars (Playwright) for {url}")
259
+ return text
260
+
261
+ # If we got very little content, try a less aggressive cleanup
262
+ logger.debug(f"Initial extraction got {len(text)} chars, trying aggressive extraction")
263
+ text_aggressive = _clean_soup(soup, prefer_main=False, max_chars=25000)
264
+ if text_aggressive and len(text_aggressive) > MIN_TEXT_LENGTH and len(text_aggressive) > len(text):
265
+ logger.info(f" βœ… Scraped {len(text_aggressive)} chars (Playwright aggressive) for {url}")
266
+ return text_aggressive
267
+
268
+ logger.debug(f"Playwright extraction minimal for {url}: {len(text)} chars")
269
+ return text if text else ""
270
+
271
+ except Exception as e:  # PlaywrightTimeoutError is already an Exception subclass
272
+ logger.debug(f"Playwright error for {url}: {e}")
273
+ return ""
274
+
275
+ # ---- IMPROVED: Smart fallback strategy ----
276
+ def scrape_page(url: str, timeout: int = 5):
277
+ """Scrape pages in order: requests β†’ cloudscraper β†’ Playwright."""
278
+ # βœ… HARD CHECK: Exit immediately if timeout exceeded
279
+ if _is_timeout_exceeded():
280
+ logger.warning(f"Scraping timeout exceeded, skipping {url}")
281
+ return ""
282
+
283
+ try:
284
+ domain = urlparse(url).netloc.lower()
285
+
286
+ # Skip blacklisted domains entirely
287
+ if any(bd in domain for bd in BLACKLIST_DOMAINS):
288
+ logger.info(f"Skipping blacklisted domain: {domain}")
289
+ return ""
290
+
291
+ # --- 1️⃣ Requests (fastest) ---
292
+ headers = {
293
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
294
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
295
+ "Accept-Language": "en-US,en;q=0.5",
296
+ }
297
+ try:
298
+ logger.debug(f"requests: GET {url}")
299
+ r = _SESSION.get(url, headers=headers, timeout=timeout, allow_redirects=True)
300
+ r.raise_for_status()
301
+ soup = BeautifulSoup(r.text, "html.parser")
302
+ text = _clean_soup(soup, prefer_main=True, max_chars=20000)
303
+ if text and len(text) > MIN_TEXT_LENGTH:
304
+ logger.info(f" βœ… Scraped {len(text)} chars (requests) for {url}")
305
+ return text
306
+ except Exception as e:
307
+ logger.debug(f"requests failed for {url}: {e}")
308
+
309
+ # βœ… HARD CHECK: Exit if timeout exceeded between attempts
310
+ if _is_timeout_exceeded():
311
+ logger.warning(f"Scraping timeout exceeded, stopping fallback methods for {url}")
312
+ return ""
313
+
314
+ # --- 2️⃣ Cloudscraper (medium) ---
315
+ if CLOUDSCRAPER_AVAILABLE:
316
+ try:
317
+ logger.debug(f"cloudscraper: GET {url}")
318
+ scraper = cloudscraper.create_scraper()
319
+ r = scraper.get(url, timeout=timeout)
320
+ r.raise_for_status()
321
+ soup = BeautifulSoup(r.text, "html.parser")
322
+ text = _clean_soup(soup, prefer_main=True, max_chars=20000)
323
+ if text and len(text) > MIN_TEXT_LENGTH:
324
+ logger.info(f" βœ… Scraped {len(text)} chars (cloudscraper) for {url}")
325
+ return text
326
+ except Exception as e:
327
+ logger.debug(f"cloudscraper failed for {url}: {e}")
328
+
329
+ # βœ… HARD CHECK: Exit if timeout exceeded before Playwright
330
+ if _is_timeout_exceeded():
331
+ logger.warning(f"Scraping timeout exceeded, skipping Playwright for {url}")
332
+ return ""
333
+
334
+ # --- 3️⃣ Playwright (heaviest) ---
335
+ try:
336
+ logger.debug(f"Playwright: GET {url}")
337
+ res = scrape_with_playwright(url, timeout=12)
338
+ if res and len(res) > MIN_TEXT_LENGTH:
339
+ return res
340
+ except Exception as e:
341
+ logger.debug(f"Playwright failed for {url}: {e}")
342
+
343
+ except Exception as e:
344
+ logger.debug(f"All scrapers failed for {url}: {e}")
345
+
346
+ return ""
347
+
348
+ # ---- IMPROVED: Parallel fetch with HARD OVERALL TIMEOUT ----
349
+ def fetch_sources(query: str, num_results: int = 10):
350
+ """Fetch sources for a single query"""
351
+ logger.info(f"Fetching sources for query: '{query[:60]}'")
352
+ items = google_search(query, num_results=num_results)
353
+ if not items:
354
+ logger.warning("No URLs returned from Google Search")
355
+ return []
356
+
357
+ urls = [it["link"] for it in items if it.get("link")]
358
+ snippets = {it["link"]: it.get("snippet", "") for it in items if it.get("link")}
359
+ domain_last_hit: Dict[str, float] = {}
360
+
361
+ def _scrape_task(u: str) -> Dict:
362
+ try:
363
+ dom = urlparse(u).netloc
364
+ last = domain_last_hit.get(dom, 0.0)
365
+ now = time.time()
366
+ delta = now - last
367
+ if delta < POLITENESS_DELAY:
368
+ time.sleep(POLITENESS_DELAY - delta)
369
+ domain_last_hit[dom] = time.time()
370
+ logger.info(f"Scraping URL: {u}")
371
+
372
+ text = scrape_page(u, timeout=REQUEST_TIMEOUT)
373
+ if text:
374
+ return {"url": u, "content": text, "method": "scraped", "len": len(text), "hash": _text_hash(text[:4000])}
375
+ else:
376
+ return {"url": u, "content": "", "method": "failed", "len": 0, "hash": ""}
377
+ except Exception as e:
378
+ logger.debug(f"Exception in _scrape_task for {u}: {e}")
379
+ return {"url": u, "content": "", "method": "error", "len": 0, "hash": ""}
380
+
381
+ results = []
382
+ with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
383
+ future_to_url = {ex.submit(_scrape_task, u): u for u in urls}
384
+ for fut in as_completed(future_to_url):
385
+ u = future_to_url[fut]
386
+ try:
387
+ r = fut.result(timeout=PER_URL_TIMEOUT)
388
+ if r.get("content"):
389
+ results.append(r)
390
+ except FuturesTimeoutError:
391
+ logger.warning(f"Timeout scraping {u}")
392
+ except Exception as e:
393
+ logger.warning(f"Error scraping {u}: {e}")
394
+
395
+ if not results:
396
+ logger.info("No pages scraped; using snippets")
397
+ for u in urls:
398
+ snip = snippets.get(u, "")
399
+ if snip:
400
+ results.append({"url": u, "content": snip, "method": "google_snippet", "len": len(snip), "hash": _text_hash(snip)})
401
+ return results
402
+
403
+     # Deduplicate
+     seen: Dict[str, Dict] = {}
+     for r in results:
+         h = r.get("hash") or _text_hash(r.get("content", ""))
+         cur = seen.get(h)
+         if not cur or (r.get("len", 0) > cur.get("len", 0)):
+             seen[h] = r
+
+     deduped = list(seen.values())
+
+     # Fill from snippets if needed
+     if len(deduped) < num_results:
+         for u in urls:
+             if any(x["url"] == u for x in deduped):
+                 continue
+             snip = snippets.get(u, "")
+             if snip:
+                 deduped.append({"url": u, "content": snip, "method": "google_snippet", "len": len(snip), "hash": _text_hash(snip)})
+             if len(deduped) >= num_results:
+                 break
+
+     logger.info(f"Fetched {len(deduped)} sources for query")
+     return deduped
+
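+ # Every record fetch_sources() returns has the same shape; only "scraped" and
+ # "google_snippet" methods survive the empty-content filter above:
+ #
+ #     {"url": str, "content": str, "method": "scraped" | "google_snippet",
+ #      "len": int, "hash": str}
+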
+ # ---- NEW: Multi-query fetch - accepts pre-generated queries ----
+ def fetch_sources_multi_query(query: str, num_results: int = 10) -> List[Dict[str, str]]:
+     """
+     Fetch sources for a single pre-generated query.
+     No internal query generation happens here - the query is scraped as given.
+     """
+     _set_scraping_deadline()
+
+     logger.info("Processing single query with overall 3-minute timeout")
+
+     all_sources: Dict[str, Dict] = {}
+     DOC_EXTENSIONS = [".pdf", ".doc", ".docx", ".odf", ".xls", ".xlsx", ".ppt", ".pptx"]
+     lock = threading.Lock()
+
+     def _process_url(u: str, items: List) -> Optional[Dict]:
+         """Process a single URL, honouring the overall deadline."""
+         # ✅ HARD CHECK: exit immediately if the deadline has passed
+         if _is_timeout_exceeded():
+             return None
+
+         with lock:
+             if u in all_sources:
+                 return None
+         if any(ext in u.lower() for ext in DOC_EXTENSIONS):
+             logger.info(f"Skipping document URL: {u}")
+             return None
+
+         logger.info(f"Scraping URL: {u}")
+
+         try:
+             # Try scraping the page first
+             text_content = scrape_page(u, timeout=REQUEST_TIMEOUT)
+             if text_content and len(text_content) > MIN_TEXT_LENGTH:
+                 logger.info(f"  ✅ Scraped {len(text_content)} chars for {u}")
+                 return {"url": u, "content": text_content, "source_url": u}
+
+             # If scraping failed, fall back to the search snippet
+             snippet = next((it.get("snippet", "") for it in items if it.get("link") == u), "")
+             if snippet and len(snippet) > 50:
+                 logger.info(f"  ⚠️ Using snippet ({len(snippet)} chars) for {u}")
+                 return {"url": u, "content": snippet, "source_url": u}
+
+             logger.warning(f"  ❌ No content extracted for {u}")
+             return None
+
+         except Exception as e:
+             logger.debug(f"Error scraping {u}: {e}")
+             return None
+
+     # ✅ HARD CHECK: exit if the deadline has already passed
+     if _is_timeout_exceeded():
+         logger.warning("Timeout exceeded, skipping query")
+         return []
+
+     logger.info(f"Query: '{query[:60]}'")
+     items = google_search(query, num_results=num_results)
+     if not items:
+         return []
+
+     urls = [it["link"] for it in items if it.get("link")]
+
+     # Process this query's URLs in parallel
+     with ThreadPoolExecutor(max_workers=4) as url_ex:
+         url_futures = {url_ex.submit(_process_url, u, items): u for u in urls}
+         for fut in as_completed(url_futures):
+             # ✅ HARD CHECK: stop processing once the deadline passes
+             if _is_timeout_exceeded():
+                 logger.warning("Timeout exceeded, stopping URL processing")
+                 for f in url_futures:
+                     f.cancel()
+                 break
+
+             u = url_futures[fut]
+             try:
+                 result = fut.result(timeout=PER_URL_TIMEOUT)
+                 if result:
+                     with lock:
+                         all_sources[result['url']] = result
+                     logger.info(f"  ✅ Added: {result['url'][:50]}")
+             except FuturesTimeoutError:
+                 logger.warning(f"Timeout on URL: {u}")
+             except Exception as e:
+                 logger.debug(f"Error scraping {u}: {e}")
+             time.sleep(0.05)
+
+     res = list(all_sources.values())
+     elapsed = time.time() - _scraping_start_time if _scraping_start_time else 0
+     logger.info(f"Sources for this query: {len(res)} (elapsed: {elapsed:.1f}s)")
+     return res
+
+
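+ # Usage sketch (illustrative; `generate_queries` is a hypothetical upstream helper,
+ # not defined in this module - queries are expected to arrive pre-generated):
+ #
+ #     for q in generate_queries(document_text):
+ #         sources = fetch_sources_multi_query(q, num_results=8)
+ #         all_results.extend(sources)
+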
+ def fetch_brave_sources(query: str, num_results: int = 5) -> list[dict]:
+     """Fetch sources from the Brave Search API."""
+     headers = {
+         "Accept": "application/json",
+         # Brave's Web Search API authenticates via the X-Subscription-Token header
+         "X-Subscription-Token": BRAVE_API_KEY,
+     }
+     params = {"q": query, "count": num_results}
+
+     try:
+         resp = requests.get(BRAVE_ENDPOINT, headers=headers, params=params, timeout=6)
+         resp.raise_for_status()
+         data = resp.json()
+         results = []
+         # Brave returns web hits under data["web"]["results"]
+         for item in data.get("web", {}).get("results", []):
+             results.append({
+                 "title": item.get("title") or item.get("url") or "Unknown",
+                 "content": item.get("description") or "",
+             })
+         return results
+     except Exception as e:
+         logger.warning(f"Brave search failed for query '{query}': {e}")
+         return []
+
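+ # Abridged Brave Web Search response shape assumed above (per Brave's API docs;
+ # verify against the subscribed plan before relying on it):
+ #
+ #     {"web": {"results": [{"title": "...", "url": "...", "description": "..."}, ...]}}
+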
+ def prepare_brave_query(text: str, min_len: int = 20, max_len: int = 200) -> Optional[str]:
+     """Collapse whitespace and truncate at a word boundary; return None if too short."""
+     t = re.sub(r"\s+", " ", text).strip()
+     if len(t) < min_len:
+         return None
+     if len(t) > max_len:
+         t = t[:max_len].rsplit(" ", 1)[0]
+     return t
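+
+ # How the two Brave helpers compose (illustrative sketch):
+ #
+ #     q = prepare_brave_query(sentence)          # normalise whitespace, cap at 200 chars
+ #     if q:                                      # None means the sentence was too short
+ #         hits = fetch_brave_sources(q)          # [{"title": ..., "content": ...}, ...]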