Manas281 committed on
Commit
a405e08
·
verified ·
1 Parent(s): 37bf8d6

Upload 4 files

Browse files
Files changed (4) hide show
  1. .dockerignore +11 -0
  2. Dockerfile +15 -0
  3. main.py +1028 -0
  4. requirements.txt +8 -0
.dockerignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.pyd
5
+ *.log
6
+ .env
7
+ .venv/
8
+ venv/
9
+ .git/
10
+ .gitignore
11
+ scholar-s-shield/
Dockerfile ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image: slim variant of the official Python 3.12 image.
FROM python:3.12-slim

# PYTHONDONTWRITEBYTECODE: do not write .pyc files inside the container.
# PYTHONUNBUFFERED: do not buffer stdout/stderr, so log lines show up immediately.
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

WORKDIR /app

# Dependencies are installed before the application code is copied, so this
# layer is only rebuilt when requirements.txt changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# The application is the single module main.py.
COPY main.py .

EXPOSE 8000

# Serve the FastAPI object `app` from main.py on all interfaces, port 8000.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
main.py ADDED
@@ -0,0 +1,1028 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import requests
4
+ import random
5
+ import re
6
+ from difflib import SequenceMatcher
7
+ from typing import List, Optional, Dict, Any
8
+ from urllib.parse import quote_plus
9
+
10
+ from fastapi import FastAPI, UploadFile, File, HTTPException
11
+ from fastapi.middleware.cors import CORSMiddleware
12
+ import uvicorn
13
+ from pydantic import BaseModel
14
+ from PyPDF2 import PdfReader
15
+
16
+ from langchain_groq import ChatGroq
17
+ from langchain_core.prompts import ChatPromptTemplate
18
+
19
+ # ==========================================
20
+ # 1. Environment & API Setup
21
+ # ==========================================
22
# Secrets are read from the environment only. Real keys must never be committed
# as fallback defaults: anything in source control has to be treated as leaked
# (and rotated), and a hard-coded default also prevents the warning below from
# ever firing.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
SERPER_API_KEY = os.getenv("SERPER_API_KEY")
# Optional: when unset, the Semantic Scholar helpers send no x-api-key header.
SEMANTIC_SCHOLAR_API_KEY = os.getenv("SEMANTIC_SCHOLAR_API_KEY")
SEMANTIC_SCHOLAR_BASE_URL = "https://api.semanticscholar.org/graph/v1"
# Minimum spacing between two consecutive Semantic Scholar requests.
SEMANTIC_SCHOLAR_MIN_INTERVAL_SECONDS = 1.2
# Total number of attempts per Semantic Scholar request (HTTP 429 triggers a retry).
SEMANTIC_SCHOLAR_MAX_RETRIES = 4

if not GROQ_API_KEY or not SERPER_API_KEY:
    print("WARNING: GROQ_API_KEY or SERPER_API_KEY is missing!")
31
+
32
# Shared chat model used by the LLM-backed checks in this module.
# NOTE(review): no api_key is passed here, so the client presumably reads
# GROQ_API_KEY from the environment itself - confirm against langchain_groq.
llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0.1)

# Basic Memory Cache to maintain API efficiency (as promised in the application)
# query_cache: truncated search query -> combined web + academic results.
query_cache = {}
# semantic_query_cache: normalized keyword query -> Semantic Scholar results.
semantic_query_cache: Dict[str, List[Dict[str, str]]] = {}
# time.time() of the most recent Semantic Scholar request; used for throttling.
_last_semantic_scholar_call_ts = 0.0
38
+
39
+ # ==========================================
40
+ # 2. Pydantic Models
41
+ # ==========================================
42
class MatchReport(BaseModel):
    # Result of checking one text chunk against the search results.
    chunk_text: str  # the chunk that was analyzed
    is_plagiarized: bool  # True once a source crossed a match threshold
    plagiarism_type: Optional[str] = None  # label of the match kind; None when nothing matched
    source_url: Optional[str] = None  # URL of the best-matching source, if any
    source_type: Optional[str] = None # "Academic" or "Web"
    similarity_score: float  # best similarity found for the chunk
49
+
50
class PlagiarismReport(BaseModel):
    # Document-level summary with the per-chunk match details.
    filename: str  # name of the scanned file
    total_words: int  # word count of the document
    plagiarized_words: int  # word count attributed to flagged chunks
    overall_plagiarism_score: float  # NOTE(review): presumably a percentage - confirm with the producer
    severity_level: str # Low, Medium, High, Very High
    details: List[MatchReport]  # one entry per analyzed chunk
57
+
58
class DetailedPlagiarismReport(BaseModel):
    """Comprehensive report generated by LLM"""
    filename: str  # name of the scanned file
    scan_timestamp: str  # ISO-8601 timestamp of report generation
    executive_summary: str  # short summary text
    overall_score: float  # overall plagiarism score, rounded to 2 decimals by the generator
    severity_level: str  # "Low", "Medium", "High" or "Very High"
    matched_sources: List[Dict[str, Any]]  # entries with "url", "type", "max_similarity"
    key_findings: List[str]  # bullet points extracted from the LLM answer (or defaults)
    plagiarism_breakdown: Dict[str, Any] # Types and percentages
    detailed_analysis: str # LLM-generated detailed analysis
    affected_sections: List[Dict[str, Any]] # Which parts are problematic
    recommendations: List[str]  # remediation advice (LLM-provided or defaults)
    academic_integrity_risk: str # Assessment level
72
+
73
# FastAPI application object served by uvicorn as "main:app".
app = FastAPI(title="Pro Plagiarism Detector (Turnitin Clone)")

# NOTE(review): CORS is fully open here - every origin, method and header is
# allowed AND credentials are enabled. Confirm this is intended outside of
# local development; an explicit origin allow-list is the usual production setup.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
82
+
83
+ # ==========================================
84
+ # 3. Agent Tools: Serper & Semantic Scholar
85
+ # ==========================================
86
+
87
def _semantic_scholar_headers() -> Dict[str, str]:
    """Return the HTTP headers for a Semantic Scholar request.

    The result is empty when no API key is configured; otherwise it carries
    the key in the ``x-api-key`` header.
    """
    if not SEMANTIC_SCHOLAR_API_KEY:
        return {}
    # API key must be sent in x-api-key header.
    return {"x-api-key": SEMANTIC_SCHOLAR_API_KEY}
93
+
94
+
95
def _semantic_scholar_get(path: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Send a throttled GET to the Semantic Scholar API and return the decoded JSON.

    Parameters whose value is ``None`` are dropped. Consecutive calls are spaced
    at least ``SEMANTIC_SCHOLAR_MIN_INTERVAL_SECONDS`` apart. An HTTP 429 answer
    is retried (up to ``SEMANTIC_SCHOLAR_MAX_RETRIES`` attempts in total),
    waiting for ``Retry-After`` when it is a plain number of seconds and
    otherwise backing off exponentially with jitter.

    Raises:
        requests.HTTPError: for any non-success status that is not retried.
    """
    global _last_semantic_scholar_call_ts

    query: Dict[str, Any] = {}
    for key, value in (params or {}).items():
        if value is not None:
            query[key] = value

    url = f"{SEMANTIC_SCHOLAR_BASE_URL}{path}"
    final_attempt = SEMANTIC_SCHOLAR_MAX_RETRIES - 1

    attempt = 0
    while attempt < SEMANTIC_SCHOLAR_MAX_RETRIES:
        # Client-side throttle: keep a minimum gap since the previous call.
        since_last_call = time.time() - _last_semantic_scholar_call_ts
        if since_last_call < SEMANTIC_SCHOLAR_MIN_INTERVAL_SECONDS:
            time.sleep(SEMANTIC_SCHOLAR_MIN_INTERVAL_SECONDS - since_last_call)

        response = requests.get(
            url,
            headers=_semantic_scholar_headers(),
            params=query,
            timeout=20,
        )
        _last_semantic_scholar_call_ts = time.time()

        rate_limited = response.status_code == 429
        if not rate_limited or attempt >= final_attempt:
            # Either a final answer or the last attempt: surface errors as-is.
            response.raise_for_status()
            return response.json()

        retry_after = response.headers.get("Retry-After")
        if retry_after and retry_after.isdigit():
            backoff = float(retry_after)
        else:
            backoff = (2 ** attempt) + random.uniform(0.2, 0.7)
        time.sleep(backoff)
        attempt += 1

    raise requests.HTTPError("Semantic Scholar request failed after retries")
125
+
126
+
127
def _semantic_scholar_post(path: str, body: Dict[str, Any], params: Optional[Dict[str, Any]] = None) -> Any:
    """Send a throttled POST with a JSON body to the Semantic Scholar API.

    Query parameters whose value is ``None`` are dropped. Throttling and the
    HTTP 429 retry policy mirror the GET helper: at most
    ``SEMANTIC_SCHOLAR_MAX_RETRIES`` attempts, honouring a numeric
    ``Retry-After`` header and otherwise using exponential backoff with jitter.

    Raises:
        requests.HTTPError: for any non-success status that is not retried.
    """
    global _last_semantic_scholar_call_ts

    query: Dict[str, Any] = {}
    for key, value in (params or {}).items():
        if value is not None:
            query[key] = value

    url = f"{SEMANTIC_SCHOLAR_BASE_URL}{path}"
    final_attempt = SEMANTIC_SCHOLAR_MAX_RETRIES - 1

    attempt = 0
    while attempt < SEMANTIC_SCHOLAR_MAX_RETRIES:
        # Client-side throttle: keep a minimum gap since the previous call.
        since_last_call = time.time() - _last_semantic_scholar_call_ts
        if since_last_call < SEMANTIC_SCHOLAR_MIN_INTERVAL_SECONDS:
            time.sleep(SEMANTIC_SCHOLAR_MIN_INTERVAL_SECONDS - since_last_call)

        response = requests.post(
            url,
            headers=_semantic_scholar_headers(),
            params=query,
            json=body,
            timeout=25,
        )
        _last_semantic_scholar_call_ts = time.time()

        rate_limited = response.status_code == 429
        if not rate_limited or attempt >= final_attempt:
            # Either a final answer or the last attempt: surface errors as-is.
            response.raise_for_status()
            return response.json()

        retry_after = response.headers.get("Retry-After")
        if retry_after and retry_after.isdigit():
            backoff = float(retry_after)
        else:
            backoff = (2 ** attempt) + random.uniform(0.2, 0.7)
        time.sleep(backoff)
        attempt += 1

    raise requests.HTTPError("Semantic Scholar request failed after retries")
158
+
159
+
160
def s2_paper_autocomplete(query: str) -> Dict[str, Any]:
    """Call ``/paper/autocomplete`` with the first 100 characters of *query*."""
    truncated_query = query[:100]
    return _semantic_scholar_get("/paper/autocomplete", {"query": truncated_query})
162
+
163
+
164
def s2_paper_batch(ids: List[str], fields: Optional[str] = None) -> Any:
    """POST to ``/paper/batch`` for at most the first 500 entries of *ids*."""
    request_body = {"ids": ids[:500]}
    query_params = {"fields": fields}
    return _semantic_scholar_post("/paper/batch", request_body, query_params)
166
+
167
+
168
def s2_paper_search(
    query: str,
    fields: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
    year: Optional[str] = None,
    fields_of_study: Optional[str] = None,
    open_access_pdf: bool = False,
) -> Dict[str, Any]:
    """Query ``/paper/search``.

    *limit* is clamped to 1..100 and *offset* to a minimum of 0. When
    *open_access_pdf* is true, the ``openAccessPdf`` parameter is sent with an
    empty value.
    """
    clamped_limit = min(max(limit, 1), 100)
    clamped_offset = max(offset, 0)

    params: Dict[str, Any] = dict(
        query=query,
        fields=fields,
        limit=clamped_limit,
        offset=clamped_offset,
        year=year,
        fieldsOfStudy=fields_of_study,
    )
    if open_access_pdf:
        params["openAccessPdf"] = ""
    return _semantic_scholar_get("/paper/search", params)
188
+
189
+
190
def s2_paper_search_bulk(
    query: str,
    fields: Optional[str] = None,
    token: Optional[str] = None,
    sort: Optional[str] = None,
) -> Dict[str, Any]:
    """Query ``/paper/search/bulk``; *token* and *sort* are passed through as given."""
    params: Dict[str, Any] = dict(query=query, fields=fields, token=token, sort=sort)
    return _semantic_scholar_get("/paper/search/bulk", params)
205
+
206
+
207
def s2_paper_search_match(query: str, fields: Optional[str] = None) -> Dict[str, Any]:
    """Query ``/paper/search/match`` with *query* and optional *fields*."""
    params = dict(query=query, fields=fields)
    return _semantic_scholar_get("/paper/search/match", params)
209
+
210
+
211
def s2_paper_details(paper_id: str, fields: Optional[str] = None) -> Dict[str, Any]:
    """Fetch ``/paper/{paper_id}``; the id is URL-quoted before being put in the path."""
    endpoint = f"/paper/{quote_plus(paper_id)}"
    return _semantic_scholar_get(endpoint, {"fields": fields})
214
+
215
+
216
def s2_paper_authors(
    paper_id: str,
    fields: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
) -> Dict[str, Any]:
    """Fetch ``/paper/{paper_id}/authors``; *limit* is clamped to 1..1000, *offset* to >= 0."""
    endpoint = f"/paper/{quote_plus(paper_id)}/authors"
    params: Dict[str, Any] = dict(
        fields=fields,
        limit=min(max(limit, 1), 1000),
        offset=max(offset, 0),
    )
    return _semantic_scholar_get(endpoint, params)
227
+
228
+
229
def s2_paper_citations(
    paper_id: str,
    fields: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
    publication_date_or_year: Optional[str] = None,
) -> Dict[str, Any]:
    """Fetch ``/paper/{paper_id}/citations``.

    *limit* is clamped to 1..1000 and *offset* to >= 0; the optional date
    filter is sent as ``publicationDateOrYear``.
    """
    endpoint = f"/paper/{quote_plus(paper_id)}/citations"
    params: Dict[str, Any] = dict(
        fields=fields,
        limit=min(max(limit, 1), 1000),
        offset=max(offset, 0),
        publicationDateOrYear=publication_date_or_year,
    )
    return _semantic_scholar_get(endpoint, params)
246
+
247
+
248
def s2_paper_references(
    paper_id: str,
    fields: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
) -> Dict[str, Any]:
    """Fetch ``/paper/{paper_id}/references``; *limit* is clamped to 1..1000, *offset* to >= 0."""
    endpoint = f"/paper/{quote_plus(paper_id)}/references"
    params: Dict[str, Any] = dict(
        fields=fields,
        limit=min(max(limit, 1), 1000),
        offset=max(offset, 0),
    )
    return _semantic_scholar_get(endpoint, params)
259
+
260
+
261
def s2_author_batch(ids: List[str], fields: Optional[str] = None) -> Any:
    """POST to ``/author/batch`` for at most the first 1000 entries of *ids*."""
    request_body = {"ids": ids[:1000]}
    query_params = {"fields": fields}
    return _semantic_scholar_post("/author/batch", request_body, query_params)
263
+
264
+
265
def s2_author_search(
    query: str,
    fields: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
) -> Dict[str, Any]:
    """Query ``/author/search``; *limit* is clamped to 1..1000, *offset* to >= 0."""
    params: Dict[str, Any] = dict(
        query=query,
        fields=fields,
        limit=min(max(limit, 1), 1000),
        offset=max(offset, 0),
    )
    return _semantic_scholar_get("/author/search", params)
280
+
281
+
282
def s2_author_details(author_id: str, fields: Optional[str] = None) -> Dict[str, Any]:
    """Fetch ``/author/{author_id}``; the id is URL-quoted before being put in the path."""
    endpoint = f"/author/{quote_plus(author_id)}"
    return _semantic_scholar_get(endpoint, {"fields": fields})
285
+
286
+
287
def s2_author_papers(
    author_id: str,
    fields: Optional[str] = None,
    limit: int = 100,
    offset: int = 0,
    publication_date_or_year: Optional[str] = None,
) -> Dict[str, Any]:
    """Fetch ``/author/{author_id}/papers``.

    *limit* is clamped to 1..1000 and *offset* to >= 0; the optional date
    filter is sent as ``publicationDateOrYear``.
    """
    endpoint = f"/author/{quote_plus(author_id)}/papers"
    params: Dict[str, Any] = dict(
        fields=fields,
        limit=min(max(limit, 1), 1000),
        offset=max(offset, 0),
        publicationDateOrYear=publication_date_or_year,
    )
    return _semantic_scholar_get(endpoint, params)
304
+
305
+
306
def s2_snippet_search(
    query: str,
    fields: Optional[str] = None,
    limit: int = 10,
    year: Optional[str] = None,
    fields_of_study: Optional[str] = None,
) -> Dict[str, Any]:
    """Query ``/snippet/search``; *limit* is clamped to 1..1000.

    *fields_of_study* is sent as the ``fieldsOfStudy`` parameter.
    """
    params: Dict[str, Any] = dict(
        query=query,
        fields=fields,
        limit=min(max(limit, 1), 1000),
        year=year,
        fieldsOfStudy=fields_of_study,
    )
    return _semantic_scholar_get("/snippet/search", params)
323
+
324
+
325
def build_search_query(text: str, max_terms: int = 10) -> str:
    """Reduce *text* to a short, space-separated keyword query.

    The text is lower-cased and split into alphanumeric tokens. Tokens of one
    or two characters and common stopwords are discarded, and the first
    *max_terms* survivors are kept. If filtering removes every token, the
    unfiltered tokens are used instead.
    """
    stopwords = {
        "the", "and", "for", "that", "with", "this", "from", "into", "our", "their",
        "were", "have", "has", "had", "been", "are", "was", "will", "would", "can",
        "could", "should", "about", "through", "using", "based", "than", "then", "also",
        "such", "these", "those", "while", "where", "when", "what", "which", "who",
    }
    tokens = re.findall(r"[A-Za-z0-9]+", text.lower())

    selected = []
    for token in tokens:
        if len(token) <= 2 or token in stopwords:
            continue
        selected.append(token)

    # Fall back to the raw tokens when nothing survives the filter.
    if not selected:
        selected = tokens
    return " ".join(selected[:max_terms])
336
+
337
def search_google_serper(query: str) -> List[Dict]:
    """Search the open web through the Google Serper API.

    Args:
        query: Search string sent as the ``q`` field of the request body.

    Returns:
        Up to three result dicts with the keys ``text`` (result snippet),
        ``url`` and ``source_type``. Any failure (network error, timeout,
        non-success status, malformed answer) is logged and yields ``[]`` -
        the lookup is best-effort by design.
    """
    url = "https://google.serper.dev/search"
    payload = {"q": query}
    headers = {
        'X-API-KEY': SERPER_API_KEY,
        'Content-Type': 'application/json'
    }

    try:
        # Without an explicit timeout, requests waits indefinitely on a
        # stalled connection and would hang the whole analysis.
        response = requests.post(url, headers=headers, json=payload, timeout=20)
        response.raise_for_status()
        data = response.json()

        # Top 3 web results; tolerate a missing or null "organic" list.
        organic = data.get("organic") or []
        return [
            {
                "text": item.get("snippet", ""),
                "url": item.get("link", ""),
                "source_type": "Web (Google)",
            }
            for item in organic[:3]
        ]
    except Exception as e:
        print(f"Serper Error: {e}")
        return []
362
+
363
def search_semantic_scholar(query: str) -> List[Dict]:
    """Search academic sources through the Semantic Scholar API.

    The query is first reduced to keywords. Snippet search runs first (it
    returns passage-level text), followed by a paper search whose abstracts
    serve as a secondary source. The two lookups fail independently: an error
    in one is logged and does not discard the results of the other.

    Args:
        query: Free text to look up.

    Returns:
        A list of dicts with the keys ``text``, ``url`` and ``source_type``.
        Results are cached per normalized query only when both lookups
        succeeded, so a transient failure is retried on the next call.
    """
    prepared_query = build_search_query(query, max_terms=10)
    normalized_query = " ".join(prepared_query.split()).lower()
    if not normalized_query:
        # Nothing searchable in the input; skip the API round-trips.
        return []
    if normalized_query in semantic_query_cache:
        return semantic_query_cache[normalized_query]

    results: List[Dict] = []
    had_error = False

    # Try snippet search first because it returns passage-level text better suited for chunk comparison.
    try:
        snippet_data = s2_snippet_search(
            query=prepared_query,
            fields="snippet.text,snippet.snippetKind",
            limit=3,
        )
        for item in snippet_data.get("data") or []:
            snippet = item.get("snippet") or {}
            paper = item.get("paper") or {}
            snippet_text = snippet.get("text", "")
            if not snippet_text:
                continue
            corpus_id = paper.get("corpusId")
            # NOTE(review): confirm this URL form resolves for a corpus id.
            paper_url = f"https://www.semanticscholar.org/paper/{corpus_id}" if corpus_id else None
            results.append({
                "text": snippet_text,
                "url": paper_url,
                "source_type": "Academic (Semantic Scholar Snippet)",
            })
    except Exception as e:
        had_error = True
        print(f"Semantic Scholar Error: {e}")

    # Keep paper abstract search as fallback/secondary source.
    try:
        data = s2_paper_search(
            query=prepared_query,
            limit=2,
            fields="title,abstract,url",
        )
        for item in data.get("data") or []:
            abstract = item.get("abstract")
            if not abstract:
                # Only keep papers with an abstract; there is no text to compare otherwise.
                continue
            paper_id = item.get("paperId")
            # Build the fallback URL lazily so a missing paperId cannot raise.
            paper_url = item.get("url") or (
                f"https://www.semanticscholar.org/paper/{paper_id}" if paper_id else None
            )
            results.append({
                "text": abstract,
                "url": paper_url,
                "source_type": "Academic (Semantic Scholar)",
            })
    except Exception as e:
        had_error = True
        print(f"Semantic Scholar Error: {e}")

    if not had_error:
        semantic_query_cache[normalized_query] = results
    return results
411
+
412
def aggregate_search(query: str) -> List[Dict]:
    """Return web results followed by academic results for *query*, with caching.

    Only the first 15 whitespace-separated words of *query* are searched, and
    that truncated string is the cache key. On a cache miss both backends are
    queried, the combined list is stored, and the function pauses for one
    second before returning.
    """
    # Use the first 15 words to make the search query efficient
    leading_words = query.split()[:15]
    search_query = " ".join(leading_words)

    try:
        return query_cache[search_query]
    except KeyError:
        pass

    # Run both searches: web first, then academic.
    combined = search_google_serper(search_query) + search_semantic_scholar(search_query)
    query_cache[search_query] = combined

    # Sleep to respect rate limits
    time.sleep(1)
    return combined
431
+
432
+ # ==========================================
433
+ # 4. Core Comparison Logic
434
+ # ==========================================
435
+
436
def calculate_exact_similarity(text1: str, text2: str) -> float:
    """Return the case-insensitive ``SequenceMatcher`` ratio of the two texts (0.0 to 1.0)."""
    matcher = SequenceMatcher(a=text1.lower(), b=text2.lower())
    return matcher.ratio()
438
+
439
def check_paraphrasing_with_llm(chunk: str, source_text: str) -> bool:
    """Ask the LLM whether *chunk* paraphrases or closely mirrors *source_text*.

    Args:
        chunk: Text under examination (TEXT A in the prompt).
        source_text: Candidate source text (TEXT B in the prompt).

    Returns:
        True when the first standalone YES/NO token in the model's answer is
        YES; False for NO or when neither token is present.
    """
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are an expert academic plagiarism detector. Determine if TEXT A is a direct paraphrase, stolen idea, or highly similar structure to TEXT B. Ignore generic academic phrases like 'In this paper we demonstrate'. Respond ONLY with 'YES' or 'NO'."),
        ("user", "TEXT A: {chunk}\n\nTEXT B: {source_text}")
    ])
    chain = prompt | llm
    response = chain.invoke({"chunk": chunk, "source_text": source_text})

    # A plain substring test would read "NO. Yesterday ..." or "NO - eyes ..."
    # as a positive answer. Decide on the first whole-word YES/NO instead.
    verdict = re.search(r"\b(YES|NO)\b", str(response.content).upper())
    return verdict is not None and verdict.group(1) == "YES"
447
+
448
def generate_detailed_report_with_llm(
    filename: str,
    match_reports: List[MatchReport],
    total_words: int,
    overall_score: float
) -> DetailedPlagiarismReport:
    """Generate a comprehensive report using LLM analysis.

    Args:
        filename: Name of the scanned document.
        match_reports: Per-chunk results; only flagged chunks are reported.
        total_words: Word count of the document (passed to the LLM as context).
        overall_score: Overall plagiarism score; severity bands are <15, <30,
            <50 and above.

    Returns:
        A ``DetailedPlagiarismReport``. Key findings and recommendations fall
        back to built-in defaults when none can be extracted from the LLM
        answer.
    """
    from datetime import datetime

    # 1. Aggregate data for analysis
    plagiarized_reports = [r for r in match_reports if r.is_plagiarized]
    plagiarism_types: Dict[str, int] = {}
    # Per category: source URL -> entry. Keying by URL makes the de-duplication
    # real (a URL string never equals one of the stored entry dicts).
    sources_by_url: Dict[str, Dict[Optional[str], Dict[str, Any]]] = {"Academic": {}, "Web": {}}

    for report in plagiarized_reports:
        ptype = report.plagiarism_type or "Unknown"
        plagiarism_types[ptype] = plagiarism_types.get(ptype, 0) + 1

        if not report.source_type:
            continue
        category = "Academic" if "Academic" in report.source_type else "Web"
        bucket = sources_by_url[category]
        entry = bucket.get(report.source_url)
        if entry is None:
            bucket[report.source_url] = {
                "url": report.source_url,
                "type": report.source_type,
                "max_similarity": report.similarity_score,
            }
        elif report.similarity_score > entry["max_similarity"]:
            # Keep the highest similarity seen for a repeated source.
            entry["max_similarity"] = report.similarity_score

    academic_sources = list(sources_by_url["Academic"].values())
    web_sources = list(sources_by_url["Web"].values())

    # 2. Determine severity level
    if overall_score < 15:
        severity = "Low"
        risk_level = "Minimal - Normal citation variations detected"
    elif overall_score < 30:
        severity = "Medium"
        risk_level = "Moderate - Multiple sources match detected"
    elif overall_score < 50:
        severity = "High"
        risk_level = "Significant - Substantial plagiarism detected"
    else:
        severity = "Very High"
        risk_level = "Critical - Extensive plagiarism detected"

    # 3. Use LLM to generate detailed analysis
    suspicious_samples = "\n".join(
        f"- {r.chunk_text[:100]}..." for r in plagiarized_reports[:5]
    )
    plagiarism_context = f"""
Document: {filename}
Total Words: {total_words}
Plagiarism Score: {overall_score}%
Plagiarism Types Found: {plagiarism_types}
Academic Matches: {len(academic_sources)}
Web Matches: {len(web_sources)}

Suspicious Sections (samples):
{suspicious_samples}
"""

    analysis_prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an expert academic integrity analyzer and plagiarism report generator.
Generate a professional, detailed plagiarism analysis report.
Focus on: severity assessment, academic integrity concerns, specific problem areas, and recommendations.
Be thorough but concise."""),
        ("user", """Create a detailed plagiarism analysis for this document:

{plagiarism_context}

Provide:
1. Executive Summary (2-3 sentences)
2. Key Findings (3-4 bullet points)
3. Detailed Analysis (2-3 paragraphs explaining the plagiarism pattern)
4. Recommendations (3-4 specific actions to remediate)

Format clearly with section headers.""")
    ])

    chain = analysis_prompt | llm
    llm_response = chain.invoke({"plagiarism_context": plagiarism_context})
    llm_analysis = llm_response.content

    # 4. Extract findings from LLM response
    bullet_marks = ('-', '*', '•')
    key_findings: List[str] = []
    recommendations: List[str] = []
    summary_lines: List[str] = []
    analysis_lines: List[str] = []
    # One of "other", "summary", "findings", "recommendations".
    section = "other"

    for line in llm_analysis.split('\n'):
        stripped = line.strip()
        if 'Key Findings' in line:
            section = "findings"
        elif 'Recommendations' in line:
            section = "recommendations"
        elif 'Analysis' in line:
            # Also matches the "Detailed Analysis" header.
            section = "other"
        elif section == "findings":
            if stripped.startswith(bullet_marks):
                key_findings.append(stripped.lstrip('-*•').strip())
        elif section == "recommendations":
            if stripped.startswith(bullet_marks):
                recommendations.append(stripped.lstrip('-*•').strip())
        elif stripped:
            analysis_lines.append(line)
            if 'Executive Summary' in line:
                section = "summary"
            elif section == "summary":
                summary_lines.append(stripped)

    detailed_analysis = "".join(f"{line}\n" for line in analysis_lines)

    # Prefer the text under the "Executive Summary" header. The first line of
    # the answer is usually just that header (or blank), so it is only a fallback.
    if summary_lines:
        executive_summary = " ".join(summary_lines)
    else:
        first_text_line = next(
            (l.strip() for l in llm_analysis.split('\n') if l.strip()), ""
        )
        executive_summary = (
            first_text_line
            or f"Document contains {overall_score}% plagiarized content"
        )

    if not key_findings:
        primary_type = (
            max(plagiarism_types, key=plagiarism_types.get)
            if plagiarism_types else 'Not detected'
        )
        key_findings = [
            f"Overall plagiarism score: {overall_score}%",
            f"Primary plagiarism type: {primary_type}",
            f"Multiple sources detected: {len(academic_sources) + len(web_sources)} sources"
        ]

    if not recommendations:
        recommendations = [
            "Properly cite all sources according to your institution's guidelines",
            "Use quotation marks for direct quotes and provide page numbers",
            "Paraphrase content properly and cite original sources",
            "Use plagiarism detection tools during the writing process"
        ]

    # 5. Affected sections
    affected_sections = [
        {
            "section_number": number,
            "text_snippet": report.chunk_text[:150],
            "similarity_score": report.similarity_score,
            "plagiarism_type": report.plagiarism_type,
            "source": report.source_url,
            "source_type": report.source_type,
        }
        for number, report in enumerate(plagiarized_reports[:10], 1)
    ]

    return DetailedPlagiarismReport(
        filename=filename,
        scan_timestamp=datetime.now().isoformat(),
        executive_summary=executive_summary,
        overall_score=round(overall_score, 2),
        severity_level=severity,
        matched_sources=academic_sources + web_sources,
        key_findings=key_findings,
        plagiarism_breakdown={
            "total_plagiarism_percentage": round(overall_score, 2),
            "types": plagiarism_types,
            "academic_sources": len(academic_sources),
            "web_sources": len(web_sources)
        },
        detailed_analysis=detailed_analysis or llm_analysis,
        affected_sections=affected_sections,
        recommendations=recommendations,
        academic_integrity_risk=risk_level
    )
603
+
604
def analyze_chunk(chunk: str) -> MatchReport:
    """Compare *chunk* against its search results and report the outcome.

    Each result is scored with the deterministic similarity ratio. A ratio
    above 0.50 flags an exact/heavy match; a ratio above 0.25 is passed to the
    LLM paraphrase check and, if confirmed, flagged with a score of at least
    0.85. The scan stops at the first flagged result.
    """
    top_score = 0.0
    top_url = None
    top_source_type = None
    match_kind = None

    for candidate in aggregate_search(chunk):
        candidate_text = candidate['text']

        # 1. Math/Deterministic Check
        ratio = calculate_exact_similarity(chunk, candidate_text)
        if ratio > top_score:
            top_score = ratio
            top_url = candidate['url']
            top_source_type = candidate['source_type']

        if ratio <= 0.25:
            continue

        if ratio > 0.50:  # Lowered to 50% because we are comparing against abstracts/snippets
            match_kind = "Exact/Heavy Match"
            break

        # 2. Agentic Check for Mosaic Plagiarism
        if check_paraphrasing_with_llm(chunk, candidate_text):
            match_kind = "Paraphrased Match (Mosaic)"
            top_url = candidate['url']
            top_source_type = candidate['source_type']
            top_score = max(top_score, 0.85)
            break

    return MatchReport(
        chunk_text=chunk,
        is_plagiarized=match_kind is not None,
        plagiarism_type=match_kind,
        source_url=top_url,
        source_type=top_source_type,
        similarity_score=round(top_score, 2)
    )
647
+
648
+ # ==========================================
649
+ # 6. Report Formatting Functions
650
+ # ==========================================
651
+
652
def format_report_json(detailed_report: DetailedPlagiarismReport) -> Dict[str, Any]:
    """Return the report as a JSON-serializable dict.

    Score, severity and risk appear both at the top level and inside the
    nested ``summary`` object.
    """
    score = detailed_report.overall_score
    severity = detailed_report.severity_level
    risk = detailed_report.academic_integrity_risk

    summary = {
        "overall_plagiarism_score": score,
        "severity_level": severity,
        "academic_integrity_risk": risk,
    }

    payload: Dict[str, Any] = {}
    payload["filename"] = detailed_report.filename
    payload["scan_timestamp"] = detailed_report.scan_timestamp
    # Backward-compatible top-level fields expected by existing clients.
    payload["overall_score"] = score
    payload["severity_level"] = severity
    payload["academic_integrity_risk"] = risk
    payload["summary"] = summary
    payload["executive_summary"] = detailed_report.executive_summary
    payload["key_findings"] = detailed_report.key_findings
    payload["plagiarism_breakdown"] = detailed_report.plagiarism_breakdown
    payload["matched_sources"] = detailed_report.matched_sources
    payload["affected_sections"] = detailed_report.affected_sections
    payload["detailed_analysis"] = detailed_report.detailed_analysis
    payload["recommendations"] = detailed_report.recommendations
    return payload
674
+
675
def format_report_text(detailed_report: DetailedPlagiarismReport) -> str:
    """Render the report as plain text with 80-column section rules.

    At most ten matched sources and five affected sections are listed.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80
    breakdown = detailed_report.plagiarism_breakdown

    parts = [
        heavy_rule + "\n",
        "DETAILED PLAGIARISM DETECTION REPORT\n",
        heavy_rule + "\n\n",
        f"FILE: {detailed_report.filename}\n",
        f"SCAN DATE: {detailed_report.scan_timestamp}\n",
        light_rule + "\n\n",
        "SUMMARY\n",
        light_rule + "\n",
        f"Overall Plagiarism Score: {detailed_report.overall_score}%\n",
        f"Severity Level: {detailed_report.severity_level}\n",
        f"Academic Integrity Risk: {detailed_report.academic_integrity_risk}\n\n",
        "EXECUTIVE SUMMARY\n",
        light_rule + "\n",
        f"{detailed_report.executive_summary}\n\n",
        "KEY FINDINGS\n",
        light_rule + "\n",
    ]
    parts.extend(
        f"{i}. {finding}\n"
        for i, finding in enumerate(detailed_report.key_findings, 1)
    )
    parts.append("\n")

    parts.extend([
        "PLAGIARISM BREAKDOWN\n",
        light_rule + "\n",
        f"Total Plagiarism %: {breakdown['total_plagiarism_percentage']}%\n",
        f"Academic Sources: {breakdown['academic_sources']}\n",
        f"Web Sources: {breakdown['web_sources']}\n",
    ])
    if breakdown.get('types'):
        parts.append("Types Detected:\n")
        parts.extend(
            f" - {ptype}: {count} instances\n"
            for ptype, count in breakdown['types'].items()
        )
    parts.append("\n")

    parts.extend(["MATCHED SOURCES\n", light_rule + "\n"])
    if detailed_report.matched_sources:
        for i, source in enumerate(detailed_report.matched_sources[:10], 1):
            parts.append(f"{i}. URL: {source.get('url', 'N/A')}\n")
            parts.append(f" Type: {source.get('type', 'N/A')}\n")
            parts.append(f" Similarity: {source.get('max_similarity', 'N/A')}\n\n")
    else:
        parts.append("No sources matched.\n\n")

    parts.extend([
        "DETAILED ANALYSIS\n",
        light_rule + "\n",
        f"{detailed_report.detailed_analysis}\n\n",
    ])

    if detailed_report.affected_sections:
        parts.extend(["AFFECTED SECTIONS (Top Issues)\n", light_rule + "\n"])
        for section in detailed_report.affected_sections[:5]:
            parts.append(f"\nSection {section['section_number']}:\n")
            parts.append(f"Text Snippet: {section['text_snippet']}\n")
            parts.append(f"Similarity Score: {section['similarity_score']}\n")
            parts.append(f"Plagiarism Type: {section['plagiarism_type']}\n")
            parts.append(f"Source: {section['source']}\n")
        parts.append("\n")

    parts.extend(["RECOMMENDATIONS\n", light_rule + "\n"])
    parts.extend(
        f"{i}. {rec}\n"
        for i, rec in enumerate(detailed_report.recommendations, 1)
    )
    parts.append("\n")

    parts.extend([
        heavy_rule + "\n",
        "End of Report\n",
        heavy_rule + "\n",
    ])
    return "".join(parts)
748
+
749
def format_report_html(detailed_report: DetailedPlagiarismReport) -> str:
    """Render a detailed plagiarism report as a standalone HTML page.

    Every value read from the report is HTML-escaped before it is placed in
    the page. The filename comes from the upload and the remaining text and
    URLs come from the analysis pipeline, so none of it can be trusted to be
    free of markup. Matched-source links are only made clickable when they
    use the ``http`` or ``https`` scheme; anything else links to ``#``.

    Args:
        detailed_report: Report providing ``filename``, ``scan_timestamp``,
            ``overall_score``, ``severity_level``, ``academic_integrity_risk``,
            ``executive_summary``, ``key_findings``, ``plagiarism_breakdown``,
            ``matched_sources``, ``detailed_analysis``, ``affected_sections``
            and ``recommendations``.

    Returns:
        The complete HTML document as a string.

    Raises:
        KeyError: If ``plagiarism_breakdown`` or an affected-section entry is
            missing one of the keys rendered below.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    def esc(value) -> str:
        """Escape any value for HTML text or a double-quoted attribute."""
        return escape(str(value), quote=True)

    def safe_href(url) -> str:
        """Return an escaped link target, allowing only http(s) URLs."""
        candidate = str(url).strip()
        if candidate.lower().startswith(("http://", "https://")):
            return esc(candidate)
        return "#"

    breakdown = detailed_report.plagiarism_breakdown
    severity_class = esc(detailed_report.severity_level.lower().replace(" ", "-"))
    # Escape first, then turn newlines into line breaks, so the only markup
    # in the analysis paragraph is the <br/> tags added here.
    analysis_html = esc(detailed_report.detailed_analysis).replace("\n", "<br/>")

    findings_html = "".join(
        f"<li>{esc(finding)}</li>" for finding in detailed_report.key_findings
    )

    # Only the first 10 matched sources are listed.
    sources_html = "".join(
        '<div class="source-item">'
        f'<strong>{esc(source.get("type", "Unknown"))}</strong><br/>'
        f'<a href="{safe_href(source.get("url", "#"))}" target="_blank" '
        'rel="noopener noreferrer">'
        f'{esc(source.get("url", "N/A"))}</a><br/>'
        f'Similarity: {esc(source.get("max_similarity", "N/A"))}</div>'
        for source in detailed_report.matched_sources[:10]
    )

    # The heading is emitted only when there is at least one affected section;
    # at most the first 5 sections are shown.
    sections_html = ""
    if detailed_report.affected_sections:
        sections_html = "<h2>Affected Sections (Top Issues)</h2>" + "".join(
            '<div class="affected-section">'
            f'<strong>Section {esc(section["section_number"])}</strong><br/>'
            f'<em>Text:</em> {esc(section["text_snippet"])}...<br/>'
            f'<em>Similarity:</em> {esc(section["similarity_score"])}<br/>'
            f'<em>Type:</em> {esc(section["plagiarism_type"])}</div>'
            for section in detailed_report.affected_sections[:5]
        )

    recommendations_html = "".join(
        f'<div class="recommendation"><strong>✓</strong> {esc(rec)}</div>'
        for rec in detailed_report.recommendations
    )

    # Plain (non-f) string so the CSS braces need no doubling.
    stylesheet = """
body { font-family: Arial, sans-serif; margin: 40px; background-color: #f5f5f5; }
.container { background-color: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
h1 { color: #333; border-bottom: 3px solid #2196F3; padding-bottom: 10px; }
h2 { color: #2196F3; margin-top: 30px; }
.summary { background-color: #f0f7ff; padding: 15px; border-left: 4px solid #2196F3; margin: 20px 0; }
.score { font-size: 24px; font-weight: bold; color: #d32f2f; }
.severity-low { color: #4caf50; }
.severity-medium { color: #ff9800; }
.severity-high { color: #f44336; }
.severity-very-high { color: #c41c3b; }
.findings { background-color: #fff3e0; padding: 15px; border-left: 4px solid #ff9800; }
.source-item { background-color: #f5f5f5; padding: 10px; margin: 10px 0; border-radius: 4px; }
.recommendation { background-color: #e8f5e9; padding: 10px; margin: 10px 0; border-left: 3px solid #4caf50; }
table { width: 100%; border-collapse: collapse; margin: 15px 0; }
th, td { padding: 10px; text-align: left; border-bottom: 1px solid #ddd; }
th { background-color: #2196F3; color: white; }
.affected-section { background-color: #fce4ec; padding: 15px; margin: 10px 0; border-radius: 4px; }
"""

    return f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Plagiarism Detection Report - {esc(detailed_report.filename)}</title>
<style>{stylesheet}</style>
</head>
<body>
<div class="container">
<h1>🔍 Plagiarism Detection Report</h1>

<div class="summary">
<p><strong>File:</strong> {esc(detailed_report.filename)}</p>
<p><strong>Scan Date:</strong> {esc(detailed_report.scan_timestamp)}</p>
<p><strong>Overall Plagiarism Score:</strong> <span class="score">{esc(detailed_report.overall_score)}%</span></p>
<p><strong>Severity Level:</strong> <span class="severity-{severity_class}">{esc(detailed_report.severity_level)}</span></p>
<p><strong>Academic Integrity Risk:</strong> {esc(detailed_report.academic_integrity_risk)}</p>
</div>

<h2>Executive Summary</h2>
<p>{esc(detailed_report.executive_summary)}</p>

<h2>Key Findings</h2>
<div class="findings">
<ul>
{findings_html}
</ul>
</div>

<h2>Plagiarism Breakdown</h2>
<table>
<tr>
<th>Category</th>
<th>Value</th>
</tr>
<tr>
<td>Total Plagiarism %</td>
<td>{esc(breakdown['total_plagiarism_percentage'])}%</td>
</tr>
<tr>
<td>Academic Sources</td>
<td>{esc(breakdown['academic_sources'])}</td>
</tr>
<tr>
<td>Web Sources</td>
<td>{esc(breakdown['web_sources'])}</td>
</tr>
</table>

<h2>Matched Sources</h2>
{sources_html}

<h2>Detailed Analysis</h2>
<p>{analysis_html}</p>

{sections_html}

<h2>Recommendations</h2>
<div>
{recommendations_html}
</div>
</div>
</body>
</html>
"""
837
+
838
+ # ==========================================
839
+ # 5. API Endpoints & Utility
840
+ # ==========================================
841
+
842
def extract_text_from_pdf(file_bytes) -> str:
    """Return the text of every page of a PDF, one newline after each page.

    Pages for which the reader yields no text (``None`` or an empty string)
    are skipped entirely, so a PDF with no extractable text gives ``""``.

    Args:
        file_bytes: Whatever ``PdfReader`` accepts as its source. Despite the
            name, the endpoints in this file pass the upload's file object
            (``file.file``) rather than raw bytes.

    Returns:
        The concatenated page texts, each followed by ``"\\n"``.
    """
    reader = PdfReader(file_bytes)
    # Extract each page exactly once; text extraction is the expensive step
    # and the result is needed both for the emptiness check and the output.
    page_texts = (page.extract_text() for page in reader.pages)
    return "".join(f"{page_text}\n" for page_text in page_texts if page_text)
845
+
846
def chunk_text(
    text: str,
    words_per_chunk: int = 40,
    *,
    overlap: int = 10,
    min_words: int = 16,
) -> List[str]:
    """Split text into overlapping, whitespace-normalized word windows.

    Windows start every ``words_per_chunk - overlap`` words and hold up to
    ``words_per_chunk`` words each. Windows shorter than ``min_words`` (such
    as a short tail at the end of the text) are dropped.

    Args:
        text: Source text; it is split on any whitespace.
        words_per_chunk: Maximum number of words per chunk.
        overlap: Number of words shared by consecutive chunks.
        min_words: Minimum number of words a chunk needs to be kept.

    Returns:
        The kept chunks in document order, each with words joined by a
        single space. Empty when the text is too short or when
        ``words_per_chunk`` is non-positive or below ``min_words``.
    """
    # No window can ever reach min_words in these cases, and a non-positive
    # size would make the slice below wrap around via negative indexing.
    if words_per_chunk <= 0 or words_per_chunk < min_words:
        return []

    words = text.split()
    # Clamp the stride so an overlap >= words_per_chunk cannot hand range()
    # a zero (ValueError) or negative (silently empty) step.
    stride = max(1, words_per_chunk - overlap)

    chunks = []
    for start in range(0, len(words), stride):
        window = words[start:start + words_per_chunk]
        if len(window) >= min_words:
            chunks.append(" ".join(window))
    return chunks
854
+
855
# Upper bound on how many chunks of one upload are analyzed. The original code
# described this as a safety cap for testing that should be removed in production.
_MAX_CHUNKS = 20


def _severity_for(score: float) -> str:
    """Map an overall plagiarism percentage to its severity label."""
    if score < 15:
        return "Low"
    if score < 30:
        return "Medium"
    if score < 50:
        return "High"
    return "Very High"


def _scan_upload_blocking(file: UploadFile):
    """Extract, chunk and analyze an uploaded PDF (synchronous).

    Returns:
        ``(match_reports, total_words, plagiarized_words, overall_score)``
        where ``overall_score`` is an unrounded percentage.

    Raises:
        HTTPException: 400 when no text could be extracted from the PDF.
    """
    text = extract_text_from_pdf(file.file)
    total_words = len(text.split())

    if total_words == 0:
        raise HTTPException(status_code=400, detail="Could not extract text. Is this a scanned PDF?")

    chunks = chunk_text(text)[:_MAX_CHUNKS]

    match_reports = []
    plagiarized_words = 0
    for chunk in chunks:
        report = analyze_chunk(chunk)
        match_reports.append(report)
        if report.is_plagiarized:
            plagiarized_words += len(chunk.split())

    # Consecutive chunks overlap, so flagged words can be counted twice;
    # clamp so the score never exceeds 100%.
    # NOTE(review): only the first _MAX_CHUNKS chunks are analyzed, yet the
    # score divides by the word count of the whole document.
    plagiarized_words = min(plagiarized_words, total_words)
    overall_score = (plagiarized_words / total_words) * 100
    return match_reports, total_words, plagiarized_words, overall_score


def _build_detailed_report_blocking(file: UploadFile):
    """Scan an upload and build its detailed report (synchronous)."""
    match_reports, total_words, _, overall_score = _scan_upload_blocking(file)
    return generate_detailed_report_with_llm(
        filename=file.filename,
        match_reports=match_reports,
        total_words=total_words,
        overall_score=overall_score,
    )


async def _offload(func, *args):
    """Run blocking ``func(*args)`` in the worker thread pool and await it.

    PDF parsing and the per-chunk analysis are synchronous calls; running
    them directly inside an ``async def`` handler would stall the event loop
    (and every other request) for the whole scan.
    """
    from fastapi.concurrency import run_in_threadpool

    return await run_in_threadpool(func, *args)


@app.post("/scan-paper", response_model=PlagiarismReport)
async def scan_paper(file: UploadFile = File(...)):
    """Run a basic plagiarism scan on an uploaded PDF.

    Returns a ``PlagiarismReport`` with word counts, the overall score
    rounded to two decimals, a severity label and the per-chunk results.
    Responds with HTTP 400 when no text can be extracted.
    """
    match_reports, total_words, plagiarized_words, overall_score = await _offload(
        _scan_upload_blocking, file
    )

    return PlagiarismReport(
        filename=file.filename,
        total_words=total_words,
        plagiarized_words=plagiarized_words,
        overall_plagiarism_score=round(overall_score, 2),
        severity_level=_severity_for(overall_score),
        details=match_reports,
    )


@app.post("/generate-detailed-report")
async def generate_detailed_report(file: UploadFile = File(...)):
    """Generate comprehensive plagiarism report with LLM analysis"""
    detailed_report = await _offload(_build_detailed_report_blocking, file)
    return format_report_json(detailed_report)


@app.post("/report/text")
async def report_text(file: UploadFile = File(...)):
    """Generate detailed plagiarism report as plain text"""
    from fastapi.responses import PlainTextResponse

    detailed_report = await _offload(_build_detailed_report_blocking, file)
    return PlainTextResponse(format_report_text(detailed_report))


@app.post("/report/html")
async def report_html(file: UploadFile = File(...)):
    """Generate detailed plagiarism report as HTML"""
    from fastapi.responses import HTMLResponse

    detailed_report = await _offload(_build_detailed_report_blocking, file)
    return HTMLResponse(format_report_html(detailed_report))
1014
+
1015
@app.get("/")
async def root():
    """Return a small index naming the API and describing its POST endpoints."""
    endpoint_index = {
        "scan": "/scan-paper (POST - basic scan)",
        "detailed_report": "/generate-detailed-report (POST - JSON report with LLM analysis)",
        "text_report": "/report/text (POST - plain text report)",
        "html_report": "/report/html (POST - HTML report)",
    }
    return {"message": "Pro Plagiarism Detector API", "endpoints": endpoint_index}
1026
+
1027
# Serve the app with uvicorn when this module is executed directly.
if __name__ == "__main__":
    # "0.0.0.0" binds on all IPv4 interfaces.
    server_options = {"host": "0.0.0.0", "port": 8000}
    uvicorn.run(app, **server_options)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ requests
4
+ pydantic
5
+ PyPDF2
6
+ langchain-groq
7
+ langchain-core
8
+ python-multipart