File size: 12,295 Bytes
a561338
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
from fastapi import APIRouter, UploadFile, File, HTTPException, Depends
from typing import List, Tuple, Set
from datetime import datetime
from fastapi.security import OAuth2PasswordBearer
from jose import JWTError, jwt
from motor.motor_asyncio import AsyncIOMotorClient

from app.schemas.teacher_schemas import (
    DocumentInfo, OverlapDetail, ComparisonDetail,
    InternalReportDetail, InternalReportSummary
)
from app.utils.file_utils import extract_text_from_file, allowed_file
from app.utils.lexical_utils import (
    find_partial_phrase_match_for_internal,
    get_meaningful_sentences,
    find_exact_matches,
    find_partial_phrase_match,
)
from app.config import MONGODB_URI,ALGORITHM, SECRET_KEY

router = APIRouter(prefix="/teacher", tags=["teacher-internal"])

# All *_THRESHOLD constants below are ratios in [0, 1]; similarity values used
# elsewhere in this module are percentages, so comparisons multiply by 100.
LEXICAL_PAIR_THRESHOLD = 0.50  # 50% - pairs above this are flagged
OVERLAP_MIN_TOKENS = 12  # minimum word count for a partial-phrase overlap to be recorded

# Add these new thresholds for color coding:
HIGH_SIMILARITY_THRESHOLD = 0.85   # 85% - Red (very high)
MEDIUM_SIMILARITY_THRESHOLD = 0.70  # 70% - Yellow (medium)
LOW_SIMILARITY_THRESHOLD = 0.50  # 50% - low band; NOTE(review): not referenced in this module — confirm intended use
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/token")


def verify_token(token: str = Depends(oauth2_scheme)):
    """Decode and validate the bearer JWT, returning its claims payload.

    Raises:
        HTTPException: 401 when the token is invalid or expired.
    """
    try:
        payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
    except JWTError:
        raise HTTPException(status_code=401, detail="Invalid or expired token")
    return payload


async def get_mongo_client():
    """FastAPI dependency returning a MongoDB client for the request.

    NOTE(review): a brand-new AsyncIOMotorClient (with its own connection
    pool) is created per request and never explicitly closed — consider a
    shared module-level client; confirm against deployment expectations.
    """
    return AsyncIOMotorClient(MONGODB_URI)


def _percent(x: float) -> float:
    return round(float(x) * 100.0, 1)


def _ordered_pair_key(i: int, j: int) -> str:
    a, b = (i, j) if i < j else (j, i)
    return f"{a}-{b}"


def _aggregate_pair_score(overlaps: List[OverlapDetail]) -> float:
    return max((o.similarity for o in overlaps), default=0.0)


def _create_overlap_key(name_a: str, name_b: str, text: str, similarity: float, context: str) -> str:
    """Create unique key for overlap deduplication - includes context to distinguish different match types"""
    # Normalize text to handle whitespace variations
    text_normalized = ' '.join(text.split())
    return f"{name_a}|{name_b}|{text_normalized}|{similarity}|{context}"


def _extract_matched_text_from_sentence(sent_b: str, phrase: str) -> str:
    """Extract the actual text from sent_b that matches the phrase"""
    if not sent_b or not phrase:
        return phrase
    
    # Normalize both for comparison
    phrase_normalized = ' '.join(phrase.split()).lower()
    sent_normalized = ' '.join(sent_b.split()).lower()
    sent_b_normalized = ' '.join(sent_b.split())  # Keep original casing
    
    # If phrase exists in sentence, extract it as-is from original
    if phrase_normalized in sent_normalized:
        start_idx = sent_normalized.find(phrase_normalized)
        end_idx = start_idx + len(phrase_normalized)
        return sent_b_normalized[start_idx:end_idx].strip()
    
    # If not found exactly, try to find similar chunks
    # Split into words and try to find the best match
    phrase_words = phrase_normalized.split()
    sent_words = sent_normalized.split()
    
    # Look for the phrase words in the sentence
    for i in range(len(sent_words) - len(phrase_words) + 1):
        if sent_words[i:i+len(phrase_words)] == phrase_words:
            return ' '.join(sent_b_normalized.split()[i:i+len(phrase_words)])
    
    # Fallback: return the phrase as-is
    return phrase


def _find_overlaps_for_pair(
    name_a: str, sents_a: List[str],
    name_b: str, sents_b: List[str],
    seen_overlaps: Set[str]
) -> List[OverlapDetail]:
    """Find all flagged overlaps between two documents' sentence lists.

    For every sentence of document A this records (a) each exact/near-exact
    match against a sentence of document B, and (b) the single best partial
    phrase match for that sentence, provided the similarity clears
    LEXICAL_PAIR_THRESHOLD and the phrase has at least OVERLAP_MIN_TOKENS
    words.  ``seen_overlaps`` is mutated in place so duplicates are skipped
    across calls.

    Fix: removed a leftover debug ``print`` that fired on every partial
    match inside the O(len_a * len_b) inner loop; hoisted the loop-invariant
    percent-scale threshold.

    Returns the list of OverlapDetail records for this document pair.
    """
    overlaps: List[OverlapDetail] = []
    threshold_pct = LEXICAL_PAIR_THRESHOLD * 100  # thresholds are ratios; similarities are percents

    for sent_a in sents_a:
        # --- Exact / near-exact sentence matches ---
        for sent_b in sents_b:
            exact_score = find_exact_matches(sent_a, sent_b)
            if exact_score is None:
                continue
            sim_pct = _percent(exact_score)
            if sim_pct < threshold_pct:
                continue
            context = "Exact/near-exact sentence overlap"
            overlap_key = _create_overlap_key(name_a, name_b, sent_a, sim_pct, context)
            if overlap_key in seen_overlaps:
                continue
            seen_overlaps.add(overlap_key)
            overlaps.append(OverlapDetail(
                fromDoc=name_a,
                toDoc=name_b,
                text=sent_a,
                similarity=sim_pct,
                sectionA=sent_a,
                sectionB=sent_b,
                context=context,
            ))

        # --- Best partial phrase match for this sentence of A ---
        best_partial = None
        best_score = 0.0
        best_sent_b = None
        for sent_b in sents_b:
            partial_result = find_partial_phrase_match_for_internal(sent_a, sent_b)
            if partial_result:
                phrase, score = partial_result
                if score > best_score:
                    best_score = score
                    best_partial = phrase
                    best_sent_b = sent_b

        # Record the winner only if long enough and above threshold.
        if best_partial and best_sent_b and len(best_partial.split()) >= OVERLAP_MIN_TOKENS:
            sim_pct = _percent(best_score)
            if sim_pct >= threshold_pct:
                context = "High-overlap phrase (shingle/containment)"
                overlap_key = _create_overlap_key(name_a, name_b, best_partial, sim_pct, context)
                if overlap_key not in seen_overlaps:
                    seen_overlaps.add(overlap_key)
                    overlaps.append(OverlapDetail(
                        fromDoc=name_a,
                        toDoc=name_b,
                        text=best_partial,
                        similarity=sim_pct,
                        sectionA=sent_a,
                        sectionB=best_sent_b,
                        context=context,
                    ))

    return overlaps

@router.post("/internal-analysis", response_model=InternalReportDetail)
async def internal_analysis(
    files: List[UploadFile] = File(...),
    token_payload: dict = Depends(verify_token),
    mongo: AsyncIOMotorClient = Depends(get_mongo_client),
):
    """Compare every pair of uploaded documents for lexical overlap.

    Each file is text-extracted and sentence-split, then compared pairwise;
    pairs whose best overlap clears LEXICAL_PAIR_THRESHOLD are kept as
    flagged comparisons.  Per-document stats are derived from those flagged
    comparisons only.  The report is persisted to MongoDB best-effort (a
    storage failure is logged and the report is still returned).

    Raises:
        HTTPException: 400 when fewer than 2 files are sent or a file has a
            disallowed extension; 401 (via verify_token) for a bad token.
    """
    if len(files) < 2:
        raise HTTPException(status_code=400, detail="Upload at least 2 files")

    # NOTE(review): utcnow() returns a naive datetime; consider
    # datetime.now(timezone.utc) — confirm downstream consumers first.
    t0 = datetime.utcnow()

    # --- Load & sentence-split all docs ---
    docs: List[Tuple[str, List[str]]] = []  # (filename, sentences) in upload order
    doc_infos: List[DocumentInfo] = []
    doc_texts = {}  # filename -> full extracted text

    for idx, f in enumerate(files, start=1):
        if not allowed_file(f.filename):
            raise HTTPException(status_code=400, detail=f"Invalid file type: {f.filename}")
        raw = await f.read()
        text = extract_text_from_file(raw, f.filename) or ""
        sents = get_meaningful_sentences(text)
        doc_infos.append(DocumentInfo(id=idx, name=f.filename, author=None))
        docs.append((f.filename, sents))
        # NOTE(review): duplicate filenames overwrite each other here — confirm
        # uploads are guaranteed unique by the client.
        doc_texts[f.filename] = text

    # --- Pairwise comparisons ---
    comparisons: List[ComparisonDetail] = []  # only flagged pairs are retained
    seen_overlaps: Set[str] = set()  # shared across pairs to dedupe overlap records

    for i in range(len(docs)):
        for j in range(i + 1, len(docs)):
            name_a, sents_a = docs[i]
            name_b, sents_b = docs[j]
            
            # Find all overlaps for this pair
            overlaps = _find_overlaps_for_pair(
                name_a, sents_a,
                name_b, sents_b,
                seen_overlaps
            )

            # Calculate pair score and flag if needed.  Overlap similarities
            # are already percent-scale, hence the * 100 on the ratio threshold.
            pair_score = _aggregate_pair_score(overlaps)
            flagged = pair_score >= LEXICAL_PAIR_THRESHOLD * 100
            
            comp = ComparisonDetail(
                id=_ordered_pair_key(i + 1, j + 1),
                docA=name_a,
                docB=name_b,
                similarity=round(pair_score, 1),
                flagged=flagged,
                overlaps=overlaps,
                contentA=doc_texts[name_a],
                contentB=doc_texts[name_b],
            )
            # Unflagged comparisons are dropped entirely; per-document stats
            # below therefore only see overlaps from flagged pairs.
            if flagged:
                comparisons.append(comp)

    # --- Compute per-document results ---
    doc_results = []  # plain dicts consumed by the Mongo document below
    total_matches = 0
    flagged_count = 0

    for d_idx, d in enumerate(doc_infos, start=1):
        name = d.name
        word_count = len(doc_texts[name].split())
        # Every overlap (in a flagged comparison) touching this document.
        matches = [o for c in comparisons for o in c.overlaps if o.fromDoc == name or o.toDoc == name]
        highest_similarity = max((o.similarity for o in matches), default=0.0)
        flagged = highest_similarity >= LEXICAL_PAIR_THRESHOLD * 100
        if flagged:
            flagged_count += 1
        total_matches += len(matches)

        doc_results.append({
            "id": d.id,
            "name": d.name,
            "similarity": round(highest_similarity, 1),
            "flagged": flagged,
            "wordCount": word_count,
            "matchCount": len(matches),
            "matches": matches
        })

    highest_any = max(d['similarity'] for d in doc_results) if doc_results else 0.0
    avg_similarity = round(sum(d['similarity'] for d in doc_results) / len(doc_results), 1) if doc_results else 0.0
    elapsed = (datetime.utcnow() - t0).total_seconds()
    # Human-readable "Xm YYs" processing time.
    processing = f"{int(elapsed // 60)}m {int(elapsed % 60):02d}s"

    report = InternalReportDetail(
        id="internal_report",
        name="Internal Plagiarism Check",
        uploadDate=datetime.utcnow(),
        processingTime=processing,
        documents=doc_infos,
        comparisons=comparisons,
        summary=InternalReportSummary(
            totalDocuments=len(doc_results),
            totalComparisons=(len(docs) * (len(docs) - 1)) // 2,
            flaggedComparisons=flagged_count,
            highestSimilarity=round(highest_any, 1),
            averageSimilarity=avg_similarity,
        ),
    )

    # --- Save to MongoDB (best effort: failures are logged, not raised) ---
    try:
        db = mongo.sluethink
        reports_collection = db.reports

        # Union of all "target" documents that appear in any overlap.
        all_sources = set()
        for comp in comparisons:
            for o in comp.overlaps:
                all_sources.add(o.toDoc)

        report_doc = {
            "name": f"Internal_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}",
            "analysisType": "internal",
            "submittedBy": token_payload.get("name", "System"),
            "uploadDate": datetime.utcnow().strftime("%Y-%m-%d"),
            "similarity": highest_any,
            "status": "completed",
            "flagged": flagged_count > 0,
            "fileCount": len(doc_results),
            "processingTime": processing,
            "avgSimilarity": avg_similarity,
            "totalMatches": total_matches,
            "sources": list(all_sources),
            "createdAt": datetime.utcnow(),
            "userId": token_payload.get("sub") or token_payload.get("user_id"),
            "documents": [
                {
                    "id": d['id'],
                    "name": d['name'],
                    "similarity": d['similarity'],
                    "flagged": d['flagged'],
                    "wordCount": d['wordCount'],
                    "matchCount": d['matchCount'],
                    "matches": [
                        {
                            "matched_text": m.text,
                            "similarity": m.similarity,
                            "source_url": m.toDoc,
                            "source_title": m.toDoc,
                            "source_type": "internal",
                        } for m in d['matches']
                    ]
                } for d in doc_results
            ],
            "summary": {
                "totalDocuments": len(doc_results),
                "flaggedDocuments": flagged_count,
                "highestSimilarity": highest_any,
                "averageSimilarity": avg_similarity,
            "totalMatches": total_matches,
            }
        }

        insert_result = await reports_collection.insert_one(report_doc)
        # NOTE(review): the string below is mojibake (UTF-8 emoji read with the
        # wrong codec, likely meant 💾); kept byte-identical here. Prefer logging.
        print(f"๐Ÿ’พ Report saved to MongoDB with ID: {insert_result.inserted_id}")
        # Overwrite the placeholder id with the real Mongo ObjectId string.
        report.id = str(insert_result.inserted_id)

    except Exception as e:
        # NOTE(review): mojibake string (likely meant ❌); kept byte-identical.
        print(f"โŒ Error saving to MongoDB: {str(e)}")

    return report