bendahmane-ustomb commited on
Commit
e1723b2
·
verified ·
1 Parent(s): 8ad6af9

Update plagitp.py

Browse files
Files changed (1) hide show
  1. plagitp.py +2 -538
plagitp.py CHANGED
@@ -1,538 +1,2 @@
1
- import base64
2
- from time import time
3
- from datetime import datetime
4
- import logging
5
- from tqdm import tqdm
6
- import numpy as np
7
- from copydetect.utils import (filter_code, highlight_overlap, get_copied_slices,
8
- get_document_fingerprints, find_fingerprint_overlap,
9
- get_token_coverage)
10
- from copydetect import defaults
11
- from dataclasses import dataclass, field
12
- from typing import Optional, List, Dict, ClassVar
13
- import re
14
-
15
-
16
@dataclass
class CopydetectConfig:
    """Configuration for a plagiarism-detection run.

    Mirrors copydetect's options; `short_names` maps the long CLI-style
    option names onto the attribute names used here.
    """

    test_dirs: List[str] = field(default_factory=list)
    ref_dirs: Optional[List[str]] = field(default_factory=list)
    boilerplate_dirs: Optional[List[str]] = field(default_factory=list)
    noise_t: int = defaults.NOISE_THRESHOLD
    guarantee_t: int = defaults.GUARANTEE_THRESHOLD
    display_t: float = defaults.DISPLAY_THRESHOLD
    disable_filtering: bool = False
    force_language: Optional[str] = None
    truncate: bool = False
    silent: bool = False
    encoding: str = "utf-8"

    # Derived winnowing window size. Computed in __post_init__ from the
    # instance's own thresholds. (The previous version used
    # `field(..., default=guarantee_t - noise_t + 1)`, which is evaluated
    # ONCE at class-definition time from the module defaults, so instances
    # built with custom thresholds got a stale window until callers patched
    # window_size by hand.)
    window_size: int = field(init=False)

    short_names: ClassVar[Dict[str, str]] = {
        "noise_threshold": "noise_t",
        "guarantee_threshold": "guarantee_t",
        "display_threshold": "display_t",
        "test_directories": "test_dirs",
        "reference_directories": "ref_dirs",
        "boilerplate_directories": "boilerplate_dirs",
    }

    def __post_init__(self):
        # Keep the window consistent with the thresholds actually supplied.
        self.window_size = self.guarantee_t - self.noise_t + 1

    def _check_arguments(self):
        """Validate configuration types and values.

        Float-valued thresholds that are whole numbers are coerced to int.

        Raises:
            TypeError: when an argument has the wrong type.
            ValueError: when guarantee_t < noise_t, or display_t is
                outside [0, 1].
        """
        if not isinstance(self.test_dirs, list):
            raise TypeError("Test directories must be a list")
        if not isinstance(self.ref_dirs, list):
            raise TypeError("Reference directories must be a list")
        if not isinstance(self.boilerplate_dirs, list):
            raise TypeError("Boilerplate directories must be a list")
        if not isinstance(self.disable_filtering, bool):
            raise TypeError("disable_filtering must be true or false")
        if self.force_language is not None:
            if not isinstance(self.force_language, str):
                raise TypeError("force_language must be a string")
        if not isinstance(self.truncate, bool):
            raise TypeError("truncate must be true or false")
        if not isinstance(self.noise_t, int):
            # Accept e.g. 25.0 but reject 25.5.
            if int(self.noise_t) == self.noise_t:
                self.noise_t = int(self.noise_t)
                self.window_size = int(self.window_size)
            else:
                raise TypeError("Noise threshold must be an integer")
        if not isinstance(self.guarantee_t, int):
            if int(self.guarantee_t) == self.guarantee_t:
                self.guarantee_t = int(self.guarantee_t)
                self.window_size = int(self.window_size)
            else:
                raise TypeError("Guarantee threshold must be an integer")

        # value checking
        if self.guarantee_t < self.noise_t:
            raise ValueError(
                "Guarantee threshold must be greater than or "
                "equal to noise threshold"
            )
        if self.display_t > 1 or self.display_t < 0:
            raise ValueError("Display threshold must be between 0 and 1")
75
-
76
-
77
class CodeFingerprint:
    """Winnowing fingerprint of one piece of source code.

    NOTE(review): unlike upstream copydetect, the `file` argument is the
    code itself (a str, or bytes when encoding == "DETECT"), not a path —
    no file is opened here.
    """

    def __init__(self, file, k, win_size, boilerplate=None, filter=True, encoding: str = "utf-8", force_language="python"):
        # k: noise threshold (minimum matchable run length);
        # win_size: winnowing window size;
        # boilerplate: hashes to exclude from the fingerprint.
        if boilerplate is None:
            boilerplate = []

        if encoding == "DETECT":
            # `file` is raw bytes; sniff the encoding with chardet (optional
            # dependency), falling back to the platform default decode.
            try:
                import chardet
                code = file
                detected_encoding = chardet.detect(code)["encoding"]
                if detected_encoding is not None:
                    code = code.decode(detected_encoding)
                else:
                    code = code.decode()
            except ModuleNotFoundError as e:
                logging.error(
                    "encoding detection requires chardet to be installed")
                raise e
        else:
            code = file

        if filter:
            if force_language == "python":
                # Canonicalize import style first so trivially renamed
                # imports don't defeat the comparison.
                code = self.modify_code(code)
            filtered_code, offsets = filter_code(code, None, force_language)
        else:
            filtered_code, offsets = code, np.array([])

        hashes, idx = get_document_fingerprints(
            filtered_code, k, win_size, boilerplate)

        self.raw_code = code                # code after modify_code, before token filtering
        self.filtered_code = filtered_code  # tokenized/filtered text that was hashed
        self.offsets = offsets              # filtered->raw position offset table
        self.hashes = hashes                # winnowed fingerprint hashes
        self.hash_idx = idx                 # positions of the selected fingerprints
        self.k = k
        self.token_coverage = get_token_coverage(idx, k, len(filtered_code))

    def modify_code(self, code):
        """Canonicalize Python import usage so renamed/re-styled imports
        fingerprint identically.

        Transformations, in order:
          1. `from mod import a, b` -> `import mod`, and bare uses of `a`
             are rewritten to `mod_a`.
          2. `import mod as alias` -> `import mod`, and `alias.` -> `mod_`.
          3. `import mod` uses: `mod.` -> `mod_`.
          4. All double quotes become single quotes.
        """
        # Replace "from mod_name import el1, el2, el3, ..." with "import mod_name"
        # Collect all unique elements
        from_statements = re.findall(
            r'\bfrom\s+(\w+(?:\.\w+)*)\s+import\s+((?:\w+\s*,\s*)*\w+)\b', code)
        unique_elements = set()
        for mod_name, elements_str in from_statements:
            # NOTE(review): mod_name may contain '.', which is a regex
            # metachar and is interpolated unescaped below — confirm dotted
            # modules behave as intended.
            code = re.sub(
                rf'\bfrom\s+{mod_name}\s+import\s+((?:\w+\s*,\s*)*\w+)\b', f'import {mod_name}', code)
            elements = [e.strip() for e in elements_str.split(',')]
            unique_elements.update((mod_name, element) for element in elements)

        # Perform replacements
        for mod_name, element in unique_elements:
            replacement = f'{mod_name}_{element}'
            # (?<!\.) avoids rewriting attribute accesses like obj.element.
            code = re.sub(
                rf'(?<!\.)\b{re.escape(element)}\b', replacement, code)

        # Find and store import statements with aliases
        # Replace short_alias. with module_name_
        import_statements = re.findall(
            r'\bimport\s+(\w+(?:\.\w+)*)\s+as\s+(\w+)', code)
        for mod_name, short_alias in import_statements:
            replacement = rf'{mod_name}_'
            code = re.sub(rf'\b{short_alias}\.', replacement, code)
            code = re.sub(
                rf'\bimport\s+{mod_name}\s+as\s+{short_alias}\b', f'import {mod_name}', code)

        # Find and store import statements without aliases
        # Replace module_name. with module_name_
        import_statements = re.findall(r'\bimport\s+(\w+(?:\.\w+)*)\s', code)
        for mod_name in import_statements:
            replacement = rf'{mod_name}_'
            code = re.sub(rf'\b{mod_name}\.', replacement, code)

        # Normalize string-quote style.
        code = code.replace('"', "'")

        return code
154
-
155
-
156
class CopyDetector:
    """Pairwise plagiarism detector over in-memory code strings.

    NOTE(review): despite the *_dirs / *_files naming (kept from upstream
    copydetect), the "directories" here are lists of code STRINGS; this
    class never touches the filesystem.
    """

    def __init__(self, test_dirs=None, ref_dirs=None,
                 boilerplate_dirs=None,
                 noise_t=defaults.NOISE_THRESHOLD,
                 guarantee_t=defaults.GUARANTEE_THRESHOLD,
                 display_t=defaults.DISPLAY_THRESHOLD,
                 disable_filtering=False, force_language="python",
                 truncate=False, silent=False,
                 encoding: str = "utf-8"):
        # Forward every constructor argument (minus self and unset Nones)
        # into the config dataclass.
        conf_args = locals()
        conf_args = {
            key: val
            for key, val in conf_args.items()
            if key != "self" and val is not None
        }
        self.conf = CopydetectConfig(**conf_args)
        self.conf.noise_t = noise_t
        # Recompute the winnowing window from the thresholds actually
        # supplied (the dataclass default is derived from module defaults
        # at class-definition time, not per instance).
        self.conf.window_size = guarantee_t-noise_t+1

        self.test_files = self.conf.test_dirs
        self.ref_files = self.conf.ref_dirs
        self.boilerplate_files = self.conf.boilerplate_dirs

        # similarity_matrix[i, j] = (sim_of_test_file, sim_of_ref_file);
        # -1 marks "not compared".
        self.similarity_matrix = np.array([])
        self.token_overlap_matrix = np.array([])
        self.slice_matrix = {}
        self.file_data = {}

    def _get_boilerplate_hashes(self):
        """Fingerprint every boilerplate snippet and return the unique hashes
        (to be excluded from test/reference fingerprints)."""
        boilerplate_hashes = []
        for file in self.boilerplate_files:
            try:
                fingerprint = CodeFingerprint(
                    file,
                    k=self.conf.noise_t,
                    win_size=1,  # ?? self.conf.window_size
                    filter=not self.conf.disable_filtering,
                    encoding=self.conf.encoding,
                    force_language=self.conf.force_language
                )
                boilerplate_hashes.extend(fingerprint.hashes)
            except UnicodeDecodeError:
                logging.warning(f"Skipping {file}: file not UTF-8 text")
                continue

        return np.unique(np.array(boilerplate_hashes))

    def _preprocess_code(self, file_list):
        """Fingerprint every entry of file_list into self.file_data keyed by
        sequential integer id.

        NOTE(review): fid advances only on success, but run() indexes
        file_data by list position — a skipped (UnicodeDecodeError) entry
        would shift all later ids. Confirm inputs are always decodable.
        """
        boilerplate_hashes = self._get_boilerplate_hashes()
        fid = 0
        for code_f in file_list:
            try:
                self.file_data[fid] = CodeFingerprint(
                    code_f, self.conf.noise_t, self.conf.window_size,
                    boilerplate_hashes, not self.conf.disable_filtering,
                    encoding=self.conf.encoding, force_language=self.conf.force_language)
            except UnicodeDecodeError:
                logging.warning(f"Skipping {code_f}: file not UTF-8 text")
                continue
            fid += 1

    def compare_files(self, file1_data, file2_data):
        """Compare two CodeFingerprint objects.

        Returns:
            (token_overlap, (similarity1, similarity2), (slices1, slices2))
            where slices are copied regions mapped back to raw-code offsets.
        """
        if file1_data.k != file2_data.k:
            raise ValueError(
                "Code fingerprints must use the same noise threshold")
        idx1, idx2 = find_fingerprint_overlap(
            file1_data.hashes, file2_data.hashes,
            file1_data.hash_idx, file2_data.hash_idx)
        slices1 = get_copied_slices(idx1, file1_data.k)
        slices2 = get_copied_slices(idx2, file2_data.k)
        # No overlapping fingerprints at all -> zero similarity.
        if len(slices1[0]) == 0:
            return 0, (0, 0), (np.array([]), np.array([]))

        token_overlap1 = np.sum(slices1[1] - slices1[0])
        token_overlap2 = np.sum(slices2[1] - slices2[0])

        # Similarity = overlapping tokens / fingerprinted tokens, guarding
        # against empty filtered code.
        if len(file1_data.filtered_code) > 0:
            similarity1 = token_overlap1 / file1_data.token_coverage
        else:
            similarity1 = 0
        if len(file2_data.filtered_code) > 0:
            similarity2 = token_overlap2 / file2_data.token_coverage
        else:
            similarity2 = 0

        # Translate slice positions from filtered-code space back into
        # raw-code space using each file's offset table.
        if len(file1_data.offsets) > 0:
            slices1 += file1_data.offsets[:, 1][np.clip(
                np.searchsorted(file1_data.offsets[:, 0], slices1),
                0, file1_data.offsets.shape[0] - 1)]
        if len(file2_data.offsets) > 0:
            slices2 += file2_data.offsets[:, 1][np.clip(
                np.searchsorted(file2_data.offsets[:, 0], slices2),
                0, file2_data.offsets.shape[0] - 1)]

        return token_overlap1, (similarity1, similarity2), (slices1, slices2)

    def run(self):
        """Fingerprint all inputs, then compare every test entry against
        every reference entry, filling similarity/overlap/slice matrices."""
        start_time = time()
        if not self.conf.silent:
            print("  0.00: Generating file fingerprints")
        # test entries occupy file_data[0..len(test)-1], references follow.
        self._preprocess_code(self.test_files + self.ref_files)

        self.similarity_matrix = np.full(
            (len(self.test_files), len(self.ref_files), 2),
            -1,
            dtype=np.float64,
        )
        self.token_overlap_matrix = np.full(
            (len(self.test_files), len(self.ref_files)), -1
        )
        self.slice_matrix = {}

        if not self.conf.silent:
            print(f"{time()-start_time:6.2f}: Beginning code comparison")

        # NOTE(review): `comparisons` is written but never read in this class.
        comparisons = {}

        for i, test_f in enumerate(
                tqdm(self.test_files,
                     bar_format='   {l_bar}{bar}{r_bar}',
                     disable=self.conf.silent)
        ):
            for j, ref_f in enumerate(self.ref_files):
                overlap, (sim1, sim2), (slices1, slices2) = self.compare_files(
                    self.file_data[i], self.file_data[j+len(self.test_files)]
                )
                comparisons[(i, j)] = (i, j)
                if slices1.shape[0] != 0:
                    self.slice_matrix[(i, j)] = [slices1, slices2]

                self.similarity_matrix[i, j] = np.array([sim1, sim2])
                self.token_overlap_matrix[i, j] = overlap

        if not self.conf.silent:
            print(f"{time()-start_time:6.2f}: Code comparison completed")

    def get_copied_code_list(self):
        """Return the pairs above the display threshold as
        [test_sim, ref_sim, test_id, ref_id, highlighted_html_1,
        highlighted_html_2, token_overlap], sorted by descending test
        similarity. Must be called after run()."""
        if len(self.similarity_matrix) == 0:
            logging.error("Cannot generate code list: no files compared")
            return []
        x, y = np.where(self.similarity_matrix[:, :, 0] > self.conf.display_t)

        code_list = []
        file_pairs = set()
        for idx in range(len(x)):
            test_f = x[idx]
            ref_f = y[idx]
            if (ref_f, test_f) in file_pairs:
                # if comparison is already in report, don't add it again
                continue
            file_pairs.add((test_f, ref_f))

            test_sim = self.similarity_matrix[x[idx], y[idx], 0]
            ref_sim = self.similarity_matrix[x[idx], y[idx], 1]
            # Slices may be stored under either orientation of the pair.
            if (test_f, ref_f) in self.slice_matrix:
                slices_test = self.slice_matrix[(test_f, ref_f)][0]
                slices_ref = self.slice_matrix[(test_f, ref_f)][1]
            else:
                slices_test = self.slice_matrix[(ref_f, test_f)][1]
                slices_ref = self.slice_matrix[(ref_f, test_f)][0]

            # Optionally truncate unhighlighted context to 10 lines.
            if self.conf.truncate:
                truncate = 10
            else:
                truncate = -1

            hl_code_1, _ = highlight_overlap(
                self.file_data[test_f].raw_code, slices_test,
                "<font color='red'>", "</font>",
                truncate=truncate, escape_html=True)
            hl_code_2, _ = highlight_overlap(
                self.file_data[ref_f+len(self.test_files)
                               ].raw_code, slices_ref,
                "<font color='green'>", "</font>",
                truncate=truncate, escape_html=True)
            overlap = self.token_overlap_matrix[x[idx], y[idx]]

            code_list.append([test_sim, ref_sim, test_f, ref_f,
                              hl_code_1, hl_code_2, overlap])

        code_list.sort(key=lambda x: -x[0])
        return code_list
338
-
339
-
340
def infos_title(report_title):
    """Extract the two student names and the generation timestamp from a
    report-title HTML fragment.

    Returns:
        (student1, student2, datetime_str); any field that cannot be found
        in `report_title` comes back as "".
    """
    name_pattern = re.compile(r"<b>Student\d:</b>\s*(.*?)\s*\<b>email:</b>")
    stamp_pattern = re.compile(
        r"<b>Report generated at:</b> (\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})")

    names = name_pattern.findall(report_title)
    stamp_match = stamp_pattern.search(report_title)

    student1 = names[0] if len(names) > 0 else ""
    student2 = names[1] if len(names) > 1 else ""
    generated_at = stamp_match.group(1) if stamp_match else ""

    return student1, student2, generated_at
360
-
361
-
362
def get_notebook_infos(notebook, add_id=False):
    """Collect the concatenated code and markdown sources of a notebook.

    Args:
        notebook: parsed notebook exposing `.cells`; each cell has a
            `cell_type` attribute and supports `cell["source"]` item
            access. Code cells additionally expose
            `cell["execution_count"]`.
        add_id: unused; kept for backward compatibility with callers that
            still pass it.

    Returns:
        ([codes], [markdowns], errors): single-element lists holding the
        concatenation of all non-empty code / markdown cell sources, and
        a bool that is True when any non-empty code cell has a falsy
        execution count (i.e. was never executed successfully).
    """
    # Collect parts and join once instead of repeated string concatenation.
    code_parts = []
    markdown_parts = []
    errors = False

    for cell in notebook.cells:
        if cell.cell_type == 'code':
            text = cell["source"]
            if len(text) > 0:
                code_parts.append(text)
                if not cell["execution_count"]:
                    errors = True
        elif cell.cell_type == 'markdown':
            text = cell["source"]
            if len(text) > 0:
                markdown_parts.append(text)

    return ["".join(code_parts)], ["".join(markdown_parts)], errors
394
-
395
-
396
def compare_notebook(notebook1, notebook2, boiler=None, boiler_m=None, noise_t=5, guarantee_t=9):
    """Compare two notebooks and return similarity scores per section.

    Code cells and markdown cells are compared separately with
    CopyDetector; each score is the pessimistic min over the two
    directional similarities, maximised over reference snippets.

    Args:
        notebook1: the notebook under test.
        notebook2: the reference notebook.
        boiler: boilerplate code snippets to ignore (default: none).
        boiler_m: boilerplate markdown snippets to ignore (default: none).
        noise_t, guarantee_t: copydetect winnowing thresholds.

    Returns:
        (similarity, errors) where similarity maps {0: code_sim,
        1: markdown_sim} and errors reports unexecuted code cells in
        notebook1.
    """
    # Avoid mutable default arguments: a shared [] default would leak
    # state across calls if it were ever mutated.
    boiler = [] if boiler is None else boiler
    boiler_m = [] if boiler_m is None else boiler_m

    codes_n1, markdowns_n1, errors_n1 = get_notebook_infos(
        notebook1, add_id=True)
    codes_n2, markdowns_n2, errors_n2 = get_notebook_infos(notebook2)

    test_dirs = codes_n1
    ref_dirs = codes_n2
    codes_sim = []

    if len(test_dirs) > 0 and len(ref_dirs) > 0:
        detector = CopyDetector(
            test_dirs=test_dirs, boilerplate_dirs=boiler, ref_dirs=ref_dirs,
            force_language="python", noise_t=noise_t,
            guarantee_t=guarantee_t, display_t=0.5, silent=True)
        detector.run()
        # min over the two directions, then the best match across refs.
        sm = detector.similarity_matrix.min(axis=2)
        codes_sim = sm.max(axis=1)

    test_dirs = markdowns_n1
    ref_dirs = markdowns_n2
    texts_sim = []

    if len(test_dirs) > 0 and len(ref_dirs) > 0:
        detector_m = CopyDetector(
            test_dirs=test_dirs, boilerplate_dirs=boiler_m, ref_dirs=ref_dirs,
            noise_t=noise_t, guarantee_t=guarantee_t, display_t=0.5,
            silent=True, disable_filtering=True)
        detector_m.run()
        sm_m = detector_m.similarity_matrix.min(axis=2)
        texts_sim = sm_m.max(axis=1)

    # Keys 0/1 stand for the single concatenated code and markdown blobs
    # produced by get_notebook_infos.
    lc = list(codes_sim) + list(texts_sim)
    similarity = dict(zip([0, 1], lc))
    return similarity, errors_n1
429
-
430
-
431
def analyse_notebook(notebook, notebooks_ref, exceptkeys, ignore_code=None, ingnore_text=None):
    """Compare one notebook against a set of reference notebooks.

    Args:
        notebook: the notebook under test.
        notebooks_ref: mapping of student id -> {'report': notebook, ...}.
        exceptkeys: student ids to skip (e.g. the author's own id).
        ignore_code: boilerplate code snippets to ignore (default: none).
        ingnore_text: boilerplate markdown to ignore (default: none).
            NOTE: the misspelled name is kept for backward compatibility
            with existing keyword callers.

    Returns:
        (plagiarism, copiedfrom, err): per-section max similarity, the
        student id that produced each max, and the error flag from the
        LAST comparison performed.
    """
    # Avoid mutable default arguments (shared across calls).
    ignore_code = [] if ignore_code is None else ignore_code
    ingnore_text = [] if ingnore_text is None else ingnore_text

    plagiarism = {}
    copiedfrom = {}
    err = False
    for suid, n_ref in notebooks_ref.items():
        if suid not in exceptkeys:
            sim, err = compare_notebook(
                notebook, n_ref['report'], boiler=ignore_code, boiler_m=ingnore_text)
            for k in sim:
                # Keep the highest similarity seen for each section, and
                # remember which reference produced it.
                if sim[k] >= plagiarism.get(k, 0):
                    plagiarism[k] = sim[k]
                    copiedfrom[k] = suid

    return plagiarism, copiedfrom, err
446
-
447
-
448
# Custom CSS injected into the web UI (Gradio-style class selectors):
# column widths for .student/.button, forced syntax-highlight colors inside
# .htm report panes, and a visible styled scrollbar on .test .scroll-hide.
custom_css = """
.student{
max-width: 100px !important;
}
.button{
max-width: 350px !important;
}
.htm span .dd, .n, .nn, .nb, .p, .bp{
color: black !important;
}
.htm .highlight pre{
color: black !important;
}
.htm span .dd, .fm, .nc, .nf{
color: blue !important;
}
.htm span .dd, .nd{
color: magenta !important;
}

.test .scroll-hide::-webkit-scrollbar {
display: initial !important;
width: 12px !important;
background-color: #ddd !important;
}

.test .scroll-hide::-webkit-scrollbar-thumb {
background-color: #6366f1 !important;
}

.test .scroll-hide::-webkit-scrollbar-thumb:hover {
background-color: #6366f199 !important;
cursor: pointer;
}

"""
484
-
485
-
486
def plagia_error(rate, students, desc=""):
    """Render an HTML similarity warning, colour-coded by severity.

    >= 80% is red, >= 50% orange, anything lower green.
    """
    if rate >= 80:
        color = "red"
    elif rate >= 50:
        color = "orange"
    else:
        color = "green"

    return f"<div style='color: {color}; font-size: 12px;'>{desc}: max similarity rate {rate}%, id:{students}</div>"
491
-
492
-
493
def user_html(email, photoUrl, expiresAt):
    """Render the logged-in user header: avatar, email, session expiry and
    a logout button.

    Args:
        email: user email address shown next to the avatar.
        photoUrl: URL of the 64x64 avatar image.
        expiresAt: session expiry as a Unix timestamp (seconds); shown in
            the server's local timezone via datetime.fromtimestamp.
    """
    expireTime = datetime.fromtimestamp(expiresAt)
    # NOTE(review): the outer <div> is never closed here — presumably the
    # browser/UI framework tolerates it; confirm before restructuring.
    str_show = f"""
<div>
<div style='width:64; float:left; margin:8px;'>
<img src="{photoUrl}" width="64" height="64"/>
</div>
<div style=' float:left; padding:8px;'>
{email}<br/>
<span id='expire'>Session expires at: {expireTime}</span> &nbsp;&nbsp;
<br/>
<a href="/logout">
<button
style='border-radius: 8px; font-weight: bold; background-color: #e0e7ff; color: #6366f1; border: none; padding: 5px 10px;'>
Logout
</button></a>
</div>"""
    return str_show
511
-
512
-
513
def reports_html(reports):
    """Render the submitted-reports table as an HTML string.

    Args:
        reports: sequence of dicts; each must provide 'students' and
            'date', and may provide 'down' (download-link HTML appended to
            the date cell) and 'grade' (defaults to "Not graded").
    """
    table = """<style>
table {width: 100%;border-collapse: collapse;margin-top: 20px;}
th, td {border: 1px solid #ddd;padding: 8px;text-align: left;}
th {background-color: #AAA; padding: 8px}
</style>
<table>
<thead><tr><th>Students</th><th>Report Uploaded At</th><th>Grading</th></tr></thead>
<tbody>
"""
    for i, rep in enumerate(reports):
        # Rows are numbered in reverse so the first row carries the
        # highest (most recent) number.
        table += f"<tr><td>{len(reports)-i}. {rep['students']}</td><td>{rep['date']}{rep.get('down','')}</td><td>{rep.get('grade','Not graded')}</td></tr>"
    table += """</tbody>
</table>"""
    return table
528
-
529
-
530
def down_html(file_str, file_name):
    """Build an HTML download button whose href is a base64 data URI.

    The file content is embedded directly in the link, so the download
    needs no server round-trip.

    Args:
        file_str: text content to offer for download (UTF-8 encoded).
        file_name: filename suggested to the browser.
    """
    file_encoded = base64.b64encode(file_str.encode('utf-8')).decode('utf-8')
    download_button = f"""<a href="data:application/octet-stream;base64,{file_encoded}"
download="{file_name}">
<button style="border-radius: 8px; font-weight: bold;
background-color: #e0e7ff; color: #6366f1; border: none; padding: 5px 10px;" >
download</button>
</a>"""
    return download_button
 
1
import os
# SECURITY(review): this commit replaces the entire module with code that
# executes whatever string is stored in the "plagi" environment variable.
# exec() of externally controlled input is an arbitrary-code-execution
# backdoor pattern and must not be merged/deployed. Also note that if the
# variable is unset, os.getenv returns None and exec(None) raises TypeError.
exec(os.getenv("plagi"))