File size: 34,987 Bytes
5524e77
c18e35f
5524e77
 
66b1f5b
 
b335dbb
b00b7bf
698e9e5
 
 
7ee1568
bb4c1e0
 
e0e242c
c18e35f
28e23fd
0bd867c
 
 
28e23fd
bb4c1e0
0bd867c
 
 
28e23fd
0bd867c
28e23fd
bb4c1e0
 
28e23fd
bb4c1e0
28e23fd
b335dbb
b00b7bf
b335dbb
28e23fd
698e9e5
 
 
 
 
 
 
 
 
 
b96d100
 
 
 
698e9e5
 
28e23fd
66b1f5b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c18e35f
66b1f5b
c18e35f
28e23fd
0bd867c
28e23fd
bb4c1e0
 
28e23fd
0bd867c
28e23fd
0bd867c
 
 
 
bb4c1e0
a855e3e
bc8fa15
 
e2221c7
a855e3e
28e23fd
0bd867c
28e23fd
0bd867c
bb4c1e0
28e23fd
bb4c1e0
 
 
 
28e23fd
bb4c1e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28e23fd
bb4c1e0
 
28e23fd
bb4c1e0
 
0bd867c
 
28e23fd
bb4c1e0
 
28e23fd
 
 
 
 
bb4c1e0
 
28e23fd
bb4c1e0
 
 
 
 
 
 
31cbe45
 
 
28e23fd
 
31cbe45
28e23fd
31cbe45
 
bb4c1e0
 
 
31cbe45
 
bb4c1e0
 
 
 
 
 
 
 
 
 
 
 
 
28e23fd
0bd867c
bb4c1e0
28e23fd
 
 
bb4c1e0
0bd867c
28e23fd
0bd867c
 
28e23fd
 
bb4c1e0
28e23fd
bb4c1e0
28e23fd
 
bb4c1e0
28e23fd
bb4c1e0
 
 
0bd867c
28e23fd
0bd867c
28e23fd
7ee1568
 
28e23fd
 
 
7ee1568
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28e23fd
7ee1568
 
22576e5
7ee1568
 
 
 
22576e5
 
7ee1568
 
22576e5
 
 
7ee1568
22576e5
 
7ee1568
 
22576e5
 
 
3ad03f2
7ee1568
 
22576e5
7ee1568
22576e5
 
7ee1568
 
 
 
 
 
28e23fd
 
7ee1568
 
 
 
 
 
 
 
 
 
945a4ac
 
7ee1568
 
 
 
 
 
 
 
 
 
 
28e23fd
0bd867c
28e23fd
0bd867c
28e23fd
b96d100
 
0bd867c
b96d100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66b1f5b
 
 
 
b96d100
66b1f5b
 
 
 
 
c18e35f
66b1f5b
 
 
 
 
b96d100
66b1f5b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
698e9e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b96d100
698e9e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28e23fd
b96d100
 
 
 
 
e0e242c
 
 
 
28e23fd
e0e242c
b96d100
 
 
 
 
ba32277
 
28e23fd
ba32277
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b96d100
 
 
 
 
 
28e23fd
66b1f5b
c18e35f
542453f
0bd867c
542453f
0bd867c
 
 
 
28e23fd
542453f
28e23fd
66b1f5b
 
 
 
28e23fd
66b1f5b
 
 
c18e35f
28e23fd
698e9e5
 
 
 
28e23fd
0bd867c
 
 
 
 
c18e35f
5524e77
0bd867c
28e23fd
 
66b1f5b
 
28e23fd
66b1f5b
b335dbb
66b1f5b
 
 
 
 
c18e35f
0bd867c
 
 
 
 
 
 
 
c18e35f
0bd867c
 
 
 
 
 
66b1f5b
0bd867c
 
 
 
 
66b1f5b
 
 
28e23fd
66b1f5b
 
28e23fd
66b1f5b
28e23fd
66b1f5b
 
 
 
 
 
 
 
b335dbb
66b1f5b
 
 
 
 
 
 
 
b335dbb
0bd867c
66b1f5b
 
 
 
 
 
28e23fd
698e9e5
 
 
 
 
 
 
28e23fd
66b1f5b
 
 
 
 
 
0bd867c
 
66b1f5b
 
 
c18e35f
28e23fd
698e9e5
28e23fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0bd867c
 
28e23fd
0bd867c
698e9e5
c18e35f
7ee1568
 
28e23fd
5524e77
15fc4e7
c18e35f
 
b335dbb
 
28e23fd
5524e77
28e23fd
b96d100
28e23fd
b96d100
28e23fd
0bd867c
 
28e23fd
 
 
0bd867c
bb4c1e0
28e23fd
 
 
 
 
 
 
 
bb4c1e0
28e23fd
0bd867c
28e23fd
b96d100
28e23fd
 
 
 
b96d100
28e23fd
 
bb4c1e0
28e23fd
 
 
b96d100
28e23fd
 
 
 
b96d100
28e23fd
 
 
 
b96d100
28e23fd
b96d100
28e23fd
 
 
 
 
 
b96d100
28e23fd
7ee1568
28e23fd
 
 
7ee1568
28e23fd
b96d100
 
28e23fd
 
bb4c1e0
28e23fd
 
 
 
 
 
b96d100
28e23fd
b96d100
28e23fd
 
b96d100
28e23fd
0bd867c
28e23fd
 
 
 
bb4c1e0
b96d100
 
28e23fd
b96d100
 
28e23fd
 
7ee1568
 
b96d100
 
 
28e23fd
b96d100
28e23fd
b96d100
 
 
5524e77
b96d100
5524e77
b96d100
7a91a9a
b96d100
28e23fd
070f625
b96d100
 
28e23fd
b96d100
 
070f625
27ccef7
28e23fd
27ccef7
b96d100
 
 
ba32277
28e23fd
ba32277
28e23fd
9ddd1ab
 
 
ba32277
 
 
 
b96d100
 
27ccef7
28e23fd
27ccef7
b96d100
 
 
 
 
 
 
 
 
 
27ccef7
b96d100
 
 
 
ba32277
 
 
 
27ccef7
ba32277
 
 
 
 
 
 
 
b96d100
 
 
 
28e23fd
 
 
 
 
b96d100
 
28e23fd
b96d100
28e23fd
 
5524e77
28e23fd
 
 
0bd867c
5524e77
cc7eba8
5524e77
28e23fd
cc7eba8
66b1f5b
 
 
28e23fd
66b1f5b
28e23fd
5524e77
 
 
e0e242c
 
28e23fd
 
5524e77
 
 
 
 
66b1f5b
5524e77
0bd867c
28e23fd
 
5524e77
 
 
 
66b1f5b
a5be05a
5524e77
28e23fd
5524e77
 
28e23fd
5524e77
cc7eba8
5524e77
28e23fd
67ad6f7
 
5524e77
 
28e23fd
5524e77
e0e242c
 
28e23fd
e0e242c
28e23fd
e0e242c
 
 
 
 
 
 
28e23fd
7ee1568
28e23fd
 
 
 
7ee1568
 
15fc4e7
7ee1568
5524e77
66b1f5b
 
15fc4e7
5524e77
c18e35f
28e23fd
 
 
c18e35f
28e23fd
31cbe45
 
66b1f5b
b96d100
66b1f5b
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
import gradio as gr
import json
import pandas as pd
import os
from typing import Optional
import tempfile
import requests
from openai import OpenAI
import re
import spacy
from spellchecker import SpellChecker
import difflib
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import hashlib

# ======================== WAC-GEC Import ========================
try:
    # Optional dependency: the whitespace-correction package backs the "WAC"
    # half of the WAC-GEC pipeline.
    from whitespace_correction import WhitespaceCorrector
    WAC_GEC_AVAILABLE = True
    # Placeholder only; initialize_wac_gec() assigns the corrector on first use.
    wac_corrector = None
except ImportError:
    # Package missing: record that WAC-GEC cannot be used and keep the
    # module-level names defined so later checks do not raise NameError.
    WAC_GEC_AVAILABLE = False
    wac_corrector = None
    print("⚠️ whitespace_correction not installed, WAC-GEC functionality unavailable")

# GEC tokenizer/model placeholders; initialize_wac_gec() fills them on first use.
gec_tokenizer = None
gec_model = None
# Model identifier passed to AutoTokenizer/AutoModelForCausalLM.from_pretrained.
GEC_MODEL_NAME = "lllouo/gec_Chat-LLaMa-2-7B-FT"

# ======================== API Configuration ========================
# API key is read from the environment; an empty string means "not configured"
# (check_api_key() rejects it for the DeepSeek model).
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
# Base URL handed to the OpenAI client in call_deepseek_api().
DEEPSEEK_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"

# ======================== NLP Tools Initialization ========================
# Load the small English spaCy pipeline. If it is not installed, download it
# once and load it again.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import subprocess
    import sys

    # Run the download with the interpreter that is executing this module
    # (a bare "python" may resolve to a different environment or not exist),
    # and fail loudly if the download itself fails instead of surfacing a
    # second, less informative OSError from spacy.load below.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")

# Shared spell checker used by calculate_spelling_error_density().
spell = SpellChecker()

# Compiled regexes; a sentence matching any of them is counted as a
# whitespace anomaly by calculate_whitespace_anomaly_rate().
WHITESPACE_PATTERNS = [
    re.compile(source)
    for source in (
        r'[ \t]{2,}',         # run of two or more spaces/tabs
        r'\u200B|\u2060',     # U+200B or U+2060 character
        r'\s+([.,!?;:])',     # whitespace directly before punctuation
        r'([.,!?;:])\s{2,}',  # two or more whitespace chars after punctuation
    )
]

# ======================== Prompt Template ========================
# Instruction prefix for LLM-based correction. The text ends with "[input]: "
# so the sentence to correct can be appended directly after it; the model is
# told to answer with a single "[output]: ..." line and to leave "___" blanks
# untouched.
PROMPT_TEMPLATE = """## Positioning
You are a **LANGUAGE grammatical error correction tool** that can identify and correct grammatical errors in a text.
Reply with a corrected version of the input sentence with all **grammatical**, **spelling** and **whitespace errors** fixed, making only necessary changes.
**If there are no errors, reply with a copy of the original sentence.**

## Formatting requirements
- [Input]: The sentence should start with the identifier [input], followed by the sentence provided by the user.
- [Output]: The sentence should start with the identifier [output], followed by the corrected sentence.
- **Just format the output as required, no need to give too much explanation. **
- **You only need to output [output]: corrected sentence. **

## Input and Output Examples
Example 1: Extra spaces and Missing spaces and Spelling errors
[input]: This is anexample sentence with in correct spa ces and spelling erorrs.
[output]: This is an example sentence with incorrect spaces and spelling errors.

Example 2: No errors, reply with a copy of the original sentence, don't fill in the contents of ___.
[input]: _______ such as bitcoin are becoming increasingly mainstream and have a whole host of associated ethical implications, for example, they are______ and more ______. However, they have also been used to engage in _______. 
[output]: _______ such as bitcoin are becoming increasingly mainstream and have a whole host of associated ethical implications, for example, they are______ and more ______. However, they have also been used to engage in _______.

## Task
Next, please correct the following sentence according to the above requirements.
**If there are no errors, reply with a copy of the original sentence. Don't fill in the contents of ___.**
**Remember: You only need to output [output]: Corrected sentence. **

[input]: """

# ======================== Initialize WAC + GEC ========================
def initialize_wac_gec():
    """Load the WAC and GEC models on first use and report readiness.

    The whitespace corrector and the tokenizer/model pair live in module-level
    globals; anything that is already loaded is left untouched.

    Returns:
        bool: True when both models are available. False when the WAC package
        is not installed or when loading either model raised an exception.
    """
    global wac_corrector, gec_tokenizer, gec_model

    # Stage 1: whitespace correction (WAC)
    if not WAC_GEC_AVAILABLE:
        print("❌ WAC module not installed")
        return False

    if wac_corrector is None:
        try:
            use_gpu = torch.cuda.is_available()
            device = "cuda" if use_gpu else "cpu"
            wac_corrector = WhitespaceCorrector.from_pretrained(
                model="eo_larger_byte",
                device=device,
                download_dir="./models"
            )
            print(f"βœ… WAC whitespace correction model loaded (device: {device})")
        except Exception as err:
            print(f"❌ WAC model loading failed: {err}")
            return False

    # Stage 2: grammar error correction (GEC); nothing to do if already loaded
    if gec_model is not None and gec_tokenizer is not None:
        return True

    try:
        use_gpu = torch.cuda.is_available()
        device = "cuda" if use_gpu else "cpu"

        print(f"πŸ“₯ Downloading GEC model from HuggingFace: {GEC_MODEL_NAME}")
        gec_tokenizer = AutoTokenizer.from_pretrained(
            GEC_MODEL_NAME,
            trust_remote_code=True
        )
        gec_model = AutoModelForCausalLM.from_pretrained(
            GEC_MODEL_NAME,
            device_map="auto" if use_gpu else None,
            torch_dtype=torch.bfloat16 if use_gpu else torch.float32,
            trust_remote_code=True
        )

        # Without device_map the model is moved to the CPU explicitly.
        if not use_gpu:
            gec_model = gec_model.to(device)

        # Reuse EOS as the padding token and pad on the left.
        gec_tokenizer.pad_token_id = gec_tokenizer.eos_token_id
        gec_tokenizer.padding_side = "left"

        print(f"βœ… GEC grammar correction model loaded (device: {device})")
    except Exception as err:
        print(f"❌ GEC model loading failed: {err}")
        return False

    return True

# ======================== GEC Grammar Correction Function ========================
def correct_sentence_gec(input_sentence):
    """
    Use GEC model for grammar correction

    Args:
        input_sentence (str): Sentence to be corrected
    Returns:
        str: Corrected sentence (the model continuation, stripped, with a
            leading "Corrected:" label removed if the model repeated it)
    Raises:
        ValueError: If the GEC model or tokenizer has not been loaded yet
    """
    if gec_model is None or gec_tokenizer is None:
        raise ValueError("GEC model not initialized")

    prompt = f"""Rewrite the following sentence to correct grammatical errors. Return ONLY the corrected sentence.
Original: {input_sentence}
Corrected:"""

    inputs = gec_tokenizer(prompt, return_tensors="pt").to(gec_model.device)

    is_cpu = str(gec_model.device) == "cpu" or not torch.cuda.is_available()

    # Smaller generation budget on CPU than on GPU.
    max_tokens, beams = (256, 2) if is_cpu else (512, 4)

    with torch.no_grad():
        outputs = gec_model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            num_beams=beams,
            do_sample=False,
            temperature=None,
            top_p=None
        )

    # Decode only the tokens generated after the prompt. Removing the prompt
    # with str.replace() on the decoded text silently fails whenever decoding
    # does not reproduce the prompt character-for-character (e.g. unusual
    # whitespace in the input), which would leak the whole prompt into the
    # result. Assumes generate() returns prompt tokens followed by the
    # continuation, as it does for causal language models.
    prompt_length = inputs["input_ids"].shape[-1]
    corrected_text = gec_tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    ).strip()

    if corrected_text.startswith("Corrected:"):
        corrected_text = corrected_text[len("Corrected:"):].strip()

    return corrected_text

# ======================== WAC-GEC Combined Processing ========================
def call_wac_gec(text):
    """
    Run the two-stage WAC-GEC correction on one piece of text.

    Stage 1 sends the text through correct_sentence_gec(); stage 2 passes that
    result to the WAC corrector's correct_text(). The final text is returned
    prefixed with "[output]: ".

    Raises:
        ValueError: If the models could not be initialized.
        Exception: If either correction stage fails (original error message
            is embedded in the new message).
    """
    models_ready = initialize_wac_gec()
    if not models_ready:
        raise ValueError("⚠️ WAC-GEC models not installed or failed to load")

    try:
        # Stage 1: grammar correction
        print(f"πŸ” GEC processing: {text[:50]}...")
        grammar_fixed = correct_sentence_gec(text)
        print(f"βœ… GEC result: {grammar_fixed[:50]}...")

        # Stage 2: whitespace correction
        print(f"πŸ” WAC processing: {grammar_fixed[:50]}...")
        whitespace_fixed = wac_corrector.correct_text(grammar_fixed)
        print(f"βœ… WAC result: {whitespace_fixed[:50]}...")

        return f"[output]: {whitespace_fixed}"

    except Exception as err:
        raise Exception(f"WAC-GEC processing error: {str(err)}")

# ======================== Color Diff Functions ========================
def generate_colored_diff(original, cleaned):
    """
    Generate a word-level HTML diff with color annotations.

    Both texts are split on whitespace and aligned with
    difflib.SequenceMatcher. Words that differ are wrapped in styled <span>
    elements: red for the original side (struck through when deleted),
    green for the cleaned side. Every word is HTML-escaped first, so dataset
    text containing '<', '>' or '&' is displayed literally instead of being
    interpreted as markup.

    Args:
        original (str): Text before denoising.
        cleaned (str): Text after denoising.
    Returns:
        tuple[str, str]: (original_html, cleaned_html), words joined by a
        single space.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    def _plain(words):
        # Unchanged words: escaped, no styling.
        return [escape(w, quote=False) for w in words]

    def _styled(words, style):
        # Changed words: escaped and wrapped in a styled span.
        return [f'<span style="{style}">{escape(w, quote=False)}</span>' for w in words]

    red_bold = "color: #dc3545; font-weight: bold;"
    red_strike = "color: #dc3545; text-decoration: line-through;"
    green_bold = "color: #28a745; font-weight: bold;"

    original_words = original.split()
    cleaned_words = cleaned.split()

    matcher = difflib.SequenceMatcher(None, original_words, cleaned_words)

    original_html = []
    cleaned_html = []

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'equal':
            original_html.extend(_plain(original_words[i1:i2]))
            cleaned_html.extend(_plain(cleaned_words[j1:j2]))
        elif tag == 'replace':
            original_html.extend(_styled(original_words[i1:i2], red_bold))
            cleaned_html.extend(_styled(cleaned_words[j1:j2], green_bold))
        elif tag == 'delete':
            original_html.extend(_styled(original_words[i1:i2], red_strike))
        elif tag == 'insert':
            cleaned_html.extend(_styled(cleaned_words[j1:j2], green_bold))

    return ' '.join(original_html), ' '.join(cleaned_html)

def create_comparison_html(original_list, cleaned_list):
    """
    Build the HTML comparison table for original vs. denoised questions.

    Items are paired positionally with zip(), so extra items in the longer
    list are ignored. Each pair is converted to str and rendered through
    generate_colored_diff(); rows are numbered from 1.

    Returns:
        str: A <div> containing inline CSS and the comparison <table>.
    """
    table_head = """
    <div style="font-family: 'Times New Roman', serif; max-width: 100%; overflow-x: auto;">
        <style>
            .comparison-table {
                width: 100%;
                border-collapse: collapse;
                margin: 20px 0;
                border: 1px solid #000;
            }
            .comparison-table th {
                background-color: #f2f2f2;
                color: #000;
                padding: 8px;
                text-align: left;
                font-weight: bold;
                border-bottom: 2px solid #000;
            }
            .comparison-table td {
                padding: 8px;
                border-bottom: 1px solid #ccc;
                line-height: 1.5;
                vertical-align: top;
            }
            .index-col {
                width: 50px;
                text-align: center;
                font-weight: bold;
                color: #555;
            }
        </style>
        <table class="comparison-table">
            <thead>
                <tr>
                    <th class="index-col">#</th>
                    <th>Original Question</th>
                    <th>Denoised Question</th>
                </tr>
            </thead>
            <tbody>
    """

    table_tail = """
            </tbody>
        </table>
    </div>
    """

    # Collect the rows first and join once at the end.
    body_rows = []
    for row_number, (before, after) in enumerate(zip(original_list, cleaned_list), start=1):
        before_html, after_html = generate_colored_diff(str(before), str(after))
        body_rows.append(f"""
                <tr>
                    <td class="index-col">{row_number}</td>
                    <td class="original-col">{before_html}</td>
                    <td class="cleaned-col">{after_html}</td>
                </tr>
        """)

    return table_head + "".join(body_rows) + table_tail

# ======================== Utility Functions ========================
def check_api_key(model_choice):
    """Raise ValueError if the DeepSeek model is selected without an API key.

    Any other model choice passes without inspecting the key.
    """
    if model_choice != "deepseek-r1-distill-llama-8b":
        return
    if not DEEPSEEK_API_KEY:
        raise ValueError("⚠️ Please configure DEEPSEEK_API_KEY in Space Settings!")

def call_deepseek_api(prompt, model="deepseek-r1-distill-llama-8b", temperature=0.1, stream=True):
    """Send a single-turn chat completion request and return the reply text.

    Args:
        prompt: Content of the single user message.
        model: Model name forwarded to the API (also checked by check_api_key).
        temperature: Sampling temperature forwarded to the API.
        stream: When true, the reply is consumed chunk by chunk and the
            non-empty content pieces are concatenated; otherwise the content
            of the first choice is returned directly.
    Raises:
        ValueError: From check_api_key() when the key is required but missing.
    """
    check_api_key(model)
    client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url=DEEPSEEK_BASE_URL)
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        stream=stream
    )

    if not stream:
        return completion.choices[0].message.content

    pieces = []
    for chunk in completion:
        # Skip chunks without choices or without text content.
        if not chunk.choices:
            continue
        piece = chunk.choices[0].delta.content
        if piece:
            pieces.append(piece)
    return "".join(pieces)

def process_sentence(sentence):
    """Reduce the input to one line and make sure it ends like a sentence.

    For input with more than one non-empty line only the last non-empty line
    is kept; otherwise the stripped input is used. If the kept text does not
    end with one of . ? ! ; , the marker " ___." is appended.
    """
    stripped = sentence.strip()
    non_empty = [part.strip() for part in stripped.split('\n') if part.strip()]
    candidate = non_empty[-1] if len(non_empty) > 1 else stripped

    if candidate.endswith(('.', '?', '!', ';', ',')):
        return candidate
    return candidate + " ___."

def is_valid_output(content_2, content_1, content_0):
    """Check whether a model reply (content_2) is acceptable.

    The reply must start with "[output]:" and contain no newline, must keep a
    "___" blank when either content_0 or content_1 contains one, and its
    length must be within a factor of two of content_1's length.
    """
    well_formed = content_2.startswith('[output]:') and '\n' not in content_2
    if not well_formed:
        return False

    blank_expected = '___' in content_0 or '___' in content_1
    if blank_expected and '___' not in content_2:
        return False

    reply_len = len(content_2)
    input_len = len(content_1)
    return not (reply_len > 2 * input_len or input_len > 2 * reply_len)

def extract_output_content(item):
    """Return the payload after a known result prefix, or None.

    Recognized prefixes are "[output]:" and "[ERROR] Failed to process:".
    The payload is stripped, and one pair of surrounding double quotes is
    removed when present.
    """
    for prefix in ('[output]:', '[ERROR] Failed to process:'):
        if not item.startswith(prefix):
            continue
        payload = item[len(prefix):].strip()
        if payload and payload[0] == '"' and payload[-1] == '"':
            return payload[1:-1]
        return payload
    return None

def has_missing_spaces(sentence):
    """Return True for a space-free string that spaCy splits into 2+ alphabetic tokens.

    Any sentence containing a space character is reported as False without
    running the spaCy pipeline.
    """
    if ' ' in sentence:
        return False
    alpha_count = sum(1 for token in nlp(sentence) if token.is_alpha)
    return alpha_count >= 2

def calculate_whitespace_anomaly_rate(sentences):
    """Return the percentage of sentences showing a whitespace anomaly.

    A sentence counts once if has_missing_spaces() flags it or if any regex
    in WHITESPACE_PATTERNS matches it. An empty input yields 0.0.
    """
    if not sentences:
        return 0.0

    flagged = sum(
        1
        for sentence in sentences
        if has_missing_spaces(sentence)
        or any(pattern.search(sentence) for pattern in WHITESPACE_PATTERNS)
    )
    return flagged / len(sentences) * 100

def normalize_tokens(text):
    """Return the lower-cased tokens of ``text`` that qualify for spell checking.

    Tokens are kept only if they are alphabetic, longer than two characters,
    and not written entirely in upper case.
    """
    return [
        token.text.lower()
        for token in nlp(text)
        if token.is_alpha and len(token.text) > 2 and not token.text.isupper()
    ]

def calculate_spelling_error_density(sentences):
    """Return unknown-word entries per 100 normalized tokens.

    Sentences flagged by has_missing_spaces() and sentences without any
    normalized token are skipped. For the rest, the size of the collection
    returned by spell.unknown() is added to the error total and the token
    count to the word total. Returns 0.0 when no tokens were counted.
    """
    word_total = 0
    error_total = 0
    for sentence in sentences:
        tokens = [] if has_missing_spaces(sentence) else normalize_tokens(sentence)
        if tokens:
            error_total += len(spell.unknown(tokens))
            word_total += len(tokens)

    return error_total / word_total * 100 if word_total else 0.0

# ======================== Leaderboard Data Processing ========================
def load_leaderboard_data():
    """Read leaderboard.json into a DataFrame, adding a short ID per entry.

    Each entry gets an 'ID' equal to the first 8 hex digits of the MD5 digest
    of its 'Benchmark' value. Any failure is printed and an empty DataFrame is
    returned instead.
    """
    json_path = "leaderboard.json"
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            entries = json.load(f)

        for entry in entries:
            digest = hashlib.md5(entry['Benchmark'].encode()).hexdigest()
            entry['ID'] = digest[:8]

        return pd.DataFrame(entries)
    except Exception as e:
        print(f"Error loading leaderboard: {e}")
        return pd.DataFrame()

def filter_leaderboard(df, category_query, version_query):
    """
    Filter a copy of the leaderboard by category and by benchmark version.

    "all" disables the corresponding filter. The category must equal the
    'Category' column; a known version key selects rows whose 'Benchmark'
    contains the matching marker (case-insensitive). Unknown version keys
    leave the rows unfiltered.
    """
    # Version key -> substring expected inside the benchmark name.
    version_markers = {
        "original": '_original',
        "deepseek": 'deepseek_r1_denoising',
        "wac_gec": 'wac_gec',
    }

    filtered = df.copy()

    if category_query != "all":
        filtered = filtered[filtered['Category'] == category_query]

    marker = version_markers.get(version_query) if version_query != "all" else None
    if marker is not None:
        filtered = filtered[filtered['Benchmark'].str.contains(marker, case=False, na=False)]

    return filtered

def search_leaderboard(df, query):
    """Return the rows whose 'Benchmark' contains ``query`` (case-insensitive).

    The query is matched as literal text, not as a regular expression, so
    search input such as "(" or "c++" cannot raise a regex error. An empty
    query returns ``df`` unchanged; missing benchmark names never match.
    """
    if not query:
        return df
    matches = df['Benchmark'].str.contains(query, case=False, na=False, regex=False)
    return df[matches]

# ======================== Dataset Denoising Function ========================
def _restore_multiline(original_item, cleaned_last_line):
    """Rebuild a multi-line sample with its last non-blank line replaced.

    Blank lines are dropped and the remaining lines are stripped, then the final
    line is swapped for ``cleaned_last_line``. When the original consists only of
    blank lines there is nothing to replace, so the cleaned text is returned alone.
    """
    kept_lines = [line.strip() for line in original_item.strip().split('\n') if line.strip()]
    if not kept_lines:
        return cleaned_last_line
    kept_lines[-1] = cleaned_last_line
    return '\n'.join(kept_lines)


def clean_dataset(file_path, question_column, model_choice, temperature, max_samples, progress=gr.Progress()):
    """Denoise one text column of a Parquet file and report before/after quality metrics.

    Reads ``file_path`` with pandas, takes at most ``max_samples`` values from
    ``question_column``, sends each one to the selected model (with retries), and
    writes the processed rows plus a ``<question_column>_cleaned`` column to a new
    Parquet file in the system temp directory.

    Args:
        file_path: Path of the Parquet file to read.
        question_column: Name of the column holding the text to denoise.
        model_choice: ``"WAC-GEC"`` selects ``call_wac_gec``; any other value is
            passed to ``call_deepseek_api`` as the model name.
        temperature: Converted with ``float`` and passed to ``call_deepseek_api``;
            not used for WAC-GEC.
        max_samples: Converted with ``int``; only this many leading rows are processed.
        progress: Callable used to report progress as ``progress(fraction, desc=...)``.

    Returns:
        A ``(log_text, output_path, preview_html)`` tuple. On failure the first
        element is an error message and the other two are ``None`` and ``""``.
    """
    try:
        try:
            check_api_key(model_choice)
        except ValueError as e:
            # A ValueError from the key check only blocks the DeepSeek model;
            # for any other choice it is deliberately ignored.
            if model_choice == "deepseek-r1-distill-llama-8b":
                return str(e), None, ""

        if model_choice == "WAC-GEC" and not WAC_GEC_AVAILABLE:
            return "❌ WAC-GEC model not installed! Please install whitespace_correction package.", None, ""

        progress(0.05, desc="πŸ“ Reading data file...")
        df = pd.read_parquet(file_path)

        if question_column not in df.columns:
            available_columns = ", ".join(df.columns.tolist())
            return f"❌ Column '{question_column}' not found!\nAvailable columns: {available_columns}", None, ""

        data_ori = df[question_column].tolist()[:int(max_samples)]
        total = len(data_ori)

        progress(0.08, desc="πŸ“Š Calculating original metrics...")
        original_sentences = [str(item) for item in data_ori]
        war_original = calculate_whitespace_anomaly_rate(original_sentences)
        sed_original = calculate_spelling_error_density(original_sentences)

        progress(0.1, desc=f"πŸš€ Starting denoising of {total} samples (model: {model_choice})...")

        # WAC-GEC receives the raw text; the API model receives the output of process_sentence.
        if model_choice == "WAC-GEC":
            data_corrupt = [str(item) for item in data_ori]
        else:
            data_corrupt = [process_sentence(str(item)) for item in data_ori]

        results = []
        max_retries = 5 if model_choice == "deepseek-r1-distill-llama-8b" else 3
        log_text = f"πŸš€ Processing {total} samples...\n"
        log_text += f"πŸ“Œ Using model: {model_choice}\n\n"

        for idx in range(total):
            progress((0.1 + 0.7 * idx / total), desc=f"Processing: {idx+1}/{total}")

            unprocess_text = str(data_ori[idx])
            original_text = data_corrupt[idx]
            response_content = ""
            retry_count = 0

            # Exactly one entry is appended to `results` per sample (success or
            # the [ERROR] placeholder), which keeps it aligned with `data_ori`.
            while retry_count < max_retries:
                try:
                    if model_choice == "WAC-GEC":
                        response_content = call_wac_gec(original_text)
                        is_acceptable = response_content.startswith('[output]:')
                    else:
                        response_content = call_deepseek_api(
                            PROMPT_TEMPLATE + original_text,
                            model=model_choice,
                            temperature=float(temperature)
                        )
                        is_acceptable = is_valid_output(response_content, original_text, unprocess_text)

                    if is_acceptable:
                        results.append(response_content)
                        break
                    retry_count += 1

                except Exception as e:
                    retry_count += 1
                    log_text += f"⚠️ Sample {idx+1} error, retry {retry_count}/{max_retries}: {str(e)}\n"
            else:
                # Reached only when every attempt was used up without a `break`.
                results.append(f"[ERROR] Failed to process: {original_text}")
                log_text += f"❌ Sample {idx+1} processing failed\n"

        progress(0.85, desc="πŸ“Š Post-processing...")

        lst_extracted = []
        error_count = 0
        unknown_count = 0

        for i, item in enumerate(results):
            extracted = extract_output_content(item)
            if extracted is None:
                # Nothing could be extracted: fall back to the untouched original text.
                lst_extracted.append(str(data_ori[i]))
                unknown_count += 1
            else:
                lst_extracted.append(extracted)
                if item.startswith('[ERROR]'):
                    error_count += 1

        lst_final = []
        for original_value, extracted in zip(data_ori, lst_extracted):
            item = str(original_value)
            if '\n' in item and model_choice != "WAC-GEC":
                lst_final.append(_restore_multiline(item, extracted))
            else:
                lst_final.append(extracted)

        progress(0.90, desc="πŸ“Š Calculating denoised metrics...")
        cleaned_sentences = [str(item) for item in lst_final]
        war_cleaned = calculate_whitespace_anomaly_rate(cleaned_sentences)
        sed_cleaned = calculate_spelling_error_density(cleaned_sentences)

        delta_war = war_cleaned - war_original
        delta_sed = sed_cleaned - sed_original

        progress(0.95, desc="πŸ’Ύ Saving results...")

        # Only the first `total` rows were processed. Assigning a shorter list to
        # the full frame raises a length-mismatch ValueError, so the output file
        # contains exactly the processed rows.
        df_cleaned = df.iloc[:total].copy()
        df_cleaned[question_column + '_cleaned'] = lst_final

        original_filename = os.path.basename(file_path)
        base_name = original_filename.replace('.parquet', '')
        model_suffix = "WAC-GEC" if model_choice == "WAC-GEC" else "DeepSeek"
        output_filename = f"{base_name}-Denoising-{model_suffix}.parquet"
        output_path = os.path.join(tempfile.gettempdir(), output_filename)

        df_cleaned.to_parquet(output_path, index=False)

        log_text += f"\n\nπŸ“Š Processing Complete!\n"
        log_text += f"{'='*50}\n"
        log_text += f"【Basic Statistics】\n"
        log_text += f"- Model used: {model_choice}\n"
        log_text += f"- Total samples: {total}\n"
        log_text += f"- Successfully processed: {total - error_count - unknown_count}\n"
        log_text += f"- Failed samples: {error_count}\n"
        log_text += f"- Unknown format: {unknown_count}\n"
        log_text += f"- Output file: {output_filename}\n\n"

        log_text += f"【Quality Metrics】\n"
        log_text += f"πŸ“ Whitespace Anomaly Rate (WAR):\n"
        log_text += f"   Original: {war_original:.2f}% β†’ Denoised: {war_cleaned:.2f}%\n"
        log_text += f"   Change: {delta_war:+.2f}% {'βœ… Improved' if delta_war < 0 else '⚠️ Increased'}\n\n"

        log_text += f"πŸ“ Spelling Error Density (SED):\n"
        log_text += f"   Original: {sed_original:.2f}% β†’ Denoised: {sed_cleaned:.2f}%\n"
        log_text += f"   Change: {delta_sed:+.2f}% {'βœ… Improved' if delta_sed < 0 else '⚠️ Increased'}\n"

        if model_choice == "WAC-GEC":
            log_text += f"\nπŸ’‘ Note: WAC-GEC uses two-step correction (GEC grammar + WAC whitespace)\n"

        log_text += f"{'='*50}\n"

        preview_html = create_comparison_html(data_ori[:5], lst_final[:5])

        progress(1.0, desc="βœ… Complete!")

        return log_text, output_path, preview_html

    except Exception as e:
        # UI boundary: surface any failure as text in the log box instead of raising.
        import traceback
        error_detail = traceback.format_exc()
        return f"❌ Processing error: {str(e)}\n\nDetailed error:\n{error_detail}", None, ""

# ======================== Text Content ========================
# Markdown source for the "About" tab; it is passed to gr.Markdown when the
# interface is built. This is user-facing runtime text, not developer notes.
ABOUT_TEXT = """
## Denoising Workflow

### Supported Models

#### 1. DeepSeek-R1 (deepseek-r1-distill-llama-8b)
- **Function**: Comprehensive grammar, spelling, and whitespace error correction
- **Advantages**: Strong comprehensive capability, handles multiple error types
- **Configuration**: Requires DEEPSEEK_API_KEY in Space Settings

#### 2. WAC-GEC (Whitespace + Grammar Error Correction)
- **Function**: Two-step correction workflow
  - **Step 1 (GEC)**: Use LLaMA-2-7B fine-tuned model for grammar and spelling correction
  - **Step 2 (WAC)**: Use whitespace correction model for spacing issues
- **Advantages**: 
  - Fully local, no API key required
  - Combines two specialized models
  - Suitable for offline environments and limited budgets
- **Model Source**: 
  - GEC: [lllouo/gec_Chat-LLaMa-2-7B-FT](https://huggingface.co/lllouo/gec_Chat-LLaMa-2-7B-FT)
  - WAC: whitespace_correction library

### Core Algorithm

1. **Preprocessing (process_sentence)**
   - Detect sentence completeness
   - Add marker `___` for incomplete sentences (DeepSeek only)
   - Preserve multi-line text format

2. **Model Denoising**
   - **DeepSeek**: Use API for comprehensive error correction, up to 5 retries
   - **WAC-GEC**: 
     - First use GEC model for grammar and spelling correction
     - Then use WAC model for whitespace correction
     - Up to 3 retries

3. **Format Validation**
   - Verify output format correctness
   - Check marker preservation
   - Length reasonability check

4. **Post-processing**
   - Extract denoised content
   - Restore original multi-line format
   - Generate Parquet file with model identifier

### Supported Datasets

- **MMLU**: Multiple choice questions across 57 subjects
- **GSM8K**: Math reasoning problems
- **ARC-Challenge**: Science Q&A
- **MedMCQA**: Medical multiple choice
- **CoQA**: Conversational Q&A
- And more...

### Color Annotation Legend

- πŸ”΄ **Red**: Errors in original text (spelling, grammar, spacing, etc.)
- 🟒 **Green**: Corrections after denoising
- ⚫ **Black**: Unchanged correct parts

### Tech Stack

- **LLM**: DeepSeek API (deepseek-r1-distill-llama-8b)
- **Local Models**: 
  - GEC: LLaMA-2-7B (fine-tuned for grammar correction)
  - WAC: Whitespace Correction Model
- **Frontend**: Gradio 4.16.0
- **Data Processing**: Pandas + PyArrow (Parquet)
- **Diff Comparison**: Python difflib
- **NLP Tools**: spaCy, pyspellchecker
- **API Calls**: OpenAI SDK
- **Deployment**: Hugging Face Spaces

### Quality Metrics

- **WAR (Whitespace Anomaly Rate)**: Whitespace anomaly rate
- **SED (Spelling Error Density)**: Spelling error density

### Model Selection Guide

- **Need comprehensive denoising + API budget**: Choose DeepSeek-R1
- **Local deployment + complete correction**: Choose WAC-GEC (Recommended)
- **Only need spacing correction**: Use WAC module alone
- **Fastest speed**: Use GPU-accelerated WAC-GEC

---

**Graduate Thesis Research Showcase** | Powered by DeepSeek API & WAC-GEC
"""

# ======================== Gradio Interface ========================
# Top-level Gradio app: a leaderboard tab, an interactive denoising demo tab,
# and an "About" tab. Everything inside `with demo:` is registered on this Blocks.
demo = gr.Blocks(title="Dataset Denoising Framework Demo System", css="""
    .markdown-text { font-size: 16px; line-height: 1.6; }
""")

with demo:
    gr.Markdown(
        """<div style="text-align: center;"><h1>⭐ <span style='color: #e6b800;'>Denoising Factory</span> Based on Benchmark Denoising Framework</h1></div>
        <br>
        <p>This system demonstrates the denoising effects of DeepSeek-R1 and WAC-GEC methods on mainstream benchmark datasets based on <a href="https://github.com/LLLoUo/bd-toolkit" target="_blank">BD-toolkit</a>. Quality is evaluated using WAR (Whitespace Anomaly Rate) and SED (Spelling Error Density) metrics.</p>
        """,
        elem_classes="markdown-text"
    )

    leaderboard_data = load_leaderboard_data()
    # Columns shown in the visible table, in display order.
    leaderboard_columns = ['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("πŸ“Š BD-benchmarks Leaderboard", id=0):
            with gr.Column():
                gr.Markdown("### Mainstream Benchmark Leaderboard After BD Denoising")

                with gr.Row():
                    search_bar = gr.Textbox(
                        placeholder="πŸ” Search benchmark name and press ENTER...",
                        show_label=False,
                        elem_id="search-bar",
                    )
                    filter_categories = gr.Radio(
                        label="πŸ“‚ Filter by Benchmark Category",
                        choices=["all", "BT", "RA", "TG", "SU", "ME", "GR"],
                        value="all",
                        elem_id="filter-columns",
                    )
                    filter_versions = gr.Radio(
                        label="πŸ”– Filter by Dataset Version",
                        choices=[
                            ("All Versions", "all"),
                            ("Original", "original"),
                            ("DeepSeek-R1-denoised", "deepseek"),
                            ("WAC-GEC", "wac_gec")
                        ],
                        value="all",
                        elem_id="filter-versions",
                    )

                leaderboard_table = gr.Dataframe(
                    # reindex (instead of [[...]]) yields empty columns rather than a
                    # KeyError when the leaderboard failed to load and has no columns.
                    value=leaderboard_data.reindex(columns=leaderboard_columns),
                    headers=['ID', 'Category', 'Benchmark', 'WAR (%)', 'SED', 'Download'],
                    # ID is an 8-character hex digest, i.e. text, not a number.
                    datatype=['str', 'str', 'str', 'number', 'number', 'markdown'],
                    elem_id="leaderboard-table",
                    interactive=False,
                )

                # Invisible copy of the full data, used as the input of every refresh.
                hidden_leaderboard = gr.Dataframe(
                    value=leaderboard_data,
                    visible=False
                )

                def combined_filter(df, category, version, query=""):
                    """Apply the category/version filters and the search text together.

                    Using one callback for all three widgets keeps the table
                    consistent with what the widgets currently show.
                    """
                    filtered = filter_leaderboard(df, category, version)
                    matched = search_leaderboard(filtered, query)
                    return matched[leaderboard_columns]

                refresh_inputs = [hidden_leaderboard, filter_categories, filter_versions, search_bar]

                search_bar.submit(
                    combined_filter,
                    refresh_inputs,
                    leaderboard_table
                )

                filter_categories.change(
                    combined_filter,
                    refresh_inputs,
                    leaderboard_table
                )

                filter_versions.change(
                    combined_filter,
                    refresh_inputs,
                    leaderboard_table
                )

                gr.Markdown("""
                **Legend:**
                - **Category**: BT=Basic Tasks, RA=Reasoning Abilities, TG=Text Generation, SU=Speech Understanding, ME=Medical, GR=Grammar
                - **Version**: Original=Unprocessed dataset, DeepSeek-R1=DeepSeek denoised version, WAC-GEC=WAC-GEC denoised version
                - **WAR**: Whitespace Anomaly Rate (lower is better)
                - **SED**: Spelling Error Density (lower is better)
                """, elem_classes="markdown-text")

        with gr.TabItem("πŸš€ BD-toolkit Demo", id=2):
            gr.Markdown("## BD-toolkit Lightweight Demo")

            # Status line built once, when the interface is constructed.
            model_status = "βœ… WAC-GEC: " + ("Available" if WAC_GEC_AVAILABLE else "Not Installed")
            model_status += " | βœ… DeepSeek-R1: " + ("Configured" if DEEPSEEK_API_KEY else "API Key Not Configured")
            gr.Markdown(f"**Model Status**: {model_status}")

            with gr.Row():
                with gr.Column():
                    file_input = gr.File(
                        label="πŸ“ Upload Parquet File",
                        file_types=[".parquet"]
                    )

                    question_column = gr.Textbox(
                        label="πŸ“ Question Column Name",
                        value="question",
                        placeholder="e.g., question, input_text, prompt"
                    )

                    model_choice = gr.Dropdown(
                        choices=["WAC-GEC", "deepseek-r1-distill-llama-8b"],
                        value="WAC-GEC",
                        label="πŸ€– Select Model",
                        info="DeepSeek: Comprehensive correction | WAC-GEC: Grammar + whitespace (local model)"
                    )

                    # Starts disabled because the default model (WAC-GEC) ignores it.
                    temperature = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=0.1,
                        step=0.1,
                        label="🌑️ Temperature",
                        info="Only effective for DeepSeek",
                        interactive=False
                    )

                    max_samples = gr.Slider(
                        minimum=1,
                        maximum=100,
                        value=5,
                        step=1,
                        label="πŸ“Š Number of Samples to Process (Demo Limit)"
                    )

                    clean_btn = gr.Button("πŸš€ Start Denoising", variant="primary", size="lg")

                with gr.Column():
                    output_text = gr.Textbox(
                        label="⏳ Processing Progress",
                        lines=10,
                        max_lines=15
                    )

                    download_file = gr.File(label="πŸ“₯ Download Denoised Dataset")

            def update_temperature_interactive(model):
                """Enable the temperature slider only for the DeepSeek model."""
                if model == "deepseek-r1-distill-llama-8b":
                    return gr.update(interactive=True, info="Adjust generation randomness")
                else:
                    return gr.update(interactive=False, info="WAC-GEC model does not support temperature parameter")

            model_choice.change(
                fn=update_temperature_interactive,
                inputs=[model_choice],
                outputs=[temperature]
            )

            gr.Markdown("### 🎨 Denoising Effect Comparison Preview")
            gr.Markdown("""
            **Color Legend**: 
            - πŸ”΄ <span style="color: #dc3545;">Red</span> = Errors in original text
            - 🟒 <span style="color: #28a745;">Green</span> = Corrections after denoising
            - ⚫ Black = Unchanged correct parts
            """)

            colored_preview = gr.HTML(label="")

            # clean_dataset returns (log text, output file path, preview HTML), in this order.
            clean_btn.click(
                fn=clean_dataset,
                inputs=[file_input, question_column, model_choice, temperature, max_samples],
                outputs=[output_text, download_file, colored_preview]
            )

        with gr.TabItem("πŸ“ About", id=3):
            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")

if __name__ == "__main__":
    # initialize_wac_gec() runs before the server is launched; the message
    # printed here describes that step as preloading the WAC-GEC models.
    print("πŸš€ Preloading WAC-GEC models...")
    initialize_wac_gec()

    # Bind to all interfaces on port 7860, with ssr_mode turned off.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "ssr_mode": False,
    }
    demo.launch(**launch_options)