File size: 3,177 Bytes
1d39b8a
 
 
cb2415a
4524238
3ee1970
4235ba5
11c10f2
1d39b8a
3ee1970
4524238
 
 
 
 
 
 
 
 
 
 
d0e1ffd
4524238
 
3ee1970
ed2703c
4524238
11c10f2
d0e1ffd
ed2703c
d0e1ffd
 
11c10f2
3ee1970
ed2703c
 
 
 
 
 
 
 
 
 
3ee1970
ed2703c
 
 
 
 
 
 
 
 
cfb6a26
094d492
 
 
 
 
a385b05
330dfff
11c10f2
abdc326
3ee1970
094d492
3ee1970
ed2703c
d0e1ffd
ed2703c
094d492
3ee1970
a385b05
3ee1970
708dbc3
ed2703c
708dbc3
 
ed2703c
708dbc3
1d39b8a
674efd1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import gradio as gr
from transformers import pipeline
import spacy
from gradio_client import Client
import re

# Initialize models once at import time so every request reuses them.
# `nlp` is the spaCy pipeline used by preprocess_text() for entity and tag extraction.
nlp = spacy.load("en_core_web_sm")
# `spell_checker` is a Hugging Face text2text-generation pipeline; preprocess_text()
# reads the corrected sentence from the first result's 'generated_text' field.
spell_checker = pipeline("text2text-generation", model="oliverguhr/spelling-correction-english-base")

def preprocess_capitalization(text: str) -> str:
    """Normalize the capitalization of mixed-case words in *text*.

    A word is a maximal run of non-whitespace characters. All whitespace
    (spaces, tabs, newlines, repeated spaces) is preserved exactly.

    Rules, applied per word:
      * Words with no lowercase ASCII letter (acronyms such as ``NASA``) or
        no uppercase ASCII letter (``hello``, ``123``) are left unchanged.
      * Words mixing uppercase and lowercase ASCII letters are rewritten so
        that their first alphabetic character is uppercase and every
        following character is lowercase (``hELLO`` -> ``Hello``).

    Args:
        text: The raw input sentence(s).

    Returns:
        The text with mixed-case words normalized.
    """

    def _normalize(match):
        word = match.group(0)
        has_upper = re.search(r"[A-Z]", word)
        has_lower = re.search(r"[a-z]", word)
        if not (has_upper and has_lower):
            # Acronyms, all-lowercase words, numbers, punctuation: untouched.
            return word
        # Capitalize the first *letter*, not the first character, so leading
        # punctuation such as an opening quote or bracket does not cause the
        # real initial to be lowercased ('"Hello' must not become '"hello').
        first = next(i for i, ch in enumerate(word) if ch.isalpha())
        return word[:first] + word[first].upper() + word[first + 1:].lower()

    # Tokenizing on \S+ (instead of splitting on a single space) keeps words
    # separated by newlines or tabs from being treated as one long word.
    return re.sub(r"\S+", _normalize, text)

def preprocess_text(text: str, is_suggestion_applied: bool = False):
    """Translate *text* and collect correction suggestions, entities and tags.

    Args:
        text: The user's input text.
        is_suggestion_applied: When True, the caller has already applied a
            suggestion, so no new suggestions are generated and the text is
            translated as-is.

    Returns:
        A ``(translation, result)`` tuple. ``translation`` is the value
        returned by the remote translation service, or a string starting with
        ``"Error: "`` if contacting the service failed. ``result`` is a dict
        with the keys:

        * ``"spell_suggestions"``: at most one ``{"original", "corrected"}``
          entry produced by ``preprocess_capitalization``.
        * ``"other_suggestions"``: at most one ``{"original", "corrected"}``
          entry produced by the transformer spell checker.
        * ``"entities"``: ``{"text", "label"}`` dicts for each spaCy entity.
        * ``"tags"``: token texts starting with ``#`` or ``@``.

        For empty or whitespace-only input the function returns
        ``("", result)`` with every list empty and performs no model or
        network call.
    """
    result = {
        "spell_suggestions": [],
        "other_suggestions": [],
        "entities": [],
        "tags": [],
    }

    # Nothing to correct, translate or annotate: skip the model inference and
    # the remote call entirely instead of feeding them an empty prompt.
    if not text or not text.strip():
        return "", result

    # Only generate suggestions if no suggestion has been applied
    if not is_suggestion_applied:
        # Apply capitalization preprocessing (spell suggestions)
        capitalized_text = preprocess_capitalization(text)
        if capitalized_text != text:
            result["spell_suggestions"].append({
                "original": text,
                "corrected": capitalized_text
            })
            # The capitalization fix is applied immediately: everything below
            # (spell check, translation, entity extraction) sees the fixed text.
            text = capitalized_text

        # Transformer spell check (other suggestions). Unlike the step above,
        # its output is only *suggested*; `text` is not replaced by it.
        spell_checked = spell_checker(text, max_length=512)[0]['generated_text']
        if spell_checked != text:
            result["other_suggestions"].append({
                "original": text,
                "corrected": spell_checked
            })

    # Translate the text (after preprocessing if first pass, or as-is if suggestion applied).
    # The client is created inside the try block so that a failure to reach
    # the service is reported the same way as a failed prediction.
    try:
        client = Client("Frenchizer/space_21")
        translation = client.predict(text)
    except Exception as e:
        translation = f"Error: {str(e)}"

    # Add entities and tags
    doc = nlp(text)
    result["entities"] = [{"text": ent.text, "label": ent.label_} for ent in doc.ents]
    # NOTE(review): this relies on the tokenizer keeping "#tag" / "@name" as a
    # single token; if it splits the prefix off, only "#" would be captured.
    # Confirm against the loaded spaCy model's tokenization.
    result["tags"] = [token.text for token in doc if token.text.startswith(('#', '@'))]

    return translation, result

def preprocess_and_forward(text: str, is_suggestion_applied: bool = False):
    """Gradio event handler: run preprocessing and translation on *text*.

    Delegates to ``preprocess_text`` and hands back its
    ``(translation, preprocessing_result)`` pair unchanged.
    """
    return preprocess_text(text, is_suggestion_applied)

# Gradio interface: one text input, a hidden boolean flag, one text output and
# a button that runs preprocess_and_forward on click.
with gr.Blocks() as demo:
    input_text = gr.Textbox(label="Input Text")
    # Invisible checkbox (visible=False) passed as the handler's second argument;
    # it defaults to False, which makes preprocess_text generate suggestions.
    is_suggestion_applied = gr.Checkbox(label="Suggestion Applied", value=False, visible=False)  # Hidden flag
    output_text = gr.Textbox(label="Output Text")
    preprocess_button = gr.Button("Process")
    # NOTE(review): preprocess_and_forward returns two values (translation and the
    # preprocessing dict) but only one output component is wired here — confirm
    # how Gradio renders that and whether a second output component is intended.
    preprocess_button.click(fn=preprocess_and_forward, inputs=[input_text, is_suggestion_applied], outputs=[output_text])

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()