File size: 2,833 Bytes
1d39b8a
 
 
cb2415a
4524238
1d39b8a
4235ba5
11c10f2
1d39b8a
 
4524238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1d39b8a
4524238
11c10f2
abdc326
 
 
11c10f2
 
4524238
 
 
abdc326
 
a3aabf5
abdc326
4524238
028169b
11c10f2
 
028169b
abdc326
 
a3aabf5
abdc326
1d39b8a
abdc326
028169b
11c10f2
abdc326
674ee5c
abdc326
11c10f2
 
4524238
abdc326
11c10f2
abdc326
c7735d5
674ee5c
abdc326
a11ae53
674ee5c
a11ae53
1d39b8a
 
 
 
11c10f2
1d39b8a
11c10f2
1d39b8a
 
674efd1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import gradio as gr
from transformers import pipeline
import spacy
from gradio_client import Client
import re

# Initialize models once at import time; both objects are module-level
# globals that preprocess_text() calls on every request.
# - nlp: spaCy pipeline loaded from the "en_core_web_sm" package; used for
#   named-entity extraction and for tokenising the text when looking for tags.
# - spell_checker: Hugging Face "text2text-generation" pipeline; its output
#   is read from the 'generated_text' field of the first result.
nlp = spacy.load("en_core_web_sm")
spell_checker = pipeline("text2text-generation", model="oliverguhr/spelling-correction-english-base")

def preprocess_capitalization(text: str) -> str:
    """Normalise oddly capitalised words while leaving the rest of *text* intact.

    The text is processed one whitespace-delimited token at a time; all
    whitespace (spaces, tabs, newlines, runs of several) is preserved exactly.

    Rules applied to each token:

    * Tokens starting with ``#`` or ``@`` (hashtags / mentions) are returned
      unchanged, so tag extraction later in the pipeline sees them as typed.
    * Tokens that do not contain both an ASCII uppercase and an ASCII
      lowercase letter (all-caps acronyms such as ``NASA``, plain lowercase
      words, numbers, punctuation) are returned unchanged.
    * Any other token has mixed capitalisation (e.g. ``HEllo``): its first
      letter is upper-cased and everything after it is lower-cased. Leading
      non-letter characters such as an opening quote or bracket are kept in
      place, so a correctly capitalised word behind punctuation is not
      lower-cased by accident.

    Args:
        text: Raw input text; may be empty.

    Returns:
        The text with mixed-case tokens re-capitalised.
    """

    def _fix_token(match):
        """Return the replacement for a single non-whitespace token."""
        word = match.group(0)

        # Hashtags and mentions are identifiers, not prose: keep verbatim.
        if word.startswith(("#", "@")):
            return word

        # Only tokens mixing upper- and lower-case letters are rewritten;
        # this also covers pure acronyms, which have no lowercase letter.
        if not (re.search(r"[A-Z]", word) and re.search(r"[a-z]", word)):
            return word

        # Capitalise from the first *letter*, not from index 0, so leading
        # punctuation does not swallow the capital. A letter always exists
        # here because the check above found at least one.
        first = next(i for i, ch in enumerate(word) if ch.isalpha())
        return word[:first] + word[first].upper() + word[first + 1:].lower()

    # Substituting on runs of non-whitespace keeps every separator untouched.
    return re.sub(r"\S+", _fix_token, text)

def preprocess_text(text: str):
    """Run capitalization cleanup, spell checking and entity/tag extraction.

    Steps, in order:

    1. ``preprocess_capitalization`` is applied; if it changes the text, a
       suggestion is recorded and the cleaned text replaces the input for
       every later step.
    2. The transformer spell checker is run; if its output differs, a second
       suggestion is recorded. The spell-checked text is only reported, it
       does not replace the working text.
    3. spaCy is run on the working text to collect named entities and any
       tokens starting with ``#`` or ``@``.

    Returns:
        A ``(text, result)`` tuple where ``text`` is the working text after
        step 1 and ``result`` is a dict with the keys ``"spell_suggestions"``
        (list of ``{"original", "corrected"}`` dicts), ``"entities"`` (list of
        ``{"text", "label"}`` dicts) and ``"tags"`` (list of token strings).
    """
    suggestions = []

    # Step 1: capitalization cleanup becomes the new working text.
    normalized = preprocess_capitalization(text)
    if normalized != text:
        suggestions.append({"original": text, "corrected": normalized})
        text = normalized

    # Step 2: transformer spell check, reported as a suggestion only.
    generated = spell_checker(text, max_length=512)
    spell_checked = generated[0]['generated_text']
    if spell_checked != text:
        suggestions.append({"original": text, "corrected": spell_checked})

    # Step 3: entities and hashtag/mention tokens from the working text.
    doc = nlp(text)
    entities = []
    for ent in doc.ents:
        entities.append({"text": ent.text, "label": ent.label_})
    tags = []
    for token in doc:
        if token.text.startswith(('#', '@')):
            tags.append(token.text)

    return text, {
        "spell_suggestions": suggestions,
        "entities": entities,
        "tags": tags,
    }

def preprocess_and_forward(text: str):
    """Preprocess *text* and forward it to the remote translation Space.

    The text sent for translation is the first value returned by
    ``preprocess_text`` (the input after capitalization cleanup), not the
    spell-checked suggestion.

    Args:
        text: Raw user input.

    Returns:
        A ``(translation, preprocessing_result)`` tuple. If connecting to the
        Space or calling it raises, the first element is the string
        ``"Error: <message>"`` instead of a translation; the preprocessing
        result is returned either way.
    """
    forwarded_text, preprocessing_result = preprocess_text(text)

    try:
        # The client is created inside the try block: gradio_client contacts
        # the Space while constructing Client, so an unreachable or missing
        # Space must be reported the same way as a failed predict() call
        # rather than escaping as an unhandled exception.
        client = Client("Frenchizer/space_21")
        translation = client.predict(forwarded_text)
    except Exception as e:
        # Deliberate catch-all at the UI boundary: surface the failure to the
        # user as text while still returning the preprocessing details.
        return f"Error: {str(e)}", preprocessing_result

    return translation, preprocessing_result

# Gradio interface: one input textbox, one output textbox and a button that
# runs preprocess_and_forward on the input when clicked.
with gr.Blocks() as demo:
    input_text = gr.Textbox(label="Input Text")
    output_text = gr.Textbox(label="Output Text")
    preprocess_button = gr.Button("Process")
    # NOTE(review): preprocess_and_forward returns a 2-tuple
    # (translation, preprocessing_result) but only one output component is
    # wired here — confirm how Gradio renders that in the output textbox.
    preprocess_button.click(fn=preprocess_and_forward, inputs=[input_text], outputs=[output_text])

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()