Spaces:
Sleeping
Sleeping
File size: 2,833 Bytes
1d39b8a cb2415a 4524238 1d39b8a 4235ba5 11c10f2 1d39b8a 4524238 1d39b8a 4524238 11c10f2 abdc326 11c10f2 4524238 abdc326 a3aabf5 abdc326 4524238 028169b 11c10f2 028169b abdc326 a3aabf5 abdc326 1d39b8a abdc326 028169b 11c10f2 abdc326 674ee5c abdc326 11c10f2 4524238 abdc326 11c10f2 abdc326 c7735d5 674ee5c abdc326 a11ae53 674ee5c a11ae53 1d39b8a 11c10f2 1d39b8a 11c10f2 1d39b8a 674efd1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
import gradio as gr
from transformers import pipeline
import spacy
from gradio_client import Client
import re
# Models are loaded once at import time and shared by every request.
# spaCy pipeline used for named-entity extraction.
nlp = spacy.load("en_core_web_sm")
# Text-to-text transformer used to propose spelling corrections.
spell_checker = pipeline(
    task="text2text-generation",
    model="oliverguhr/spelling-correction-english-base",
)
def preprocess_capitalization(text: str) -> str:
    """Normalize the capitalization of each space-separated word in *text*.

    Words made up only of uppercase ASCII letters are treated as acronyms
    and kept as they are. Words containing both uppercase and lowercase
    ASCII letters (e.g. "HEllo") are rewritten with the first character
    uppercased and the remainder lowercased. Every other word is kept
    unchanged. Words are split on single spaces and re-joined the same way,
    so the spacing of the input is preserved.
    """

    def _fix_token(token: str) -> str:
        # Pure-uppercase tokens are assumed to be acronyms.
        if re.match(r"^[A-Z]+$", token):
            return token
        contains_upper = re.search(r"[A-Z]", token) is not None
        contains_lower = re.search(r"[a-z]", token) is not None
        if contains_upper and contains_lower:
            # Mixed case: force "Xxxx" shape.
            return token[0].upper() + token[1:].lower()
        return token

    return " ".join(_fix_token(token) for token in text.split(" "))
def preprocess_text(text: str):
    """Analyse *text* and collect suggested corrections, entities and tags.

    The text is first passed through ``preprocess_capitalization``; if that
    changes it, the change is recorded and the normalized text is used for
    all later steps. The spell checker's output is only recorded as a
    suggestion, it is not applied to the returned text.

    Args:
        text: The raw input text.

    Returns:
        A ``(text, result)`` tuple. ``text`` is the input after the
        capitalization pass. ``result`` is a dict with three keys:

        * ``"spell_suggestions"``: list of ``{"original", "corrected"}``
          dicts, each holding the whole text before and after a correction
          step (capitalization and/or spell check).
        * ``"entities"``: list of ``{"text", "label"}`` dicts, one per
          spaCy entity.
        * ``"tags"``: list of ``#hashtag`` / ``@mention`` strings found in
          the text.
    """
    result = {
        "spell_suggestions": [],
        "entities": [],
        "tags": []
    }

    # Apply capitalization preprocessing
    capitalized_text = preprocess_capitalization(text)
    if capitalized_text != text:
        result["spell_suggestions"].append({
            "original": text,
            "corrected": capitalized_text
        })
        text = capitalized_text  # Update text for further processing

    # Transformer spell check
    spell_checked = spell_checker(text, max_length=512)[0]['generated_text']
    if spell_checked != text:
        result["spell_suggestions"].append({
            "original": text,
            "corrected": spell_checked
        })

    # Add entities
    doc = nlp(text)
    result["entities"] = [{"text": ent.text, "label": ent.label_} for ent in doc.ents]

    # Tags are taken from the raw text rather than from spaCy tokens: the
    # tokenizer splits a leading "#" into its own token, so a token-based
    # check would report a bare "#" instead of the full hashtag. The
    # lookbehind keeps the "@domain" part of e-mail addresses from matching.
    result["tags"] = re.findall(r"(?<!\w)[#@]\w+", text)

    return text, result
def preprocess_and_forward(text: str):
    """Preprocess *text* and forward it to the translation service.

    Args:
        text: The raw input text.

    Returns:
        A ``(translation, preprocessing_result)`` tuple. If the translation
        service cannot be reached or the request fails, the first element
        is an ``"Error: ..."`` message instead of a translation. The
        preprocessing result is returned in both cases.
    """
    # preprocess_text hands back the text after its capitalization pass;
    # that is the text that gets translated.
    normalized_text, preprocessing_result = preprocess_text(text)

    try:
        # Creating the client contacts the remote Space and can fail just
        # like the prediction call, so both are covered by the same handler.
        client = Client("Frenchizer/space_21")
        translation = client.predict(normalized_text)
    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        return f"Error: {str(e)}", preprocessing_result
    return translation, preprocessing_result
# Gradio interface
with gr.Blocks() as demo:
    input_text = gr.Textbox(label="Input Text")
    output_text = gr.Textbox(label="Output Text")
    # preprocess_and_forward returns two values (the translation and the
    # preprocessing result dict), so a second output component is wired up
    # to receive the dict.
    preprocessing_output = gr.JSON(label="Preprocessing Results")
    preprocess_button = gr.Button("Process")
    preprocess_button.click(
        fn=preprocess_and_forward,
        inputs=[input_text],
        outputs=[output_text, preprocessing_output],
    )

# Only start the server when this file is run as a script.
if __name__ == "__main__":
    demo.launch()