Spaces:
Runtime error
Runtime error
File size: 3,210 Bytes
1d39b8a 4235ba5 cb2415a 4524238 1d39b8a 4235ba5 11c10f2 1d39b8a 4524238 1d39b8a 4524238 11c10f2 4524238 11c10f2 1d39b8a 34b88ee 11c10f2 1d39b8a 11c10f2 674ee5c 11c10f2 4524238 11c10f2 674ee5c 11c10f2 a11ae53 674ee5c a11ae53 1d39b8a 11c10f2 1d39b8a 11c10f2 1d39b8a a11ae53 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
import gradio as gr
from transformers import pipeline
import spacy
from textblob import TextBlob
from gradio_client import Client
import re
# Initialize models
nlp = spacy.load("en_core_web_sm")
spell_checker = pipeline("text2text-generation", model="oliverguhr/spelling-correction-english-base")
def preprocess_capitalization(text: str) -> str:
"""Preprocess input text to handle capitalization rules."""
words = text.split(" ")
processed_words = []
for word in words:
# Check if the word is an acronym (all uppercase letters)
if re.match(r"^[A-Z]+$", word):
processed_words.append(word) # Leave acronyms unchanged
# Check if the word has mixed capitalization (e.g., "HEllo")
elif re.search(r"[A-Z]", word) and re.search(r"[a-z]", word):
processed_words.append(word[0].upper() + word[1:].lower()) # Correct capitalization
else:
processed_words.append(word) # Leave other words unchanged
return " ".join(processed_words)
def preprocess_text(text: str):
"""Process text and return corrections with position information."""
result = {
"spell_suggestions": [],
"entities": [],
"tags": []
}
# Apply capitalization preprocessing
capitalized_text = preprocess_capitalization(text)
if capitalized_text != text:
result["spell_suggestions"].append({
"original": text,
"corrected": capitalized_text
})
text = capitalized_text # Update text for further processing
# Find and record positions of corrections
doc = nlp(text)
# TextBlob spell check with position tracking
blob = TextBlob(text)
corrected = str(blob.correct())
if corrected != text:
result["spell_suggestions"].append({
"original": text,
"corrected": corrected
})
# Transformer spell check
spell_checked = spell_checker(text, max_length=512)[0]['generated_text']
if spell_checked != text and spell_checked != corrected:
result["spell_suggestions"].append({
"original": text,
"corrected": spell_checked
})
# Add entities and tags
result["entities"] = [{"text": ent.text, "label": ent.label_} for ent in doc.ents]
result["tags"] = [token.text for token in doc if token.text.startswith(('#', '@'))]
return text, result
def preprocess_and_forward(text: str):
"""Process text and forward to translation service."""
original_text, preprocessing_result = preprocess_text(text)
# Forward original text to translation service
client = Client("Frenchizer/space_17")
try:
translation = client.predict(original_text)
return translation, preprocessing_result
except Exception as e:
return f"Error: {str(e)}", preprocessing_result
# Gradio interface
with gr.Blocks() as demo:
input_text = gr.Textbox(label="Input Text")
output_text = gr.Textbox(label="Output Text")
preprocess_button = gr.Button("Process")
preprocess_button.click(fn=preprocess_and_forward, inputs=[input_text], outputs=[output_text])
if __name__ == "__main__":
demo.launch() |