Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -27,53 +27,31 @@ def preprocess_text(text: str, is_spell_corrected: bool = False):
|
|
| 27 |
"""Process text and return corrections with position information."""
|
| 28 |
result = {
|
| 29 |
"spell_suggestions": [],
|
| 30 |
-
"other_suggestions": [], # For
|
| 31 |
"entities": [],
|
| 32 |
"tags": []
|
| 33 |
}
|
| 34 |
|
| 35 |
-
# Apply capitalization preprocessing
|
| 36 |
capitalized_text = preprocess_capitalization(text)
|
| 37 |
-
if capitalized_text != text:
|
| 38 |
result["spell_suggestions"].append({
|
| 39 |
"original": text,
|
| 40 |
"corrected": capitalized_text
|
| 41 |
})
|
| 42 |
text = capitalized_text # Update text for further processing
|
| 43 |
|
| 44 |
-
# Transformer spell check
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
text = spell_checked # Update text after spell correction
|
| 53 |
-
|
| 54 |
-
# Add NLP-based "other" suggestions using spaCy
|
| 55 |
-
doc = nlp(text)
|
| 56 |
-
for token in doc:
|
| 57 |
-
# Example: Suggest adding an article before a noun if missing
|
| 58 |
-
if token.pos_ == "NOUN" and token.dep_ != "compound" and token.i > 0:
|
| 59 |
-
prev_token = doc[token.i - 1]
|
| 60 |
-
if prev_token.pos_ not in ("DET", "PRON") and not prev_token.text.endswith("'s"):
|
| 61 |
-
suggested = f"{text[:token.idx]}the {text[token.idx:]}"
|
| 62 |
-
result["other_suggestions"].append({
|
| 63 |
-
"original": text,
|
| 64 |
-
"corrected": suggested
|
| 65 |
-
})
|
| 66 |
-
# Example: Suggest "is" for subject-verb agreement (rudimentary)
|
| 67 |
-
elif token.pos_ == "NOUN" and token.dep_ == "nsubj" and token.i + 1 < len(doc):
|
| 68 |
-
next_token = doc[token.i + 1]
|
| 69 |
-
if next_token.pos_ != "VERB":
|
| 70 |
-
suggested = f"{text[:next_token.idx]}is {text[next_token.idx:]}"
|
| 71 |
-
result["other_suggestions"].append({
|
| 72 |
-
"original": text,
|
| 73 |
-
"corrected": suggested
|
| 74 |
-
})
|
| 75 |
|
| 76 |
# Add entities and tags
|
|
|
|
| 77 |
result["entities"] = [{"text": ent.text, "label": ent.label_} for ent in doc.ents]
|
| 78 |
result["tags"] = [token.text for token in doc if token.text.startswith(('#', '@'))]
|
| 79 |
|
|
|
|
| 27 |
"""Process text and return corrections with position information."""
|
| 28 |
result = {
|
| 29 |
"spell_suggestions": [],
|
| 30 |
+
"other_suggestions": [], # For spell_checker suggestions
|
| 31 |
"entities": [],
|
| 32 |
"tags": []
|
| 33 |
}
|
| 34 |
|
| 35 |
+
# Apply capitalization preprocessing (spell suggestions)
|
| 36 |
capitalized_text = preprocess_capitalization(text)
|
| 37 |
+
if capitalized_text != text and not is_spell_corrected:
|
| 38 |
result["spell_suggestions"].append({
|
| 39 |
"original": text,
|
| 40 |
"corrected": capitalized_text
|
| 41 |
})
|
| 42 |
text = capitalized_text # Update text for further processing
|
| 43 |
|
| 44 |
+
# Transformer spell check (other suggestions)
|
| 45 |
+
spell_checked = spell_checker(text, max_length=512)[0]['generated_text']
|
| 46 |
+
if spell_checked != text:
|
| 47 |
+
result["other_suggestions"].append({
|
| 48 |
+
"original": text,
|
| 49 |
+
"corrected": spell_checked
|
| 50 |
+
})
|
| 51 |
+
text = spell_checked # Update text after spell correction
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
# Add entities and tags
|
| 54 |
+
doc = nlp(text)
|
| 55 |
result["entities"] = [{"text": ent.text, "label": ent.label_} for ent in doc.ents]
|
| 56 |
result["tags"] = [token.text for token in doc if token.text.startswith(('#', '@'))]
|
| 57 |
|