Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -274,6 +274,23 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.8, top_k=50):
|
|
| 274 |
|
| 275 |
# Decode
|
| 276 |
generated_text = enc.decode(tokens[0].tolist())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
return generated_text
|
| 278 |
except Exception as e:
|
| 279 |
import traceback
|
|
|
|
| 274 |
|
| 275 |
# Decode
|
| 276 |
generated_text = enc.decode(tokens[0].tolist())
|
| 277 |
+
|
| 278 |
+
# Post-process to fix spacing issues (common with BPE tokenizers)
|
| 279 |
+
import re
|
| 280 |
+
# Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With")
|
| 281 |
+
generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
|
| 282 |
+
|
| 283 |
+
# Fix 2: Common word boundaries that got merged (e.g., "perpetualwith" -> "perpetual with")
|
| 284 |
+
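# Add a space before common words that might have been merged. NOTE(review): this heuristic also splits genuine words that end in one of these words (e.g. "husband" -> "husb and", "breathe" -> "brea the").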
|
| 285 |
+
common_words = ['with', 'the', 'and', 'that', 'this', 'have', 'from', 'not', 'but', 'for', 'are', 'was', 'were', 'been', 'will', 'shall', 'would', 'could', 'should']
|
| 286 |
+
for word in common_words:
|
| 287 |
+
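# Only add a space when the word directly follows a letter (i.e. is not already separated); because re.IGNORECASE is passed below, [a-z] matches uppercase letters as well as lowercase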
|
| 288 |
+
pattern = r'([a-z])(' + word + r'\b)'
|
| 289 |
+
generated_text = re.sub(pattern, r'\1 \2', generated_text, flags=re.IGNORECASE)
|
| 290 |
+
|
| 291 |
+
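# Fix 3: Add space before character names (all caps words). NOTE(review): Fix 1 has already separated every lowercase->uppercase pair, so this substitution can no longer match anything.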
|
| 292 |
+
generated_text = re.sub(r'([a-z])([A-Z]{2,})', r'\1 \2', generated_text)
|
| 293 |
+
|
| 294 |
return generated_text
|
| 295 |
except Exception as e:
|
| 296 |
import traceback
|