shwethd committed on
Commit
4e5f1e6
·
verified ·
1 Parent(s): adc8386

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -0
app.py CHANGED
@@ -274,6 +274,23 @@ def generate_text(prompt, max_new_tokens=100, temperature=0.8, top_k=50):
274
 
275
  # Decode
276
  generated_text = enc.decode(tokens[0].tolist())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
  return generated_text
278
  except Exception as e:
279
  import traceback
 
274
 
275
  # Decode
276
  generated_text = enc.decode(tokens[0].tolist())
277
+
278
+ # Post-process to fix spacing issues (common with BPE tokenizers)
279
+ import re
280
+ # Fix 1: lowercase followed by uppercase (e.g., "perpetualWith" -> "perpetual With")
281
+ generated_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', generated_text)
282
+
283
+ # Fix 2: Common word boundaries that got merged (e.g., "perpetualwith" -> "perpetual with")
284
+ # Add space before common words that might have been merged
285
+ common_words = ['with', 'the', 'and', 'that', 'this', 'have', 'from', 'not', 'but', 'for', 'are', 'was', 'were', 'been', 'will', 'shall', 'would', 'could', 'should']
286
+ for word in common_words:
287
+ # Only add space if it's not already separated and follows a lowercase letter
288
+ pattern = r'([a-z])(' + word + r'\b)'
289
+ generated_text = re.sub(pattern, r'\1 \2', generated_text, flags=re.IGNORECASE)
290
+
291
+ # Fix 3: Add space before character names (all caps words)
292
+ generated_text = re.sub(r'([a-z])([A-Z]{2,})', r'\1 \2', generated_text)
293
+
294
  return generated_text
295
  except Exception as e:
296
  import traceback