Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -215,7 +215,8 @@ class ArenaDatabase:
|
|
| 215 |
# Always initialize local database as fallback
|
| 216 |
self.init_database()
|
| 217 |
|
| 218 |
-
# Sync from HF
|
|
|
|
| 219 |
if self.use_hf:
|
| 220 |
self._sync_from_hf()
|
| 221 |
|
|
@@ -234,46 +235,159 @@ class ArenaDatabase:
|
|
| 234 |
print(f"Dataset repo creation note: {e}")
|
| 235 |
|
| 236 |
def _sync_from_hf(self):
|
| 237 |
-
"""Sync data from Hugging Face to local database"""
|
| 238 |
try:
|
| 239 |
-
|
| 240 |
-
dataset = load_dataset(self.hf_repo_id, split="train", use_auth_token=self.hf_token)
|
| 241 |
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
|
| 257 |
-
#
|
| 258 |
try:
|
| 259 |
-
stats_dataset = load_dataset(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
if stats_dataset and len(stats_dataset) > 0:
|
|
|
|
|
|
|
|
|
|
| 261 |
stats_df = stats_dataset.to_pandas()
|
| 262 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
pass # Stats split might not exist yet
|
| 270 |
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
print("✅ Synced data from Hugging Face")
|
| 274 |
|
| 275 |
except Exception as e:
|
| 276 |
-
print(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
|
| 278 |
def _sync_to_hf(self):
|
| 279 |
"""Sync local database to Hugging Face"""
|
|
@@ -293,6 +407,7 @@ class ArenaDatabase:
|
|
| 293 |
token=self.hf_token,
|
| 294 |
private=True
|
| 295 |
)
|
|
|
|
| 296 |
|
| 297 |
# Export model stats
|
| 298 |
stats_df = pd.read_sql_query("SELECT * FROM model_stats", conn)
|
|
@@ -304,12 +419,12 @@ class ArenaDatabase:
|
|
| 304 |
token=self.hf_token,
|
| 305 |
private=True
|
| 306 |
)
|
|
|
|
| 307 |
|
| 308 |
conn.close()
|
| 309 |
-
print("✅ Synced data to Hugging Face")
|
| 310 |
|
| 311 |
except Exception as e:
|
| 312 |
-
print(f"Warning: Could not sync to HF: {e}")
|
| 313 |
|
| 314 |
def init_database(self):
|
| 315 |
"""Initialize SQLite database"""
|
|
@@ -589,23 +704,19 @@ class LLMInterface:
|
|
| 589 |
full_response = self._get_jetxa_response(full_prompt)
|
| 590 |
|
| 591 |
if full_response:
|
| 592 |
-
# Format jetXA response
|
| 593 |
formatted_response = self._format_jetxa_response(full_response)
|
| 594 |
|
| 595 |
-
#
|
| 596 |
-
|
| 597 |
accumulated = ""
|
|
|
|
| 598 |
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
if accumulated:
|
| 605 |
-
accumulated += " "
|
| 606 |
-
accumulated += word
|
| 607 |
-
yield accumulated # Yield accumulated text after each batch
|
| 608 |
-
time.sleep(0.03) # Small delay between batches
|
| 609 |
else:
|
| 610 |
# Use fallback if jetXA fails
|
| 611 |
fallback = self._generate_fallback(model, prompt, language)
|
|
@@ -822,7 +933,7 @@ class LLMInterface:
|
|
| 822 |
yield fallback
|
| 823 |
|
| 824 |
def _get_jetxa_response(self, prompt: str) -> str:
|
| 825 |
-
"""Get complete response from jetXA"""
|
| 826 |
if not self.gradio_client:
|
| 827 |
return ""
|
| 828 |
|
|
@@ -838,37 +949,163 @@ class LLMInterface:
|
|
| 838 |
|
| 839 |
response_text = ""
|
| 840 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 841 |
if result and isinstance(result, (tuple, list)) and len(result) >= 1:
|
|
|
|
|
|
|
|
|
|
| 842 |
chat_history = result[0]
|
| 843 |
|
| 844 |
if isinstance(chat_history, list) and len(chat_history) > 0:
|
|
|
|
| 845 |
for msg in reversed(chat_history):
|
| 846 |
if isinstance(msg, dict):
|
| 847 |
-
|
| 848 |
-
|
|
|
|
| 849 |
response_text = str(content)
|
|
|
|
| 850 |
break
|
| 851 |
elif isinstance(msg, (list, tuple)) and len(msg) >= 2:
|
| 852 |
-
|
|
|
|
| 853 |
response_text = str(msg[1])
|
|
|
|
| 854 |
break
|
|
|
|
|
|
|
|
|
|
|
|
|
| 855 |
|
|
|
|
| 856 |
if not response_text:
|
| 857 |
-
for i in range(1, min(
|
| 858 |
-
if result[i]
|
| 859 |
-
|
| 860 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 861 |
|
| 862 |
if response_text:
|
| 863 |
-
|
| 864 |
-
|
| 865 |
-
|
| 866 |
-
|
|
|
|
|
|
|
| 867 |
|
| 868 |
except Exception as e:
|
| 869 |
print(f"jetXA response error: {e}")
|
|
|
|
|
|
|
| 870 |
return ""
|
| 871 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 872 |
def _clean_markdown_response(self, text: str) -> str:
|
| 873 |
"""Clean and fix common markdown formatting issues"""
|
| 874 |
# Remove any duplicate markers or broken formatting
|
|
|
|
| 215 |
# Always initialize local database as fallback
|
| 216 |
self.init_database()
|
| 217 |
|
| 218 |
+
# IMPORTANT: Sync from HF AFTER initializing local DB
|
| 219 |
+
# This will load all historical data
|
| 220 |
if self.use_hf:
|
| 221 |
self._sync_from_hf()
|
| 222 |
|
|
|
|
| 235 |
print(f"Dataset repo creation note: {e}")
|
| 236 |
|
| 237 |
def _sync_from_hf(self):
    """Sync data from Hugging Face to local database - PRESERVES existing data.

    Two splits of the dataset ``self.hf_repo_id`` are read, each inside its
    own ``try`` so that a failure in one does not stop the other:

    * ``train``: every row is inserted into ``battles`` with
      ``INSERT OR IGNORE``; a row whose insert raises is reported and skipped.
    * ``stats``: every row is written to ``model_stats`` with
      ``INSERT OR REPLACE``.

    Afterwards ``self._recalculate_stats_from_battles()`` is called.
    Exceptions are caught and reported with ``print`` instead of being
    raised to the caller.
    """
    try:
        print("📥 Loading historical data from Hugging Face...")

        # Try to load existing battles data
        try:
            # NOTE(review): recent `datasets` releases deprecate
            # `use_auth_token` in favour of `token` - confirm against the
            # version this Space pins before changing it.
            battles_dataset = load_dataset(
                self.hf_repo_id,
                split="train",
                use_auth_token=self.hf_token
            )

            if battles_dataset and len(battles_dataset) > 0:
                conn = sqlite3.connect(self.db_path)
                try:
                    cursor = conn.cursor()

                    # Convert to DataFrame
                    battles_df = battles_dataset.to_pandas()
                    print(f" Found {len(battles_df)} historical battles")

                    # Insert battles one by one, ignoring duplicates.
                    # Values are bound positionally, so this order has to
                    # match the column order of the `battles` table (table
                    # definition not visible here - verify in init_database).
                    for _, row in battles_df.iterrows():
                        try:
                            cursor.execute('''
                                INSERT OR IGNORE INTO battles
                                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                            ''', (
                                row.get('id'),
                                row.get('prompt_id'),
                                row.get('prompt_text'),
                                row.get('category'),
                                row.get('model_a'),
                                row.get('model_b'),
                                row.get('response_a'),
                                row.get('response_b'),
                                row.get('winner'),
                                row.get('voter_id'),
                                row.get('timestamp'),
                                row.get('custom_prompt', 0),
                                row.get('language', 'en')
                            ))
                        except Exception as e:
                            print(f" Skipping battle {row.get('id')}: {e}")
                            continue

                    conn.commit()
                finally:
                    # Close even when conversion or an insert fails, so the
                    # database handle is never leaked.
                    conn.close()
                print(f" ✅ Loaded {len(battles_df)} battles from HF")
        except Exception as e:
            print(f" Note: No battles data found (might be first run): {e}")

        # Try to load model stats
        try:
            stats_dataset = load_dataset(
                self.hf_repo_id,
                split="stats",
                use_auth_token=self.hf_token
            )

            if stats_dataset and len(stats_dataset) > 0:
                conn = sqlite3.connect(self.db_path)
                try:
                    cursor = conn.cursor()

                    stats_df = stats_dataset.to_pandas()
                    print(f" Found stats for {len(stats_df)} models")

                    # Update model stats with the latest from HF.
                    # There is no per-row guard here: one failing row aborts
                    # the whole stats import and nothing is committed.
                    for _, row in stats_df.iterrows():
                        cursor.execute('''
                            INSERT OR REPLACE INTO model_stats
                            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                        ''', (
                            row.get('model_name'),
                            row.get('overall_score', 5.0),
                            row.get('storytelling_score', 5.0),
                            row.get('innovation_score', 5.0),
                            row.get('business_score', 5.0),
                            row.get('total_battles', 0),
                            row.get('wins', 0),
                            row.get('losses', 0),
                            row.get('elo_rating', 1500)
                        ))

                    conn.commit()
                finally:
                    conn.close()
                print(f" ✅ Loaded stats for {len(stats_df)} models from HF")
        except Exception as e:
            print(f" Note: No stats data found: {e}")

        # Recalculate stats based on loaded battles
        self._recalculate_stats_from_battles()

    except Exception as e:
        print(f"⚠️ Could not sync from HF (might be first run): {e}")
|
| 332 |
+
|
| 333 |
+
def _recalculate_stats_from_battles(self):
|
| 334 |
+
"""Recalculate model stats from battle history"""
|
| 335 |
+
conn = sqlite3.connect(self.db_path)
|
| 336 |
+
cursor = conn.cursor()
|
| 337 |
+
|
| 338 |
+
# Reset stats for GPT-5 and jetXA
|
| 339 |
+
models = ["GPT-5", "jetXA"]
|
| 340 |
+
for model in models:
|
| 341 |
+
cursor.execute('''
|
| 342 |
+
INSERT OR REPLACE INTO model_stats
|
| 343 |
+
(model_name, overall_score, storytelling_score, innovation_score,
|
| 344 |
+
business_score, total_battles, wins, losses, elo_rating)
|
| 345 |
+
VALUES (?, 5.0, 5.0, 5.0, 5.0, 0, 0, 0, 1500)
|
| 346 |
+
''', (model,))
|
| 347 |
+
|
| 348 |
+
# Get all battles with winners
|
| 349 |
+
cursor.execute('''
|
| 350 |
+
SELECT category, model_a, model_b, winner
|
| 351 |
+
FROM battles
|
| 352 |
+
WHERE winner IS NOT NULL AND winner != ''
|
| 353 |
+
ORDER BY timestamp ASC
|
| 354 |
+
''')
|
| 355 |
+
|
| 356 |
+
battles = cursor.fetchall()
|
| 357 |
+
print(f" Recalculating stats from {len(battles)} completed battles...")
|
| 358 |
+
|
| 359 |
+
# Process each battle
|
| 360 |
+
for category, model_a, model_b, winner in battles:
|
| 361 |
+
loser = model_b if winner == model_a else model_a
|
| 362 |
+
|
| 363 |
+
# Update battle counts
|
| 364 |
+
cursor.execute('''
|
| 365 |
+
UPDATE model_stats
|
| 366 |
+
SET total_battles = total_battles + 1, wins = wins + 1
|
| 367 |
+
WHERE model_name = ?
|
| 368 |
+
''', (winner,))
|
| 369 |
+
|
| 370 |
+
cursor.execute('''
|
| 371 |
+
UPDATE model_stats
|
| 372 |
+
SET total_battles = total_battles + 1, losses = losses + 1
|
| 373 |
+
WHERE model_name = ?
|
| 374 |
+
''', (loser,))
|
| 375 |
+
|
| 376 |
+
# Update category scores
|
| 377 |
+
try:
|
| 378 |
+
cat_enum = Category(category)
|
| 379 |
+
self._update_category_scores(cursor, winner, cat_enum, True)
|
| 380 |
+
self._update_category_scores(cursor, loser, cat_enum, False)
|
| 381 |
+
|
| 382 |
+
# Update ELO
|
| 383 |
+
self._update_elo_ratings(cursor, winner, loser)
|
| 384 |
+
except Exception as e:
|
| 385 |
+
print(f" Error processing battle: {e}")
|
| 386 |
+
continue
|
| 387 |
+
|
| 388 |
+
conn.commit()
|
| 389 |
+
conn.close()
|
| 390 |
+
print(f" ✅ Stats recalculated successfully")
|
| 391 |
|
| 392 |
def _sync_to_hf(self):
|
| 393 |
"""Sync local database to Hugging Face"""
|
|
|
|
| 407 |
token=self.hf_token,
|
| 408 |
private=True
|
| 409 |
)
|
| 410 |
+
print(f" 📤 Synced {len(battles_df)} battles to HF")
|
| 411 |
|
| 412 |
# Export model stats
|
| 413 |
stats_df = pd.read_sql_query("SELECT * FROM model_stats", conn)
|
|
|
|
| 419 |
token=self.hf_token,
|
| 420 |
private=True
|
| 421 |
)
|
| 422 |
+
print(f" 📤 Synced stats for {len(stats_df)} models to HF")
|
| 423 |
|
| 424 |
conn.close()
|
|
|
|
| 425 |
|
| 426 |
except Exception as e:
|
| 427 |
+
print(f"⚠️ Warning: Could not sync to HF: {e}")
|
| 428 |
|
| 429 |
def init_database(self):
|
| 430 |
"""Initialize SQLite database"""
|
|
|
|
| 704 |
full_response = self._get_jetxa_response(full_prompt)
|
| 705 |
|
| 706 |
if full_response:
|
| 707 |
+
# Format jetXA response AFTER getting it
|
| 708 |
formatted_response = self._format_jetxa_response(full_response)
|
| 709 |
|
| 710 |
+
# Stream the formatted response character by character for smooth effect
|
| 711 |
+
# This preserves all formatting including line breaks
|
| 712 |
accumulated = ""
|
| 713 |
+
chunk_size = 5 # Characters at a time
|
| 714 |
|
| 715 |
+
for i in range(0, len(formatted_response), chunk_size):
|
| 716 |
+
chunk = formatted_response[i:i+chunk_size]
|
| 717 |
+
accumulated += chunk
|
| 718 |
+
yield accumulated
|
| 719 |
+
time.sleep(0.01) # Small delay for streaming effect
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 720 |
else:
|
| 721 |
# Use fallback if jetXA fails
|
| 722 |
fallback = self._generate_fallback(model, prompt, language)
|
|
|
|
| 933 |
yield fallback
|
| 934 |
|
| 935 |
def _get_jetxa_response(self, prompt: str) -> str:
|
| 936 |
+
"""Get complete response from jetXA with improved parsing"""
|
| 937 |
if not self.gradio_client:
|
| 938 |
return ""
|
| 939 |
|
|
|
|
| 949 |
|
| 950 |
response_text = ""
|
| 951 |
|
| 952 |
+
# Debug: Print the result structure
|
| 953 |
+
print(f"jetXA result type: {type(result)}")
|
| 954 |
+
if isinstance(result, (tuple, list)):
|
| 955 |
+
print(f"jetXA result length: {len(result)}")
|
| 956 |
+
for i, item in enumerate(result[:3]): # Print first 3 items
|
| 957 |
+
print(f" Item {i} type: {type(item)}")
|
| 958 |
+
if isinstance(item, str):
|
| 959 |
+
print(f" String preview: {item[:100]}...")
|
| 960 |
+
elif isinstance(item, list) and len(item) > 0:
|
| 961 |
+
print(f" List length: {len(item)}")
|
| 962 |
+
|
| 963 |
if result and isinstance(result, (tuple, list)) and len(result) >= 1:
|
| 964 |
+
# Try multiple extraction methods
|
| 965 |
+
|
| 966 |
+
# Method 1: Check if first element is chat history
|
| 967 |
chat_history = result[0]
|
| 968 |
|
| 969 |
if isinstance(chat_history, list) and len(chat_history) > 0:
|
| 970 |
+
# Look for the last assistant message
|
| 971 |
for msg in reversed(chat_history):
|
| 972 |
if isinstance(msg, dict):
|
| 973 |
+
# Check for 'content' or 'message' key
|
| 974 |
+
content = msg.get('content') or msg.get('message') or msg.get('text', '')
|
| 975 |
+
if content and str(content).strip():
|
| 976 |
response_text = str(content)
|
| 977 |
+
print(f" Found response in dict format")
|
| 978 |
break
|
| 979 |
elif isinstance(msg, (list, tuple)) and len(msg) >= 2:
|
| 980 |
+
# Format: [user_msg, assistant_msg]
|
| 981 |
+
if msg[1] and str(msg[1]).strip():
|
| 982 |
response_text = str(msg[1])
|
| 983 |
+
print(f" Found response in tuple format")
|
| 984 |
break
|
| 985 |
+
elif isinstance(msg, str) and msg.strip():
|
| 986 |
+
response_text = msg
|
| 987 |
+
print(f" Found response as string")
|
| 988 |
+
break
|
| 989 |
|
| 990 |
+
# Method 2: If no response yet, check other indices
|
| 991 |
if not response_text:
|
| 992 |
+
for i in range(1, min(4, len(result))):
|
| 993 |
+
if result[i]:
|
| 994 |
+
if isinstance(result[i], str) and result[i].strip():
|
| 995 |
+
response_text = result[i]
|
| 996 |
+
print(f" Found response at index {i}")
|
| 997 |
+
break
|
| 998 |
+
elif isinstance(result[i], dict):
|
| 999 |
+
# Try to extract from dict
|
| 1000 |
+
for key in ['content', 'message', 'text', 'response']:
|
| 1001 |
+
if key in result[i] and result[i][key]:
|
| 1002 |
+
response_text = str(result[i][key])
|
| 1003 |
+
print(f" Found response in dict at index {i}")
|
| 1004 |
+
break
|
| 1005 |
|
| 1006 |
if response_text:
|
| 1007 |
+
print(f" Response length: {len(response_text)} chars")
|
| 1008 |
+
# DO NOT clean or modify the response here - preserve original formatting
|
| 1009 |
+
return response_text
|
| 1010 |
+
else:
|
| 1011 |
+
print(f" No response text found in result")
|
| 1012 |
+
return ""
|
| 1013 |
|
| 1014 |
except Exception as e:
|
| 1015 |
print(f"jetXA response error: {e}")
|
| 1016 |
+
import traceback
|
| 1017 |
+
traceback.print_exc()
|
| 1018 |
return ""
|
| 1019 |
|
| 1020 |
+
def _format_jetxa_response(self, text: str) -> str:
|
| 1021 |
+
"""Format jetXA response with proper spacing and line breaks for better readability"""
|
| 1022 |
+
if not text:
|
| 1023 |
+
return text
|
| 1024 |
+
|
| 1025 |
+
# First, preserve the original line breaks and structure
|
| 1026 |
+
# Do NOT strip or clean the text aggressively
|
| 1027 |
+
|
| 1028 |
+
# Split into lines preserving empty lines
|
| 1029 |
+
lines = text.split('\n')
|
| 1030 |
+
formatted_lines = []
|
| 1031 |
+
|
| 1032 |
+
for i, line in enumerate(lines):
|
| 1033 |
+
# Don't strip lines completely - preserve indentation
|
| 1034 |
+
line_stripped = line.strip()
|
| 1035 |
+
|
| 1036 |
+
# Keep empty lines
|
| 1037 |
+
if not line_stripped:
|
| 1038 |
+
formatted_lines.append('')
|
| 1039 |
+
continue
|
| 1040 |
+
|
| 1041 |
+
# Headers - add spacing
|
| 1042 |
+
if line_stripped.startswith('#'):
|
| 1043 |
+
# Add spacing before headers (except first line)
|
| 1044 |
+
if i > 0 and formatted_lines and formatted_lines[-1].strip():
|
| 1045 |
+
formatted_lines.append('')
|
| 1046 |
+
formatted_lines.append(line_stripped)
|
| 1047 |
+
# Add spacing after headers
|
| 1048 |
+
formatted_lines.append('')
|
| 1049 |
+
|
| 1050 |
+
# Lists - preserve formatting
|
| 1051 |
+
elif (line_stripped.startswith('- ') or
|
| 1052 |
+
line_stripped.startswith('* ') or
|
| 1053 |
+
re.match(r'^\d+\. ', line_stripped)):
|
| 1054 |
+
# Add space before first list item if needed
|
| 1055 |
+
if (i > 0 and formatted_lines and
|
| 1056 |
+
formatted_lines[-1].strip() and
|
| 1057 |
+
not re.match(r'^[-*]|\d+\.', formatted_lines[-1].strip())):
|
| 1058 |
+
formatted_lines.append('')
|
| 1059 |
+
formatted_lines.append(line_stripped)
|
| 1060 |
+
|
| 1061 |
+
# Blockquotes
|
| 1062 |
+
elif line_stripped.startswith('>'):
|
| 1063 |
+
# Add spacing around blockquotes
|
| 1064 |
+
if i > 0 and formatted_lines and formatted_lines[-1].strip():
|
| 1065 |
+
formatted_lines.append('')
|
| 1066 |
+
formatted_lines.append(line_stripped)
|
| 1067 |
+
if i < len(lines) - 1 and not lines[i + 1].strip().startswith('>'):
|
| 1068 |
+
formatted_lines.append('')
|
| 1069 |
+
|
| 1070 |
+
# Tables
|
| 1071 |
+
elif '|' in line_stripped:
|
| 1072 |
+
formatted_lines.append(line_stripped)
|
| 1073 |
+
|
| 1074 |
+
# Regular text
|
| 1075 |
+
else:
|
| 1076 |
+
# Check if this line ends a sentence
|
| 1077 |
+
prev_line = formatted_lines[-1] if formatted_lines else ''
|
| 1078 |
+
|
| 1079 |
+
# Add paragraph break after complete sentences
|
| 1080 |
+
if (prev_line and
|
| 1081 |
+
not prev_line.startswith('#') and
|
| 1082 |
+
not prev_line.startswith('-') and
|
| 1083 |
+
not prev_line.startswith('*') and
|
| 1084 |
+
not re.match(r'^\d+\.', prev_line) and
|
| 1085 |
+
(prev_line.endswith('.') or
|
| 1086 |
+
prev_line.endswith('!') or
|
| 1087 |
+
prev_line.endswith('?') or
|
| 1088 |
+
prev_line.endswith(':**') or
|
| 1089 |
+
prev_line.endswith('다.') or
|
| 1090 |
+
prev_line.endswith('요.') or
|
| 1091 |
+
prev_line.endswith('니다.') or
|
| 1092 |
+
prev_line.endswith('습니다.'))):
|
| 1093 |
+
# This is a new paragraph
|
| 1094 |
+
formatted_lines.append('')
|
| 1095 |
+
|
| 1096 |
+
formatted_lines.append(line_stripped)
|
| 1097 |
+
|
| 1098 |
+
# Join lines
|
| 1099 |
+
result = '\n'.join(formatted_lines)
|
| 1100 |
+
|
| 1101 |
+
# Clean up excessive blank lines (max 2)
|
| 1102 |
+
while '\n\n\n\n' in result:
|
| 1103 |
+
result = result.replace('\n\n\n\n', '\n\n')
|
| 1104 |
+
while '\n\n\n' in result:
|
| 1105 |
+
result = result.replace('\n\n\n', '\n\n')
|
| 1106 |
+
|
| 1107 |
+
return result.strip()
|
| 1108 |
+
|
| 1109 |
def _clean_markdown_response(self, text: str) -> str:
|
| 1110 |
"""Clean and fix common markdown formatting issues"""
|
| 1111 |
# Remove any duplicate markers or broken formatting
|