Creative-Arena-Leaderboard

Sleeping

App Files Files Community

openfree committed on Aug 21, 2025

Commit

c581326

verified ·

1 Parent(s): 6947b37

Update app.py

Browse files

Files changed (1) hide show

app.py +294 -57

app.py CHANGED Viewed

@@ -215,7 +215,8 @@ class ArenaDatabase:
         # Always initialize local database as fallback
         self.init_database()
-        # Sync from HF if available
         if self.use_hf:
             self._sync_from_hf()
@@ -234,46 +235,159 @@ class ArenaDatabase:
             print(f"Dataset repo creation note: {e}")
     def _sync_from_hf(self):
-        """Sync data from Hugging Face to local database"""
         try:
-            # Try to load existing data from HF
-            dataset = load_dataset(self.hf_repo_id, split="train", use_auth_token=self.hf_token)
-            conn = sqlite3.connect(self.db_path)
-            cursor = conn.cursor()
-            # Load battles
-            if "battles" in dataset.column_names:
-                battles_df = dataset.to_pandas()
-                battles_df.to_sql('battles_temp', conn, if_exists='replace', index=False)
-                # Merge with existing battles (avoid duplicates)
-                cursor.execute('''
-                    INSERT OR IGNORE INTO battles
-                    SELECT * FROM battles_temp
-                ''')
-                cursor.execute('DROP TABLE battles_temp')
-            # Load model stats
             try:
-                stats_dataset = load_dataset(self.hf_repo_id, split="stats", use_auth_token=self.hf_token)
                 if stats_dataset and len(stats_dataset) > 0:
                     stats_df = stats_dataset.to_pandas()
-                    stats_df.to_sql('model_stats_temp', conn, if_exists='replace', index=False)
-                    # Update stats with latest from HF
-                    cursor.execute('DELETE FROM model_stats')
-                    cursor.execute('INSERT INTO model_stats SELECT * FROM model_stats_temp')
-                    cursor.execute('DROP TABLE model_stats_temp')
-            except:
-                pass  # Stats split might not exist yet
-            conn.commit()
-            conn.close()
-            print("✅ Synced data from Hugging Face")
         except Exception as e:
-            print(f"Note: Could not sync from HF (might be first run): {e}")
     def _sync_to_hf(self):
         """Sync local database to Hugging Face"""
@@ -293,6 +407,7 @@ class ArenaDatabase:
                     token=self.hf_token,
                     private=True
                 )
             # Export model stats
             stats_df = pd.read_sql_query("SELECT * FROM model_stats", conn)
@@ -304,12 +419,12 @@ class ArenaDatabase:
                     token=self.hf_token,
                     private=True
                 )
             conn.close()
-            print("✅ Synced data to Hugging Face")
         except Exception as e:
-            print(f"Warning: Could not sync to HF: {e}")
     def init_database(self):
         """Initialize SQLite database"""
@@ -589,23 +704,19 @@ class LLMInterface:
                 full_response = self._get_jetxa_response(full_prompt)
                 if full_response:
-                    # Format jetXA response with proper spacing
                     formatted_response = self._format_jetxa_response(full_response)
-                    # Simulate streaming word by word for jetXA for smoother effect
-                    words = formatted_response.split()
                     accumulated = ""
-                    # Stream words in small batches for natural effect
-                    batch_size = 2  # Stream 2 words at a time
-                    for i in range(0, len(words), batch_size):
-                        batch = words[i:i+batch_size]
-                        for word in batch:
-                            if accumulated:
-                                accumulated += " "
-                            accumulated += word
-                        yield accumulated  # Yield accumulated text after each batch
-                        time.sleep(0.03)  # Small delay between batches
                 else:
                     # Use fallback if jetXA fails
                     fallback = self._generate_fallback(model, prompt, language)
@@ -822,7 +933,7 @@ class LLMInterface:
             yield fallback
     def _get_jetxa_response(self, prompt: str) -> str:
-        """Get complete response from jetXA"""
         if not self.gradio_client:
             return ""
@@ -838,37 +949,163 @@ class LLMInterface:
             response_text = ""
             if result and isinstance(result, (tuple, list)) and len(result) >= 1:
                 chat_history = result[0]
                 if isinstance(chat_history, list) and len(chat_history) > 0:
                     for msg in reversed(chat_history):
                         if isinstance(msg, dict):
-                            content = msg.get('content', '')
-                            if content:
                                 response_text = str(content)
                                 break
                         elif isinstance(msg, (list, tuple)) and len(msg) >= 2:
-                            if msg[1]:
                                 response_text = str(msg[1])
                                 break
                 if not response_text:
-                    for i in range(1, min(3, len(result))):
-                        if result[i] and isinstance(result[i], str) and result[i].strip():
-                            response_text = result[i]
-                            break
             if response_text:
-                # Clean up any potential formatting issues
-                response_text = self._clean_markdown_response(response_text)
-            return response_text
         except Exception as e:
             print(f"jetXA response error: {e}")
             return ""
     def _clean_markdown_response(self, text: str) -> str:
         """Clean and fix common markdown formatting issues"""
         # Remove any duplicate markers or broken formatting

         # Always initialize local database as fallback
         self.init_database()
+        # IMPORTANT: Sync from HF AFTER initializing local DB
+        # This will load all historical data
         if self.use_hf:
             self._sync_from_hf()
             print(f"Dataset repo creation note: {e}")
     def _sync_from_hf(self):
+        """Sync data from Hugging Face to local database - PRESERVES existing data"""
         try:
+            print("📥 Loading historical data from Hugging Face...")
+            # Try to load existing battles data
+            try:
+                battles_dataset = load_dataset(
+                    self.hf_repo_id,
+                    split="train",
+                    use_auth_token=self.hf_token
+                )
+                if battles_dataset and len(battles_dataset) > 0:
+                    conn = sqlite3.connect(self.db_path)
+                    cursor = conn.cursor()
+                    # Convert to DataFrame
+                    battles_df = battles_dataset.to_pandas()
+                    print(f"  Found {len(battles_df)} historical battles")
+                    # Insert battles one by one, ignoring duplicates
+                    for _, row in battles_df.iterrows():
+                        try:
+                            cursor.execute('''
+                                INSERT OR IGNORE INTO battles
+                                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                            ''', (
+                                row.get('id'),
+                                row.get('prompt_id'),
+                                row.get('prompt_text'),
+                                row.get('category'),
+                                row.get('model_a'),
+                                row.get('model_b'),
+                                row.get('response_a'),
+                                row.get('response_b'),
+                                row.get('winner'),
+                                row.get('voter_id'),
+                                row.get('timestamp'),
+                                row.get('custom_prompt', 0),
+                                row.get('language', 'en')
+                            ))
+                        except Exception as e:
+                            print(f"    Skipping battle {row.get('id')}: {e}")
+                            continue
+                    conn.commit()
+                    conn.close()
+                    print(f"  ✅ Loaded {len(battles_df)} battles from HF")
+            except Exception as e:
+                print(f"  Note: No battles data found (might be first run): {e}")
+            # Try to load model stats
             try:
+                stats_dataset = load_dataset(
+                    self.hf_repo_id,
+                    split="stats",
+                    use_auth_token=self.hf_token
+                )
                 if stats_dataset and len(stats_dataset) > 0:
+                    conn = sqlite3.connect(self.db_path)
+                    cursor = conn.cursor()
                     stats_df = stats_dataset.to_pandas()
+                    print(f"  Found stats for {len(stats_df)} models")
+                    # Update model stats with the latest from HF
+                    for _, row in stats_df.iterrows():
+                        cursor.execute('''
+                            INSERT OR REPLACE INTO model_stats
+                            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+                        ''', (
+                            row.get('model_name'),
+                            row.get('overall_score', 5.0),
+                            row.get('storytelling_score', 5.0),
+                            row.get('innovation_score', 5.0),
+                            row.get('business_score', 5.0),
+                            row.get('total_battles', 0),
+                            row.get('wins', 0),
+                            row.get('losses', 0),
+                            row.get('elo_rating', 1500)
+                        ))
+                    conn.commit()
+                    conn.close()
+                    print(f"  ✅ Loaded stats for {len(stats_df)} models from HF")
+            except Exception as e:
+                print(f"  Note: No stats data found: {e}")
+            # Recalculate stats based on loaded battles
+            self._recalculate_stats_from_battles()
         except Exception as e:
+            print(f"⚠️ Could not sync from HF (might be first run): {e}")
+    def _recalculate_stats_from_battles(self):
+        """Rebuild the model_stats table by replaying the battle history.
+
+        Every model that appears in the battles table (plus the two default
+        models) is reset to the baseline row, then each battle that has a
+        winner is replayed in timestamp order to update win/loss counts,
+        category scores and ELO ratings.
+
+        Battles whose winner matches neither participant are skipped, so a
+        stray value can never be booked as a loss for model_a.  The SQLite
+        connection is always closed, even when a statement raises.
+        """
+        conn = sqlite3.connect(self.db_path)
+        try:
+            cursor = conn.cursor()
+            # Reset every model that took part in a battle, not only the
+            # defaults; otherwise rows synced from elsewhere would be
+            # incremented on top of their old totals during the replay.
+            cursor.execute('''
+                SELECT model_a FROM battles WHERE model_a IS NOT NULL
+                UNION
+                SELECT model_b FROM battles WHERE model_b IS NOT NULL
+            ''')
+            models = {"GPT-5", "jetXA"} | {row[0] for row in cursor.fetchall()}
+            cursor.executemany('''
+                INSERT OR REPLACE INTO model_stats
+                (model_name, overall_score, storytelling_score, innovation_score,
+                 business_score, total_battles, wins, losses, elo_rating)
+                VALUES (?, 5.0, 5.0, 5.0, 5.0, 0, 0, 0, 1500)
+            ''', [(model,) for model in sorted(models)])
+            # Get all battles with winners, oldest first so ELO replays in order
+            cursor.execute('''
+                SELECT category, model_a, model_b, winner
+                FROM battles
+                WHERE winner IS NOT NULL AND winner != ''
+                ORDER BY timestamp ASC
+            ''')
+            battles = cursor.fetchall()
+            print(f"  Recalculating stats from {len(battles)} completed battles...")
+            for category, model_a, model_b, winner in battles:
+                if winner == model_a:
+                    loser = model_b
+                elif winner == model_b:
+                    loser = model_a
+                else:
+                    # Winner is not one of the two participants
+                    print(f"    Skipping battle with unrecognized winner: {winner}")
+                    continue
+                # Update battle counts
+                cursor.execute('''
+                    UPDATE model_stats
+                    SET total_battles = total_battles + 1, wins = wins + 1
+                    WHERE model_name = ?
+                ''', (winner,))
+                cursor.execute('''
+                    UPDATE model_stats
+                    SET total_battles = total_battles + 1, losses = losses + 1
+                    WHERE model_name = ?
+                ''', (loser,))
+                # Update category scores and ELO; a bad category only skips
+                # the score update for this one battle
+                try:
+                    cat_enum = Category(category)
+                    self._update_category_scores(cursor, winner, cat_enum, True)
+                    self._update_category_scores(cursor, loser, cat_enum, False)
+                    self._update_elo_ratings(cursor, winner, loser)
+                except Exception as e:
+                    print(f"    Error processing battle: {e}")
+                    continue
+            conn.commit()
+        finally:
+            conn.close()
+        print("  ✅ Stats recalculated successfully")
     def _sync_to_hf(self):
         """Sync local database to Hugging Face"""
                     token=self.hf_token,
                     private=True
                 )
+                print(f"  📤 Synced {len(battles_df)} battles to HF")
             # Export model stats
             stats_df = pd.read_sql_query("SELECT * FROM model_stats", conn)
                     token=self.hf_token,
                     private=True
                 )
+                print(f"  📤 Synced stats for {len(stats_df)} models to HF")
             conn.close()
         except Exception as e:
+            print(f"⚠️ Warning: Could not sync to HF: {e}")
     def init_database(self):
         """Initialize SQLite database"""
                 full_response = self._get_jetxa_response(full_prompt)
                 if full_response:
+                    # Format jetXA response AFTER getting it
                     formatted_response = self._format_jetxa_response(full_response)
+                    # Stream the formatted response character by character for smooth effect
+                    # This preserves all formatting including line breaks
                     accumulated = ""
+                    chunk_size = 5  # Characters at a time
+                    for i in range(0, len(formatted_response), chunk_size):
+                        chunk = formatted_response[i:i+chunk_size]
+                        accumulated += chunk
+                        yield accumulated
+                        time.sleep(0.01)  # Small delay for streaming effect
                 else:
                     # Use fallback if jetXA fails
                     fallback = self._generate_fallback(model, prompt, language)
             yield fallback
     def _get_jetxa_response(self, prompt: str) -> str:
+        """Get complete response from jetXA with improved parsing"""
         if not self.gradio_client:
             return ""
             response_text = ""
+            # Debug: Print the result structure
+            print(f"jetXA result type: {type(result)}")
+            if isinstance(result, (tuple, list)):
+                print(f"jetXA result length: {len(result)}")
+                for i, item in enumerate(result[:3]):  # Print first 3 items
+                    print(f"  Item {i} type: {type(item)}")
+                    if isinstance(item, str):
+                        print(f"    String preview: {item[:100]}...")
+                    elif isinstance(item, list) and len(item) > 0:
+                        print(f"    List length: {len(item)}")
             if result and isinstance(result, (tuple, list)) and len(result) >= 1:
+                # Try multiple extraction methods
+                # Method 1: Check if first element is chat history
                 chat_history = result[0]
                 if isinstance(chat_history, list) and len(chat_history) > 0:
+                    # Look for the last assistant message
                     for msg in reversed(chat_history):
                         if isinstance(msg, dict):
+                            # Check for 'content' or 'message' key
+                            content = msg.get('content') or msg.get('message') or msg.get('text', '')
+                            if content and str(content).strip():
                                 response_text = str(content)
+                                print(f"  Found response in dict format")
                                 break
                         elif isinstance(msg, (list, tuple)) and len(msg) >= 2:
+                            # Format: [user_msg, assistant_msg]
+                            if msg[1] and str(msg[1]).strip():
                                 response_text = str(msg[1])
+                                print(f"  Found response in tuple format")
                                 break
+                        elif isinstance(msg, str) and msg.strip():
+                            response_text = msg
+                            print(f"  Found response as string")
+                            break
+                # Method 2: If no response yet, check other indices
                 if not response_text:
+                    for i in range(1, min(4, len(result))):
+                        if result[i]:
+                            if isinstance(result[i], str) and result[i].strip():
+                                response_text = result[i]
+                                print(f"  Found response at index {i}")
+                                break
+                            elif isinstance(result[i], dict):
+                                # Try to extract from dict
+                                for key in ['content', 'message', 'text', 'response']:
+                                    if key in result[i] and result[i][key]:
+                                        response_text = str(result[i][key])
+                                        print(f"  Found response in dict at index {i}")
+                                        break
             if response_text:
+                print(f"  Response length: {len(response_text)} chars")
+                # DO NOT clean or modify the response here - preserve original formatting
+                return response_text
+            else:
+                print(f"  No response text found in result")
+                return ""
         except Exception as e:
             print(f"jetXA response error: {e}")
+            import traceback
+            traceback.print_exc()
             return ""
+    def _format_jetxa_response(self, text: str) -> str:
+        """Normalize spacing in a jetXA markdown response for readability.
+
+        Rules applied line by line:
+        - Fenced code blocks (``` or ~~~) are copied through verbatim, so
+          code indentation and '#' comment lines are never reformatted.
+        - Headers get one blank line before and after.
+        - A list gets one blank line before its first item; item
+          indentation is kept so nested lists stay nested.
+        - A run of blockquote lines gets one blank line before and after,
+          but consecutive '>' lines are kept together.
+        - Table rows (any line containing '|') are kept adjacent.
+        - Plain text that follows a finished sentence starts a new paragraph.
+        - Outside code fences there is never more than one blank line in a row.
+
+        Args:
+            text: Raw response text; may be empty or None.
+
+        Returns:
+            The reformatted text with surrounding whitespace removed, or
+            ``text`` unchanged when it is empty.
+        """
+        if not text:
+            return text
+        list_item = re.compile(r'(?:[-*] |\d+\. )')
+        # '.' already covers the Korean sentence endings ('다.', '니다.', ...)
+        sentence_endings = ('.', '!', '?', ':**')
+        lines = text.split('\n')
+        out = []
+        open_fence = None  # fence marker of the code block we are inside, if any
+        def add_blank():
+            # Append a separator only when the previous line has content, so
+            # blank lines never stack up and never lead the output
+            if out and out[-1].strip():
+                out.append('')
+        for i, line in enumerate(lines):
+            stripped = line.strip()
+            # Inside a fenced code block: copy verbatim until the closing fence
+            if open_fence is not None:
+                out.append(line)
+                if stripped.startswith(open_fence):
+                    open_fence = None
+                continue
+            if stripped.startswith(('```', '~~~')):
+                open_fence = stripped[:3]
+                out.append(line)
+                continue
+            if not stripped:
+                add_blank()
+                continue
+            prev = out[-1].strip() if out else ''
+            if stripped.startswith('#'):
+                # Headers: blank line before and after
+                add_blank()
+                out.append(stripped)
+                out.append('')
+            elif list_item.match(stripped):
+                # Lists: separate the first item from preceding non-list text
+                if prev and not list_item.match(prev):
+                    out.append('')
+                out.append(line.rstrip())
+            elif stripped.startswith('>'):
+                # Blockquotes: space around the quote, not between its lines
+                if prev and not prev.startswith('>'):
+                    out.append('')
+                out.append(stripped)
+                if i < len(lines) - 1 and not lines[i + 1].strip().startswith('>'):
+                    out.append('')
+            elif '|' in stripped:
+                # Tables: rows must stay adjacent to render
+                out.append(stripped)
+            else:
+                # Regular text: new paragraph after a completed sentence
+                if (prev and
+                        not prev.startswith(('#', '-', '*')) and
+                        not re.match(r'\d+\.', prev) and
+                        prev.endswith(sentence_endings)):
+                    out.append('')
+                out.append(stripped)
+        return '\n'.join(out).strip()
     def _clean_markdown_response(self, text: str) -> str:
         """Clean and fix common markdown formatting issues"""
         # Remove any duplicate markers or broken formatting