File size: 25,177 Bytes

c28358e

#!/usr/bin/env python3
"""
Comprehensive GPT-OSS-120B Demonstration with Output Saving
"""

from mlx_lm import load, generate
import logging
import re
import time
import json
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import plotly.graph_objects as go
import plotly.express as px
from collections import Counter
import numpy as np
from typing import List, Dict
import textwrap
import os
from datetime import datetime

# Configure process-wide logging once at import time: timestamp, level, message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-scoped logger used by the methods of GPTOSSDemo below.
logger = logging.getLogger(__name__)

class GPTOSSDemo:
    def __init__(self):
        """Load the model and tokenizer, then create a timestamped output directory."""
        logger.info("🚀 Loading GPT-OSS-120B...")
        loaded = load("mlx-community/gpt-oss-120b-MXFP4-Q4")
        self.model, self.tokenizer = loaded
        logger.info("✅ Model loaded successfully!")

        # Lecture inputs start empty; load_data() fills them in.
        self.transcript = ""
        self.timestamps = {}
        self.timestamps_2 = {}

        # Every run writes into its own directory named after the start time.
        run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        self.output_dir = f"gpt_oss_output_{run_stamp}"
        os.makedirs(self.output_dir, exist_ok=True)
        logger.info(f"📁 Output directory created: {self.output_dir}")
    
    def save_output(self, content: str, filename: str):
        """Write ``content`` as UTF-8 text to ``filename`` inside the output directory.

        Returns the path of the file that was written.
        """
        target = os.path.join(self.output_dir, filename)
        with open(target, 'w', encoding='utf-8') as handle:
            handle.write(content)
        logger.info(f"💾 Saved output to: {target}")
        return target
    
    def save_plotly_html(self, fig, filename: str):
        """Write ``fig`` to ``filename`` in the output directory via ``fig.write_html``.

        Returns the path of the file that was written.
        """
        target = os.path.join(self.output_dir, filename)
        fig.write_html(target)
        logger.info(f"📊 Saved Plotly visualization to: {target}")
        return target
    
    def save_matplotlib_figure(self, fig, filename: str):
        """Save ``fig`` to ``filename`` in the output directory via ``fig.savefig``.

        The image is written with a tight bounding box at 300 dpi.
        Returns the path of the file that was written.
        """
        target = os.path.join(self.output_dir, filename)
        save_options = {'bbox_inches': 'tight', 'dpi': 300}
        fig.savefig(target, **save_options)
        logger.info(f"📈 Saved matplotlib figure to: {target}")
        return target
    
    def load_data(self, transcript_path: str, timestamps_path: str = None, timestamps_2_path: str = None):
        """Read the transcript and the optional timestamp JSON files.

        Each input that is loaded is also copied into the output directory
        (``original_transcript.txt``, ``timestamps_1.json``, ``timestamps_2.json``).
        A timestamp path that is ``None`` or empty is skipped.

        Raises:
            Any exception raised while reading, parsing or saving; it is
            logged first and then re-raised unchanged.
        """
        def read_json(path):
            # Parse one UTF-8 JSON file and hand back the decoded object.
            with open(path, 'r', encoding='utf-8') as handle:
                return json.load(handle)

        try:
            with open(transcript_path, 'r', encoding='utf-8') as handle:
                self.transcript = handle.read()
            logger.info(f"✅ Loaded transcript: {len(self.transcript)} characters")

            # Keep a copy of the raw transcript next to the generated outputs.
            self.save_output(self.transcript, "original_transcript.txt")

            if timestamps_path:
                self.timestamps = read_json(timestamps_path)
                logger.info("✅ Loaded timestamps data")
                self.save_output(json.dumps(self.timestamps, indent=2), "timestamps_1.json")

            if timestamps_2_path:
                self.timestamps_2 = read_json(timestamps_2_path)
                logger.info("✅ Loaded timestamps_2 data")
                self.save_output(json.dumps(self.timestamps_2, indent=2), "timestamps_2.json")

        except Exception as exc:
            logger.error(f"❌ Error loading data: {exc}")
            raise
    
    def extract_final_response(self, response: str) -> str:
        """Extract the final assistant response from the chat template"""
        if "<|start|>assistant" in response:
            parts = response.split("<|start|>assistant")
            if len(parts) > 1:
                final_part = parts[-1]
                final_part = re.sub(r'<\|channel\|>[^<]+', '', final_part)
                final_part = final_part.replace('<|message|>', '')
                final_part = final_part.replace('<|end|>', '')
                final_part = re.sub(r'<[^>]+>', '', final_part)
                final_part = final_part.strip()
                if final_part:
                    return final_part
        
        cleaned = re.sub(r'<\|[^>]+\|>', '', response)
        cleaned = re.sub(r'<[^>]+>', '', cleaned)
        return cleaned.strip()
    
    def generate_response(self, prompt: str, max_tokens: int = 2048) -> str:
        """Run one single-turn generation and return the cleaned answer.

        The prompt is wrapped as a lone user message, passed through the
        tokenizer's chat template, generated with at most ``max_tokens``
        tokens, and cleaned by ``extract_final_response``.

        Any exception is logged and turned into an
        ``"I encountered an error: ..."`` string instead of being raised.
        """
        try:
            chat = [{"role": "user", "content": prompt}]
            model_input = self.tokenizer.apply_chat_template(
                chat, add_generation_prompt=True
            )

            generation_kwargs = {
                "prompt": model_input,
                "max_tokens": max_tokens,
                "verbose": False,
            }
            raw_output = generate(self.model, self.tokenizer, **generation_kwargs)

            return self.extract_final_response(raw_output)

        except Exception as exc:
            logger.error(f"Generation error: {exc}")
            return f"I encountered an error: {str(exc)}"
    
    def generate_tshirt_prompts(self):
        """Ask the model for t-shirt design prompts, print them and save them.

        Two generations are made: three detailed Flux1-Krea-dev prompts built
        from the first 3000 transcript characters, then three minimalist
        concepts built from the first 2000. Each result is printed and written
        to its own text file in the output directory.
        """
        heavy_rule = "=" * 80
        print("\n" + heavy_rule)
        print("👕 FLUX1-KREA-DEV T-SHIRT PROMPTS")
        print(heavy_rule)

        detailed_request = f"""Create 3 graphic t-shirt design prompts for Flux1-Krea-dev based on Yuval Noah Harari's lecture 
        "Storytelling, Human Cooperation, and the Rise of AI" in London on June 11, 2025.
        
        Each prompt should:
        1. Include 1-2 powerful words that capture the essence of the lecture
        2. Describe a visually striking design that represents the themes
        3. Incorporate elements related to storytelling, AI, and human cooperation
        4. Be suitable for printing on a t-shirt
        
        Lecture themes: {self.transcript[:3000]}
        
        Create 3 distinct prompts:
        
        PROMPT 1:
        Words: 
        Design: 
        
        PROMPT 2:
        Words: 
        Design: 
        
        PROMPT 3:
        Words: 
        Design: """

        detailed_text = self.generate_response(detailed_request, max_tokens=1024)
        print(detailed_text)
        self.save_output(detailed_text, "flux1_krea_dev_tshirt_prompts.txt")

        # Second pass: short, word-focused concepts.
        light_rule = "-" * 40
        print("\n" + light_rule)
        print("🎨 MINIMALIST T-SHIRT DESIGNS")
        print(light_rule)

        minimalist_request = f"""Create 3 minimalist t-shirt design concepts based on Yuval Noah Harari's lecture.
        Each should feature only 1-2 words that perfectly capture the essence of the lecture.
        
        Lecture themes: {self.transcript[:2000]}
        
        Design 1: [Word(s)] - [Brief explanation]
        Design 2: [Word(s)] - [Brief explanation]  
        Design 3: [Word(s)] - [Brief explanation]"""

        minimalist_text = self.generate_response(minimalist_request, max_tokens=512)
        print(minimalist_text)
        self.save_output(minimalist_text, "minimalist_tshirt_designs.txt")
    
    def generate_summaries(self):
        """Generate, print and save summaries at several target word counts.

        For each target length the model is prompted with the first 6000
        transcript characters. The reply is passed through three regex
        substitutions meant to remove drafting / word-counting chatter, printed
        wrapped to 70 columns, and saved as ``summary_<length>_words.txt``.
        All summaries are also written together to ``all_summaries.txt``.
        """
        rule = "=" * 80
        print("\n" + rule)
        print("📝 MULTI-LENGTH SUMMARIES")
        print(rule)

        # (pattern, flags) pairs applied in order to each raw model reply.
        scrub_steps = (
            (r'(analysis|count|words|draft|let\'s|must be exactly).*?summary:', re.IGNORECASE | re.DOTALL),
            (r'now count words.*', re.IGNORECASE | re.DOTALL),
            (r'\d+ words.*', 0),
        )
        thin_rule = "-" * 60
        all_summaries = []

        for length in (10, 150, 200, 250, 300):
            print(f"\nGenerating {length}-word summary...")
            prompt = f"""Create a precise {length}-word summary of this lecture. Focus on key themes:
            storytelling, AI risks/benefits, alignment problem, and human values.
            Provide only the final concise summary without any additional commentary or word counting.

            Transcript: {self.transcript[:6000]}

            {length}-word summary:"""

            raw_summary = self.generate_response(prompt, max_tokens=500)

            # Strip analysis / word-counting text the model may have left in.
            clean_summary = raw_summary
            for pattern, flags in scrub_steps:
                clean_summary = re.sub(pattern, '', clean_summary, flags=flags)
            clean_summary = clean_summary.strip()

            print(f"✅ {length}-word summary:")
            print(thin_rule)
            print(textwrap.fill(clean_summary, width=70))
            print(thin_rule)

            # One file per length, plus an entry for the combined file.
            self.save_output(clean_summary, f"summary_{length}_words.txt")
            all_summaries.append(f"{length}-word summary:\n{clean_summary}\n\n")

            time.sleep(1)

        combined = "\n".join(all_summaries)
        self.save_output(combined, "all_summaries.txt")
    
    def create_visualizations(self):
        """Create various visualizations of the lecture content and save them as HTML"""
        print("\n" + "=" * 80)
        print("📊 DATA VISUALIZATIONS")
        print("=" * 80)
        
        # Word frequency analysis
        words = re.findall(r'\b[a-zA-Z]{3,}\b', self.transcript.lower())
        word_freq = Counter(words)
        common_words = word_freq.most_common(500)
        
        # Create Plotly bar chart and save as HTML
        words, counts = zip(*common_words)
        fig = px.bar(x=words, y=counts, title="Top 500 Words in Lecture")
        self.save_plotly_html(fig, "word_frequency.html")
        
        # Save word frequency data
        freq_data = "\n".join([f"{word}: {count}" for word, count in common_words])
        self.save_output(freq_data, "word_frequency_data.txt")
        
        # Create word cloud with matplotlib (since Plotly doesn't have word cloud)
        print("\nGenerating word cloud...")
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(self.transcript)
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud of Lecture Content')
        self.save_matplotlib_figure(plt, "word_cloud.png")
        plt.close()
        
        # Topic distribution visualization using Plotly
        print("\nGenerating topic analysis...")
        topics = ['AI', 'storytelling', 'ethics', 'risk', 'cooperation', 'trust', 'alignment']
        topic_counts = {topic: self.transcript.lower().count(topic) for topic in topics}
        
        fig = px.pie(values=list(topic_counts.values()), names=list(topic_counts.keys()), 
                    title='Topic Distribution in Lecture')
        self.save_plotly_html(fig, "topic_distribution.html")
        
        # Save topic data
        topic_data = "\n".join([f"{topic}: {count}" for topic, count in topic_counts.items()])
        self.save_output(topic_data, "topic_data.txt")
    
    def generate_debate(self):
        """Ask the model for a five-for / five-against debate, print and save it.

        The prompt includes the first 8000 transcript characters; the reply is
        written to ``ai_development_debate.txt``.
        """
        rule = "=" * 80
        print("\n" + rule)
        print("⚖️ DEBATE: AI DEVELOPMENT - PROS AND CONS")
        print(rule)

        debate_request = f"""Based on this lecture, create a structured debate with 5 strong arguments FOR 
        rapid AI development and 5 strong arguments AGAINST rapid AI development. 
        Format as two clear sections with compelling points.

        Lecture content: {self.transcript[:8000]}

        DEBATE STRUCTURE:

        ARGUMENTS FOR RAPID AI DEVELOPMENT:
        1. 
        2. 
        3. 
        4. 
        5. 

        ARGUMENTS AGAINST RAPID AI DEVELOPMENT:
        1. 
        2. 
        3. 
        4. 
        5. """

        debate_text = self.generate_response(debate_request, max_tokens=1024)
        print(debate_text)
        self.save_output(debate_text, "ai_development_debate.txt")
    
    def write_article(self):
        """Ask the model for a 500-word technology article, print and save it.

        The prompt includes the first 10000 transcript characters; the reply is
        written to ``professional_article.txt``.
        """
        rule = "=" * 80
        print("\n" + rule)
        print("📰 PROFESSIONAL ARTICLE")
        print(rule)

        article_request = f"""Write a comprehensive 500-word article suitable for a technology publication 
        about this Yuval Harari lecture. Include:
        - Key themes discussed
        - Importance of the alignment problem
        - Societal implications of AI storytelling
        - Expert perspectives from the lecture
        - Future outlook

        Transcript: {self.transcript[:10000]}

        ARTICLE:"""

        article_text = self.generate_response(article_request, max_tokens=1024)
        print(article_text)
        self.save_output(article_text, "professional_article.txt")
    
    def write_editorial(self):
        """Ask the model for a 400-word editorial, print and save it.

        The prompt includes the first 5000 transcript characters; the reply is
        written to ``editorial_opinion.txt``.
        """
        rule = "=" * 80
        print("\n" + rule)
        print("✍️ EDITORIAL OPINION")
        print(rule)

        editorial_request = f"""Write a compelling editorial (400 words) expressing a strong viewpoint about 
        the issues raised in this lecture. Take a clear stance on AI regulation and development,
        supporting your position with evidence from the lecture.

        Key lecture points: {self.transcript[:5000]}

        EDITORIAL:"""

        editorial_text = self.generate_response(editorial_request, max_tokens=1024)
        print(editorial_text)
        self.save_output(editorial_text, "editorial_opinion.txt")
    
    def generate_qna(self):
        """Ask the model for ten question/answer pairs, print and save them.

        The prompt includes the first 6000 transcript characters; the reply is
        written to ``qna_session.txt``.
        """
        rule = "=" * 80
        print("\n" + rule)
        print("❓ Q&A SESSION")
        print(rule)

        qna_request = f"""Create 10 insightful questions about this lecture with detailed answers. 
        Focus on the most important and controversial aspects.

        Lecture content: {self.transcript[:6000]}

        QUESTIONS AND ANSWERS:
        1. Q: 
           A: 
        2. Q: 
           A: 
        [Continue for 10 questions]"""

        qna_text = self.generate_response(qna_request, max_tokens=4096)
        print(qna_text)
        self.save_output(qna_text, "qna_session.txt")
    
    def is_whisper_format(self, data):
        """Check if the timestamp data is in OpenAI Whisper format"""
        return 'segments' in data and isinstance(data['segments'], list) and len(data['segments']) > 0 and 'start' in data['segments'][0]
    
    def convert_whisper_to_timeline(self, whisper_data):
        """Convert Whisper format to timeline format"""
        timeline = {"sections": []}
        
        for i, segment in enumerate(whisper_data.get('segments', [])):
            start_time = segment.get('start', 0)
            end_time = segment.get('end', 0)
            text = segment.get('text', '').strip()
            
            # Convert seconds to HH:MM:SS format
            start_minutes, start_seconds = divmod(start_time, 60)
            start_hours, start_minutes = divmod(start_minutes, 60)
            start_str = f"{int(start_hours):02d}:{int(start_minutes):02d}:{int(start_seconds):02d}"
            
            end_minutes, end_seconds = divmod(end_time, 60)
            end_hours, end_minutes = divmod(end_minutes, 60)
            end_str = f"{int(end_hours):02d}:{int(end_minutes):02d}:{int(end_seconds):02d}"
            
            # Create a short topic from the text
            topic = text[:50] + "..." if len(text) > 50 else text
            if not topic:
                topic = f"Segment {i+1}"
            
            timeline["sections"].append({
                "start_time": start_str,
                "end_time": end_str,
                "topic": topic,
                "text": text
            })
        
        return timeline
    
    def create_timeline_visualization_plotly(self, timestamps_data, title, filename):
        """Create timeline visualization using Plotly and save as HTML"""
        if not timestamps_data or 'sections' not in timestamps_data:
            return
        
        # Extract data for visualization
        segments = []
        durations = []
        labels = []
        
        for i, section in enumerate(timestamps_data.get('sections', [])):
            if 'start_time' in section and 'end_time' in section:
                # Parse time strings to calculate duration
                start_parts = section['start_time'].split(':')
                end_parts = section['end_time'].split(':')
                
                if len(start_parts) == 3 and len(end_parts) == 3:
                    start_sec = int(start_parts[0]) * 3600 + int(start_parts[1]) * 60 + int(start_parts[2])
                    end_sec = int(end_parts[0]) * 3600 + int(end_parts[1]) * 60 + int(end_parts[2])
                    duration = end_sec - start_sec
                    
                    if duration > 0:  # Only include segments with positive duration
                        segments.append(i)
                        durations.append(duration)
                        labels.append(f"Seg {i+1}")
        
        if durations:
            # Create pie chart for timeline 1
            if "TIMELINE 1" in title:
                fig = px.pie(values=durations, names=labels, title=f'{title} - Segment Durations')
                self.save_plotly_html(fig, filename.replace('.txt', '_durations_pie.html'))
            
            # Create bar chart for other timelines
            else:
                fig = px.bar(x=segments, y=durations, title=f'{title} - Segment Durations',
                            labels={'x': 'Segment Number', 'y': 'Duration (seconds)'})
                fig.update_layout(xaxis=dict(tickvals=segments, ticktext=labels))
                self.save_plotly_html(fig, filename.replace('.txt', '_durations.html'))
    
    def create_timeline(self, timestamps_data, title, filename):
        """Create a visual timeline from timestamps data and save it"""
        if not timestamps_data:
            print(f"No timestamp data available for {title}")
            return
        
        print(f"\n⏰ {title}")
        print("=" * 80)
        
        # Check if data is in Whisper format and convert if needed
        if self.is_whisper_format(timestamps_data):
            print("Detected Whisper format - converting to timeline format")
            timestamps_data = self.convert_whisper_to_timeline(timestamps_data)
        
        # Extract timeline data
        times = []
        topics = []
        full_texts = []
        
        for section in timestamps_data.get('sections', []):
            start_time = section.get('start_time', '00:00:00')
            topic = section.get('topic', 'Unknown')
            full_text = section.get('text', '')
            
            times.append(start_time)
            topics.append(topic)
            full_texts.append(full_text)
        
        # Create a text-based timeline
        timeline_text = f"{title}\n\n"
        for i, (time, topic, text) in enumerate(zip(times, topics, full_texts), 1):
            timeline_text += f"{i}. {time} - {topic}\n"
            if text:
                timeline_text += f"   Text: {text}\n"
            timeline_text += "\n"
        
        print(timeline_text)
        self.save_output(timeline_text, filename)
        
        # Create visualization using Plotly
        self.create_timeline_visualization_plotly(timestamps_data, title, filename)
    
    def create_timelines(self):
        """Create timelines for both timestamp files and save them"""
        print("\n" + "=" * 80)
        print("⏰ LECTURE TIMELINES")
        print("=" * 80)
        
        # Create timeline for first timestamp file
        self.create_timeline(self.timestamps, "LECTURE TIMELINE 1", "timeline_1.txt")
        
        # Create timeline for second timestamp file
        self.create_timeline(self.timestamps_2, "LECTURE TIMELINE 2", "timeline_2.txt")
    
    def generate_key_insights(self):
        """Ask the model for seven key insights and save them with a radar chart.

        The prompt includes the first 8000 transcript characters; the reply is
        written to ``key_insights.txt``. The radar chart saved as
        ``insights_radar_chart.html`` plots fixed example scores defined in
        this method; it is not derived from the model's reply.
        """
        rule = "=" * 80
        print("\n" + rule)
        print("💡 KEY INSIGHTS ANALYSIS")
        print(rule)

        insights_request = f"""Extract the 7 most profound insights from this lecture. For each insight:
        1. State the insight clearly
        2. Explain its significance
        3. Provide supporting evidence from the lecture
        4. Rate its importance (1-10)

        Lecture: {self.transcript[:8000]}

        KEY INSIGHTS:"""

        insights_text = self.generate_response(insights_request, max_tokens=1024)
        print(insights_text)
        self.save_output(insights_text, "key_insights.txt")

        # Radar chart of theme importance using Plotly.
        print("\nGenerating insights visualization...")
        # Example values, keyed by theme (insertion order = plotting order).
        theme_scores = {
            'Storytelling Power': 8,
            'AI Risks': 9,
            'Alignment Challenge': 9,
            'Ethical Frameworks': 7,
            'Human Cooperation': 8,
            'Trust Issues': 8,
            'Future Implications': 9,
        }

        radar_trace = go.Scatterpolar(
            r=list(theme_scores.values()),
            theta=list(theme_scores.keys()),
            fill='toself'
        )
        fig = go.Figure(data=radar_trace)

        radial_axis = dict(visible=True, range=[0, 10])
        fig.update_layout(
            polar=dict(radialaxis=radial_axis),
            title="Importance of Lecture Themes"
        )
        self.save_plotly_html(fig, "insights_radar_chart.html")
    
    def generate_recommendations(self):
        """Ask the model for policy, company, personal and global recommendations.

        The prompt includes the first 7000 transcript characters; the reply is
        printed and written to ``recommendations.txt``.
        """
        rule = "=" * 80
        print("\n" + rule)
        print("📋 POLICY AND PERSONAL RECOMMENDATIONS")
        print(rule)

        recommendations_request = f"""Based on this lecture, create:
        1. 5 policy recommendations for governments
        2. 5 recommendations for AI companies
        3. 5 personal actions individuals can take
        4. 3 global cooperation initiatives needed

        Lecture content: {self.transcript[:7000]}

        RECOMMENDATIONS:"""

        recommendations_text = self.generate_response(recommendations_request, max_tokens=4096)
        print(recommendations_text)
        self.save_output(recommendations_text, "recommendations.txt")
    
    def create_readme(self):
        """Create a README file with information about all generated content"""
        readme_content = f"""# GPT-OSS-120B Analysis Output

## Analysis of Yuval Noah Harari Lecture on AI and Humanity

### Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

### Contents:

1. **Original Transcript** - The complete lecture transcript
2. **Multi-Length Summaries** - Summaries of various lengths (10-300 words)
3. **Data Visualizations** - Word frequency, word cloud, and topic distribution
4. **AI Development Debate** - Pros and cons of rapid AI development
5. **Professional Article** - Technology publication-style article
6. **Editorial Opinion** - Strong viewpoint on AI regulation
7. **Q&A Session** - 10 insightful questions with detailed answers
8. **Lecture Timelines** - Text and visual timelines of the lecture structure
9. **Key Insights** - 7 profound insights with significance and ratings
10. **Recommendations** - Policy, personal, and global cooperation recommendations
11. **T-Shirt Designs** - Flux1-Krea-dev graphic t-shirt prompts

### Visualization Files:
- HTML files: Interactive Plotly visualizations
- PNG files: Static images (word cloud)

### Model Information:
- Model: GPT-OSS-120B (4-bit quantized)
- Parameters: 120 billion
- Hardware: Apple M3 Ultra with 512GB RAM

### Analysis Themes:
- Storytelling as human differentiator
- AI risks and benefits
- Alignment problem
- Ethical frameworks
- Human cooperation and trust
- Future implications of AI
"""
        
        self.save_output(readme_content, "README.md")
    
    def run_comprehensive_demo(
        self,
        transcript_path: str = "yuval_harari_lecture_transcript.txt",
        timestamps_path: str = "yuval_harari_lecture_timestamps.json",
        timestamps_2_path: str = "yuval_harari_lecture_timestamps_2.json",
    ):
        """Load the lecture data, run every demonstration and list the outputs.

        Args:
            transcript_path: Transcript text file to analyse.
            timestamps_path: First timestamp JSON file (a falsy value skips it).
            timestamps_2_path: Second timestamp JSON file (a falsy value skips it).

        The defaults are the file names this script was written for, so calling
        the method without arguments behaves as before.

        A failure while loading the data propagates to the caller. A failure
        inside one demonstration is logged, with the demonstration's name and
        the traceback, and the remaining demonstrations still run.
        """
        print("🚀 Starting Comprehensive GPT-OSS-120B Demonstration")
        print("💾 Model: 120B parameters, 4-bit quantized")
        print("📚 Analyzing: Yuval Noah Harari Lecture on AI and Humanity")
        print("=" * 80)

        # Load data
        self.load_data(transcript_path, timestamps_path, timestamps_2_path)

        # Run all demonstrations, in this order
        demonstrations = [
            self.generate_summaries,
            self.create_visualizations,
            self.generate_debate,
            self.write_article,
            self.write_editorial,
            self.generate_qna,
            self.create_timelines,
            self.generate_key_insights,
            self.generate_recommendations,
            self.generate_tshirt_prompts,
            self.create_readme,
        ]

        for demo in demonstrations:
            try:
                demo()
                # Pause between successful steps (kept from the original;
                # NOTE(review): purpose not evident here - confirm it is needed).
                time.sleep(2)
            except Exception as e:
                # Best effort: name the failing step and keep the traceback,
                # then carry on with the next demonstration.
                logger.exception(f"Error in demonstration {demo.__name__}: {e}")

        print(f"\n🎉 All outputs saved to: {self.output_dir}")
        print("📋 Contents:")
        # Sorted so the listing is stable from run to run.
        for file in sorted(os.listdir(self.output_dir)):
            print(f"   - {file}")

if __name__ == "__main__":
    # Script entry point: build the demo (loads the model) and run every step.
    GPTOSSDemo().run_comprehensive_demo()