Spaces:

AI-Talent-Force
/

ai_exec

Paused

File size: 22,268 Bytes

45ee481

"""
Style Analyzer Module

Analyze CEO's writing style to inform training and evaluation.
Extracts vocabulary patterns, sentence structure, rhetorical devices, and tone markers.

Example usage:
    analyzer = StyleAnalyzer()
    profile = analyzer.analyze_posts(blog_posts)
    profile.save("data/processed/style_profile.json")
"""

import json
import re
from collections import Counter
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

from loguru import logger

try:
    import nltk
    from nltk.tokenize import sent_tokenize, word_tokenize
    from nltk.corpus import stopwords
    from nltk.util import ngrams

    NLTK_AVAILABLE = True
except ImportError:
    NLTK_AVAILABLE = False
    logger.warning("nltk not available, using basic tokenization")


@dataclass
class StyleProfile:
    """Represents the analyzed writing style profile."""

    # Vocabulary analysis
    vocabulary_size: int = 0
    top_words: list = field(default_factory=list)
    top_bigrams: list = field(default_factory=list)
    top_trigrams: list = field(default_factory=list)
    unique_phrases: list = field(default_factory=list)
    jargon_terms: list = field(default_factory=list)

    # Sentence structure
    avg_sentence_length: float = 0.0
    sentence_length_std: float = 0.0
    avg_words_per_sentence: float = 0.0
    sentence_complexity_score: float = 0.0

    # Rhetorical patterns
    question_frequency: float = 0.0
    exclamation_frequency: float = 0.0
    rhetorical_devices: list = field(default_factory=list)

    # Topic analysis
    topic_categories: dict = field(default_factory=dict)
    key_themes: list = field(default_factory=list)

    # Tone markers
    formality_score: float = 0.0
    confidence_score: float = 0.0
    tone_indicators: dict = field(default_factory=dict)

    # Raw statistics
    total_words: int = 0
    total_sentences: int = 0
    total_posts: int = 0

    def to_dict(self) -> dict:
        """Convert to dictionary for serialization."""
        return {
            "vocabulary": {
                "size": self.vocabulary_size,
                "top_words": self.top_words,
                "top_bigrams": self.top_bigrams,
                "top_trigrams": self.top_trigrams,
                "unique_phrases": self.unique_phrases,
                "jargon_terms": self.jargon_terms,
            },
            "sentence_structure": {
                "avg_sentence_length": self.avg_sentence_length,
                "sentence_length_std": self.sentence_length_std,
                "avg_words_per_sentence": self.avg_words_per_sentence,
                "complexity_score": self.sentence_complexity_score,
            },
            "rhetorical_patterns": {
                "question_frequency": self.question_frequency,
                "exclamation_frequency": self.exclamation_frequency,
                "devices": self.rhetorical_devices,
            },
            "topics": {
                "categories": self.topic_categories,
                "key_themes": self.key_themes,
            },
            "tone": {
                "formality_score": self.formality_score,
                "confidence_score": self.confidence_score,
                "indicators": self.tone_indicators,
            },
            "statistics": {
                "total_words": self.total_words,
                "total_sentences": self.total_sentences,
                "total_posts": self.total_posts,
            },
        }

    def save(self, path: str | Path) -> None:
        """Save profile to JSON file."""
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)
        logger.info(f"Saved style profile to: {path}")

    @classmethod
    def load(cls, path: str | Path) -> "StyleProfile":
        """Load profile from JSON file."""
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)

        profile = cls()
        profile.vocabulary_size = data["vocabulary"]["size"]
        profile.top_words = data["vocabulary"]["top_words"]
        profile.top_bigrams = data["vocabulary"]["top_bigrams"]
        profile.top_trigrams = data["vocabulary"]["top_trigrams"]
        profile.unique_phrases = data["vocabulary"]["unique_phrases"]
        profile.jargon_terms = data["vocabulary"]["jargon_terms"]

        profile.avg_sentence_length = data["sentence_structure"]["avg_sentence_length"]
        profile.sentence_length_std = data["sentence_structure"]["sentence_length_std"]
        profile.avg_words_per_sentence = data["sentence_structure"]["avg_words_per_sentence"]
        profile.sentence_complexity_score = data["sentence_structure"]["complexity_score"]

        profile.question_frequency = data["rhetorical_patterns"]["question_frequency"]
        profile.exclamation_frequency = data["rhetorical_patterns"]["exclamation_frequency"]
        profile.rhetorical_devices = data["rhetorical_patterns"]["devices"]

        profile.topic_categories = data["topics"]["categories"]
        profile.key_themes = data["topics"]["key_themes"]

        profile.formality_score = data["tone"]["formality_score"]
        profile.confidence_score = data["tone"]["confidence_score"]
        profile.tone_indicators = data["tone"]["indicators"]

        profile.total_words = data["statistics"]["total_words"]
        profile.total_sentences = data["statistics"]["total_sentences"]
        profile.total_posts = data["statistics"]["total_posts"]

        return profile


class StyleAnalyzer:
    """
    Analyze writing style from blog posts.

    Extracts patterns useful for:
    - Training data generation
    - Evaluation metrics
    - System prompt design

    Example:
        >>> analyzer = StyleAnalyzer()
        >>> profile = analyzer.analyze_posts(blog_posts)
        >>> print(f"Vocabulary size: {profile.vocabulary_size}")
    """

    # Common English stopwords (fallback if NLTK unavailable)
    BASIC_STOPWORDS = {
        "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
        "of", "with", "by", "from", "as", "is", "was", "are", "were", "been",
        "be", "have", "has", "had", "do", "does", "did", "will", "would",
        "could", "should", "may", "might", "must", "shall", "can", "need",
        "this", "that", "these", "those", "i", "you", "he", "she", "it",
        "we", "they", "what", "which", "who", "when", "where", "why", "how",
        "all", "each", "every", "both", "few", "more", "most", "other",
        "some", "such", "no", "nor", "not", "only", "own", "same", "so",
        "than", "too", "very", "just", "also", "now", "here", "there",
    }

    # Formal language indicators
    FORMAL_INDICATORS = [
        "therefore", "however", "moreover", "furthermore", "consequently",
        "nevertheless", "accordingly", "thus", "hence", "whereas",
        "notwithstanding", "albeit", "hitherto", "whereby", "therein",
    ]

    # Informal language indicators
    INFORMAL_INDICATORS = [
        "gonna", "wanna", "gotta", "kinda", "sorta", "yeah", "yep",
        "nope", "okay", "ok", "cool", "awesome", "basically", "literally",
        "actually", "honestly", "seriously", "totally", "super",
    ]

    # Confidence markers
    CONFIDENT_MARKERS = [
        "certainly", "definitely", "absolutely", "clearly", "obviously",
        "undoubtedly", "surely", "indeed", "precisely", "exactly",
        "will", "must", "always", "never", "every",
    ]

    # Hedging markers
    HEDGING_MARKERS = [
        "maybe", "perhaps", "possibly", "probably", "might", "could",
        "seems", "appears", "suggests", "tends", "somewhat", "rather",
        "fairly", "quite", "relatively", "generally", "typically",
    ]

    def __init__(self, language: str = "english"):
        """
        Initialize the style analyzer.

        Args:
            language: Language for tokenization and stopwords
        """
        self.language = language

        # Initialize NLTK if available
        if NLTK_AVAILABLE:
            try:
                nltk.data.find("tokenizers/punkt")
            except LookupError:
                logger.info("Downloading NLTK punkt tokenizer...")
                nltk.download("punkt", quiet=True)
                nltk.download("punkt_tab", quiet=True)

            try:
                nltk.data.find("corpora/stopwords")
            except LookupError:
                logger.info("Downloading NLTK stopwords...")
                nltk.download("stopwords", quiet=True)

            self.stopwords = set(stopwords.words(language))
        else:
            self.stopwords = self.BASIC_STOPWORDS

    def analyze_posts(self, posts: list) -> StyleProfile:
        """
        Analyze multiple blog posts and create a style profile.

        Args:
            posts: List of BlogPost objects

        Returns:
            StyleProfile with analyzed patterns
        """
        logger.info(f"Analyzing style from {len(posts)} posts")

        profile = StyleProfile()
        profile.total_posts = len(posts)

        # Collect all text
        all_text = "\n\n".join(post.content for post in posts)
        all_sentences = self._tokenize_sentences(all_text)
        all_words = self._tokenize_words(all_text)

        profile.total_sentences = len(all_sentences)
        profile.total_words = len(all_words)

        # Vocabulary analysis
        self._analyze_vocabulary(all_words, profile)

        # N-gram analysis
        self._analyze_ngrams(all_words, profile)

        # Sentence structure analysis
        self._analyze_sentence_structure(all_sentences, profile)

        # Rhetorical patterns
        self._analyze_rhetorical_patterns(all_sentences, all_text, profile)

        # Tone analysis
        self._analyze_tone(all_words, profile)

        # Topic analysis
        self._analyze_topics(posts, profile)

        # Extract unique phrases
        self._extract_unique_phrases(all_text, profile)

        logger.info(f"Style analysis complete: {profile.vocabulary_size} unique words")
        return profile

    def _tokenize_sentences(self, text: str) -> list[str]:
        """Tokenize text into sentences."""
        if NLTK_AVAILABLE:
            return sent_tokenize(text, language=self.language)
        else:
            # Basic sentence splitting
            sentences = re.split(r"[.!?]+", text)
            return [s.strip() for s in sentences if s.strip()]

    def _tokenize_words(self, text: str) -> list[str]:
        """Tokenize text into words."""
        if NLTK_AVAILABLE:
            return word_tokenize(text.lower(), language=self.language)
        else:
            # Basic word splitting
            words = re.findall(r"\b\w+\b", text.lower())
            return words

    def _analyze_vocabulary(self, words: list[str], profile: StyleProfile) -> None:
        """Analyze vocabulary patterns."""
        # Filter out stopwords and short words
        content_words = [
            w for w in words
            if w not in self.stopwords and len(w) > 2 and w.isalpha()
        ]

        word_counts = Counter(content_words)
        profile.vocabulary_size = len(word_counts)

        # Top 100 most common words
        profile.top_words = [
            {"word": word, "count": count}
            for word, count in word_counts.most_common(100)
        ]

    def _analyze_ngrams(self, words: list[str], profile: StyleProfile) -> None:
        """Analyze bigram and trigram patterns."""
        # Filter words for n-gram analysis
        filtered_words = [w for w in words if w.isalpha()]

        if NLTK_AVAILABLE:
            # Bigrams
            bigram_list = list(ngrams(filtered_words, 2))
            bigram_counts = Counter(bigram_list)

            # Filter out bigrams with stopwords
            meaningful_bigrams = {
                bg: count for bg, count in bigram_counts.items()
                if bg[0] not in self.stopwords or bg[1] not in self.stopwords
            }

            profile.top_bigrams = [
                {"bigram": " ".join(bg), "count": count}
                for bg, count in Counter(meaningful_bigrams).most_common(50)
            ]

            # Trigrams
            trigram_list = list(ngrams(filtered_words, 3))
            trigram_counts = Counter(trigram_list)

            profile.top_trigrams = [
                {"trigram": " ".join(tg), "count": count}
                for tg, count in trigram_counts.most_common(30)
            ]
        else:
            # Basic n-gram extraction without NLTK
            profile.top_bigrams = []
            profile.top_trigrams = []

    def _analyze_sentence_structure(
        self, sentences: list[str], profile: StyleProfile
    ) -> None:
        """Analyze sentence length and complexity patterns."""
        if not sentences:
            return

        sentence_lengths = []
        word_counts = []

        for sent in sentences:
            char_len = len(sent)
            words = sent.split()
            word_count = len(words)

            sentence_lengths.append(char_len)
            word_counts.append(word_count)

        # Calculate statistics
        import statistics

        profile.avg_sentence_length = statistics.mean(sentence_lengths)
        profile.sentence_length_std = (
            statistics.stdev(sentence_lengths) if len(sentence_lengths) > 1 else 0
        )
        profile.avg_words_per_sentence = statistics.mean(word_counts)

        # Complexity score based on variation
        if profile.avg_sentence_length > 0:
            profile.sentence_complexity_score = (
                profile.sentence_length_std / profile.avg_sentence_length
            )

    def _analyze_rhetorical_patterns(
        self, sentences: list[str], full_text: str, profile: StyleProfile
    ) -> None:
        """Analyze rhetorical devices and patterns."""
        if not sentences:
            return

        # Question frequency
        questions = [s for s in sentences if s.strip().endswith("?")]
        profile.question_frequency = len(questions) / len(sentences)

        # Exclamation frequency
        exclamations = [s for s in sentences if s.strip().endswith("!")]
        profile.exclamation_frequency = len(exclamations) / len(sentences)

        # Detect rhetorical devices
        devices = []

        # Anaphora (repetition at start)
        sentence_starts = [s.split()[0].lower() if s.split() else "" for s in sentences]
        start_counts = Counter(sentence_starts)
        repeated_starts = [
            word for word, count in start_counts.items()
            if count >= 3 and word not in self.stopwords
        ]
        if repeated_starts:
            devices.append({
                "device": "anaphora",
                "examples": repeated_starts[:5],
            })

        # Lists (bullet points, numbered lists)
        list_pattern = re.compile(r"^[\s]*[-*•]\s+|^[\s]*\d+[.)\]]\s+", re.MULTILINE)
        if list_pattern.search(full_text):
            devices.append({
                "device": "enumeration",
                "description": "Uses bullet points or numbered lists",
            })

        # Rhetorical questions
        rhetorical_indicators = [
            "isn't it", "don't you think", "wouldn't you say", "right?",
            "correct?", "yes?", "no?",
        ]
        rhetorical_count = sum(
            1 for q in questions
            if any(ind in q.lower() for ind in rhetorical_indicators)
        )
        if rhetorical_count > 0:
            devices.append({
                "device": "rhetorical_questions",
                "count": rhetorical_count,
            })

        profile.rhetorical_devices = devices

    def _analyze_tone(self, words: list[str], profile: StyleProfile) -> None:
        """Analyze tone indicators (formality, confidence)."""
        if not words:
            return

        word_set = set(words)
        word_count = len(words)

        # Formality score
        formal_count = sum(1 for w in words if w in self.FORMAL_INDICATORS)
        informal_count = sum(1 for w in words if w in self.INFORMAL_INDICATORS)

        if formal_count + informal_count > 0:
            profile.formality_score = formal_count / (formal_count + informal_count)
        else:
            profile.formality_score = 0.5  # Neutral

        # Confidence score
        confident_count = sum(1 for w in words if w in self.CONFIDENT_MARKERS)
        hedging_count = sum(1 for w in words if w in self.HEDGING_MARKERS)

        if confident_count + hedging_count > 0:
            profile.confidence_score = confident_count / (confident_count + hedging_count)
        else:
            profile.confidence_score = 0.5  # Neutral

        # Detailed tone indicators
        profile.tone_indicators = {
            "formal_words_per_1000": (formal_count / word_count) * 1000,
            "informal_words_per_1000": (informal_count / word_count) * 1000,
            "confident_words_per_1000": (confident_count / word_count) * 1000,
            "hedging_words_per_1000": (hedging_count / word_count) * 1000,
        }

    def _analyze_topics(self, posts: list, profile: StyleProfile) -> None:
        """Analyze topic categories from post titles and content."""
        # Simple keyword-based categorization
        categories = {
            "technology": ["ai", "technology", "digital", "software", "data", "tech", "machine", "algorithm"],
            "business": ["business", "company", "market", "strategy", "growth", "revenue", "customer"],
            "leadership": ["leadership", "team", "culture", "management", "vision", "values"],
            "innovation": ["innovation", "future", "change", "disruption", "transform", "new"],
            "personal": ["i", "my", "journey", "experience", "learned", "believe"],
        }

        category_counts = {cat: 0 for cat in categories}

        for post in posts:
            text = (post.title + " " + post.content).lower()
            for category, keywords in categories.items():
                if any(kw in text for kw in keywords):
                    category_counts[category] += 1

        total = len(posts)
        profile.topic_categories = {
            cat: count / total for cat, count in category_counts.items()
        }

        # Key themes (most common nouns/topics)
        profile.key_themes = [
            cat for cat, _ in sorted(
                category_counts.items(), key=lambda x: x[1], reverse=True
            )[:5]
        ]

    def _extract_unique_phrases(self, text: str, profile: StyleProfile) -> None:
        """Extract potentially unique or signature phrases."""
        # Look for quoted phrases
        quoted = re.findall(r'"([^"]+)"', text)
        quoted_counts = Counter(quoted)

        # Look for repeated phrases (potential catchphrases)
        # Simple approach: find capitalized phrases
        capitalized = re.findall(r"[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+", text)
        cap_counts = Counter(capitalized)

        unique_phrases = []

        # Add repeated quotes
        for phrase, count in quoted_counts.most_common(10):
            if count >= 2 and 3 <= len(phrase.split()) <= 8:
                unique_phrases.append({
                    "phrase": phrase,
                    "count": count,
                    "type": "quoted",
                })

        # Add repeated capitalized phrases
        for phrase, count in cap_counts.most_common(10):
            if count >= 2:
                unique_phrases.append({
                    "phrase": phrase,
                    "count": count,
                    "type": "capitalized",
                })

        profile.unique_phrases = unique_phrases

        # Technical jargon detection (words not in common vocabulary)
        words = self._tokenize_words(text)
        word_counts = Counter(words)

        # Simple heuristic: long words used multiple times that aren't common
        potential_jargon = [
            word for word, count in word_counts.items()
            if len(word) > 7 and count >= 3 and word not in self.stopwords
        ]

        profile.jargon_terms = potential_jargon[:20]


def main():
    """CLI entry point for testing the analyzer."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Analyze writing style from text files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    python style_analyzer.py input.txt --output style_profile.json
    python style_analyzer.py input.txt --verbose
        """,
    )
    parser.add_argument("input", help="Input text file")
    parser.add_argument("--output", "-o", help="Output JSON file for style profile")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")

    args = parser.parse_args()

    analyzer = StyleAnalyzer()

    # Read input file
    with open(args.input, "r", encoding="utf-8") as f:
        text = f.read()

    # Create a mock post for analysis
    from .blog_parser import BlogPost

    mock_post = BlogPost(
        title="Combined Content",
        content=text,
        raw_content=text,
        word_count=len(text.split()),
        char_count=len(text),
        index=0,
    )

    profile = analyzer.analyze_posts([mock_post])

    # Print summary
    print("\n=== Style Analysis Summary ===")
    print(f"Total words: {profile.total_words:,}")
    print(f"Total sentences: {profile.total_sentences:,}")
    print(f"Vocabulary size: {profile.vocabulary_size:,}")
    print(f"\nAvg sentence length: {profile.avg_sentence_length:.1f} chars")
    print(f"Avg words/sentence: {profile.avg_words_per_sentence:.1f}")
    print(f"\nFormality score: {profile.formality_score:.2f} (0=informal, 1=formal)")
    print(f"Confidence score: {profile.confidence_score:.2f} (0=hedging, 1=confident)")
    print(f"\nQuestion frequency: {profile.question_frequency:.1%}")
    print(f"Exclamation frequency: {profile.exclamation_frequency:.1%}")

    if args.verbose:
        print("\n=== Top Words ===")
        for item in profile.top_words[:20]:
            print(f"  {item['word']}: {item['count']}")

    if args.output:
        profile.save(args.output)
        print(f"\nSaved profile to: {args.output}")


if __name__ == "__main__":
    main()