"""
Style Analyzer Module

Analyze CEO's writing style to inform training and evaluation.
Extracts vocabulary patterns, sentence structure, rhetorical devices,
and tone markers.

Example usage:
    analyzer = StyleAnalyzer()
    profile = analyzer.analyze_posts(blog_posts)
    profile.save("data/processed/style_profile.json")
"""

import json
import re
import statistics
from collections import Counter
from dataclasses import dataclass, field
from pathlib import Path
from types import SimpleNamespace
from typing import Optional

from loguru import logger

try:
    import nltk
    from nltk.tokenize import sent_tokenize, word_tokenize
    from nltk.corpus import stopwords
    from nltk.util import ngrams
    NLTK_AVAILABLE = True
except ImportError:
    NLTK_AVAILABLE = False
    logger.warning("nltk not available, using basic tokenization")


@dataclass
class StyleProfile:
    """Represents the analyzed writing style profile.

    All fields have neutral defaults so an empty profile can be created
    and filled in incrementally by ``StyleAnalyzer``.
    """

    # Vocabulary analysis
    vocabulary_size: int = 0
    top_words: list = field(default_factory=list)
    top_bigrams: list = field(default_factory=list)
    top_trigrams: list = field(default_factory=list)
    unique_phrases: list = field(default_factory=list)
    jargon_terms: list = field(default_factory=list)

    # Sentence structure
    avg_sentence_length: float = 0.0
    sentence_length_std: float = 0.0
    avg_words_per_sentence: float = 0.0
    sentence_complexity_score: float = 0.0

    # Rhetorical patterns
    question_frequency: float = 0.0
    exclamation_frequency: float = 0.0
    rhetorical_devices: list = field(default_factory=list)

    # Topic analysis
    topic_categories: dict = field(default_factory=dict)
    key_themes: list = field(default_factory=list)

    # Tone markers
    formality_score: float = 0.0
    confidence_score: float = 0.0
    tone_indicators: dict = field(default_factory=dict)

    # Raw statistics
    total_words: int = 0
    total_sentences: int = 0
    total_posts: int = 0

    def to_dict(self) -> dict:
        """Convert to a nested dictionary for serialization.

        Returns:
            Dictionary grouped into ``vocabulary``, ``sentence_structure``,
            ``rhetorical_patterns``, ``topics``, ``tone`` and ``statistics``
            sections. ``load`` reads this exact layout back.
        """
        return {
            "vocabulary": {
                "size": self.vocabulary_size,
                "top_words": self.top_words,
                "top_bigrams": self.top_bigrams,
                "top_trigrams": self.top_trigrams,
                "unique_phrases": self.unique_phrases,
                "jargon_terms": self.jargon_terms,
            },
            "sentence_structure": {
                "avg_sentence_length": self.avg_sentence_length,
                "sentence_length_std": self.sentence_length_std,
                "avg_words_per_sentence": self.avg_words_per_sentence,
                "complexity_score": self.sentence_complexity_score,
            },
            "rhetorical_patterns": {
                "question_frequency": self.question_frequency,
                "exclamation_frequency": self.exclamation_frequency,
                "devices": self.rhetorical_devices,
            },
            "topics": {
                "categories": self.topic_categories,
                "key_themes": self.key_themes,
            },
            "tone": {
                "formality_score": self.formality_score,
                "confidence_score": self.confidence_score,
                "indicators": self.tone_indicators,
            },
            "statistics": {
                "total_words": self.total_words,
                "total_sentences": self.total_sentences,
                "total_posts": self.total_posts,
            },
        }

    def save(self, path: str | Path) -> None:
        """Save profile to a JSON file.

        Args:
            path: Destination file. Missing parent directories are created.
        """
        target = Path(path)
        # The documented usage writes into a nested output directory that
        # may not exist yet; create it instead of failing on open().
        target.parent.mkdir(parents=True, exist_ok=True)
        with target.open("w", encoding="utf-8") as f:
            json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)
        logger.info(f"Saved style profile to: {path}")

    @classmethod
    def load(cls, path: str | Path) -> "StyleProfile":
        """Load profile from a JSON file written by ``save``.

        Args:
            path: JSON file in the layout produced by ``to_dict``.

        Returns:
            A populated ``StyleProfile``.

        Raises:
            KeyError: If a section or field of the expected layout is missing.
        """
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)

        vocabulary = data["vocabulary"]
        structure = data["sentence_structure"]
        rhetoric = data["rhetorical_patterns"]
        topics = data["topics"]
        tone = data["tone"]
        stats = data["statistics"]

        profile = cls()
        profile.vocabulary_size = vocabulary["size"]
        profile.top_words = vocabulary["top_words"]
        profile.top_bigrams = vocabulary["top_bigrams"]
        profile.top_trigrams = vocabulary["top_trigrams"]
        profile.unique_phrases = vocabulary["unique_phrases"]
        profile.jargon_terms = vocabulary["jargon_terms"]

        profile.avg_sentence_length = structure["avg_sentence_length"]
        profile.sentence_length_std = structure["sentence_length_std"]
        profile.avg_words_per_sentence = structure["avg_words_per_sentence"]
        profile.sentence_complexity_score = structure["complexity_score"]

        profile.question_frequency = rhetoric["question_frequency"]
        profile.exclamation_frequency = rhetoric["exclamation_frequency"]
        profile.rhetorical_devices = rhetoric["devices"]

        profile.topic_categories = topics["categories"]
        profile.key_themes = topics["key_themes"]

        profile.formality_score = tone["formality_score"]
        profile.confidence_score = tone["confidence_score"]
        profile.tone_indicators = tone["indicators"]

        profile.total_words = stats["total_words"]
        profile.total_sentences = stats["total_sentences"]
        profile.total_posts = stats["total_posts"]

        return profile


class StyleAnalyzer:
    """
    Analyze writing style from blog posts.

    Extracts patterns useful for:
    - Training data generation
    - Evaluation metrics
    - System prompt design

    Example:
        >>> analyzer = StyleAnalyzer()
        >>> profile = analyzer.analyze_posts(blog_posts)
        >>> print(f"Vocabulary size: {profile.vocabulary_size}")
    """

    # Common English stopwords (fallback if NLTK unavailable)
    BASIC_STOPWORDS = {
        "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
        "of", "with", "by", "from", "as", "is", "was", "are", "were", "been",
        "be", "have", "has", "had", "do", "does", "did", "will", "would",
        "could", "should", "may", "might", "must", "shall", "can", "need",
        "this", "that", "these", "those", "i", "you", "he", "she", "it",
        "we", "they", "what", "which", "who", "when", "where", "why", "how",
        "all", "each", "every", "both", "few", "more", "most", "other",
        "some", "such", "no", "nor", "not", "only", "own", "same", "so",
        "than", "too", "very", "just", "also", "now", "here", "there",
    }

    # Formal language indicators
    FORMAL_INDICATORS = [
        "therefore", "however", "moreover", "furthermore", "consequently",
        "nevertheless", "accordingly", "thus", "hence", "whereas",
        "notwithstanding", "albeit", "hitherto", "whereby", "therein",
    ]

    # Informal language indicators
    INFORMAL_INDICATORS = [
        "gonna", "wanna", "gotta", "kinda", "sorta", "yeah", "yep", "nope",
        "okay", "ok", "cool", "awesome", "basically", "literally", "actually",
        "honestly", "seriously", "totally", "super",
    ]

    # Confidence markers
    CONFIDENT_MARKERS = [
        "certainly", "definitely", "absolutely", "clearly", "obviously",
        "undoubtedly", "surely", "indeed", "precisely", "exactly",
        "will", "must", "always", "never", "every",
    ]

    # Hedging markers
    HEDGING_MARKERS = [
        "maybe", "perhaps", "possibly", "probably", "might", "could",
        "seems", "appears", "suggests", "tends", "somewhat", "rather",
        "fairly", "quite", "relatively", "generally", "typically",
    ]

    # Keywords used for the simple keyword-based topic categorization
    TOPIC_KEYWORDS = {
        "technology": ["ai", "technology", "digital", "software", "data", "tech", "machine", "algorithm"],
        "business": ["business", "company", "market", "strategy", "growth", "revenue", "customer"],
        "leadership": ["leadership", "team", "culture", "management", "vision", "values"],
        "innovation": ["innovation", "future", "change", "disruption", "transform", "new"],
        "personal": ["i", "my", "journey", "experience", "learned", "believe"],
    }

    # Fallback sentence matcher: a run of non-terminator characters plus the
    # terminator(s) that close it, so "?" / "!" endings stay detectable.
    _SENTENCE_PATTERN = re.compile(r"[^.!?]+[.!?]*")

    # Fallback word matcher, also used for whole-word topic matching
    _WORD_PATTERN = re.compile(r"\b\w+\b")

    def __init__(self, language: str = "english"):
        """
        Initialize the style analyzer.

        Args:
            language: Language for tokenization and stopwords
        """
        self.language = language

        # Initialize NLTK if available
        if NLTK_AVAILABLE:
            try:
                nltk.data.find("tokenizers/punkt")
            except LookupError:
                logger.info("Downloading NLTK punkt tokenizer...")
                nltk.download("punkt", quiet=True)
                nltk.download("punkt_tab", quiet=True)

            try:
                nltk.data.find("corpora/stopwords")
            except LookupError:
                logger.info("Downloading NLTK stopwords...")
                nltk.download("stopwords", quiet=True)

            self.stopwords = set(stopwords.words(language))
        else:
            # Copy so that mutating one instance's stopwords cannot leak
            # into the shared class-level set.
            self.stopwords = set(self.BASIC_STOPWORDS)

    def analyze_posts(self, posts: list) -> StyleProfile:
        """
        Analyze multiple blog posts and create a style profile.

        Args:
            posts: List of BlogPost objects (each must expose ``title``
                and ``content`` string attributes)

        Returns:
            StyleProfile with analyzed patterns
        """
        logger.info(f"Analyzing style from {len(posts)} posts")

        profile = StyleProfile()
        profile.total_posts = len(posts)

        # Collect all text
        all_text = "\n\n".join(post.content for post in posts)
        all_sentences = self._tokenize_sentences(all_text)
        all_words = self._tokenize_words(all_text)

        profile.total_sentences = len(all_sentences)
        profile.total_words = len(all_words)

        # Vocabulary analysis
        self._analyze_vocabulary(all_words, profile)

        # N-gram analysis
        self._analyze_ngrams(all_words, profile)

        # Sentence structure analysis
        self._analyze_sentence_structure(all_sentences, profile)

        # Rhetorical patterns
        self._analyze_rhetorical_patterns(all_sentences, all_text, profile)

        # Tone analysis
        self._analyze_tone(all_words, profile)

        # Topic analysis
        self._analyze_topics(posts, profile)

        # Extract unique phrases
        self._extract_unique_phrases(all_text, profile)

        logger.info(f"Style analysis complete: {profile.vocabulary_size} unique words")
        return profile

    def _tokenize_sentences(self, text: str) -> list[str]:
        """Tokenize text into sentences.

        Uses NLTK when available. The regex fallback keeps each sentence's
        terminating punctuation, matching NLTK's behavior, so later
        ``endswith("?")`` / ``endswith("!")`` checks work on both paths.
        """
        if NLTK_AVAILABLE:
            return sent_tokenize(text, language=self.language)

        # Basic sentence splitting
        candidates = (s.strip() for s in self._SENTENCE_PATTERN.findall(text))
        return [s for s in candidates if s]

    def _tokenize_words(self, text: str) -> list[str]:
        """Tokenize text into lowercase words."""
        if NLTK_AVAILABLE:
            return word_tokenize(text.lower(), language=self.language)

        # Basic word splitting
        return self._WORD_PATTERN.findall(text.lower())

    def _analyze_vocabulary(self, words: list[str], profile: StyleProfile) -> None:
        """Analyze vocabulary patterns.

        Sets ``vocabulary_size`` and ``top_words`` on ``profile``.
        """
        # Filter out stopwords and short words
        content_words = [
            w for w in words
            if w not in self.stopwords and len(w) > 2 and w.isalpha()
        ]

        word_counts = Counter(content_words)
        profile.vocabulary_size = len(word_counts)

        # Top 100 most common words
        profile.top_words = [
            {"word": word, "count": count}
            for word, count in word_counts.most_common(100)
        ]

    def _analyze_ngrams(self, words: list[str], profile: StyleProfile) -> None:
        """Analyze bigram and trigram patterns.

        Sets ``top_bigrams`` (top 50) and ``top_trigrams`` (top 30) on
        ``profile``. N-grams are built with ``zip`` so the result does not
        depend on NLTK being installed.
        """
        # Filter words for n-gram analysis
        filtered_words = [w for w in words if w.isalpha()]

        # Bigrams
        bigram_counts = Counter(zip(filtered_words, filtered_words[1:]))

        # Drop bigrams made up entirely of stopwords; a bigram is kept as
        # soon as at least one of its words carries content.
        meaningful_bigrams = Counter({
            bg: count
            for bg, count in bigram_counts.items()
            if bg[0] not in self.stopwords or bg[1] not in self.stopwords
        })

        profile.top_bigrams = [
            {"bigram": " ".join(bg), "count": count}
            for bg, count in meaningful_bigrams.most_common(50)
        ]

        # Trigrams
        trigram_counts = Counter(
            zip(filtered_words, filtered_words[1:], filtered_words[2:])
        )
        profile.top_trigrams = [
            {"trigram": " ".join(tg), "count": count}
            for tg, count in trigram_counts.most_common(30)
        ]

    def _analyze_sentence_structure(
        self,
        sentences: list[str],
        profile: StyleProfile
    ) -> None:
        """Analyze sentence length and complexity patterns.

        Leaves ``profile`` untouched when ``sentences`` is empty.
        """
        if not sentences:
            return

        sentence_lengths = [len(sent) for sent in sentences]
        word_counts = [len(sent.split()) for sent in sentences]

        # Calculate statistics
        profile.avg_sentence_length = statistics.mean(sentence_lengths)
        # stdev() needs at least two data points
        profile.sentence_length_std = (
            statistics.stdev(sentence_lengths) if len(sentence_lengths) > 1 else 0.0
        )
        profile.avg_words_per_sentence = statistics.mean(word_counts)

        # Complexity score based on variation (coefficient of variation of
        # the character lengths)
        if profile.avg_sentence_length > 0:
            profile.sentence_complexity_score = (
                profile.sentence_length_std / profile.avg_sentence_length
            )

    def _analyze_rhetorical_patterns(
        self,
        sentences: list[str],
        full_text: str,
        profile: StyleProfile
    ) -> None:
        """Analyze rhetorical devices and patterns.

        Sets ``question_frequency``, ``exclamation_frequency`` and
        ``rhetorical_devices`` on ``profile``. Leaves ``profile`` untouched
        when ``sentences`` is empty.
        """
        if not sentences:
            return

        # Question frequency
        questions = [s for s in sentences if s.strip().endswith("?")]
        profile.question_frequency = len(questions) / len(sentences)

        # Exclamation frequency
        exclamations = [s for s in sentences if s.strip().endswith("!")]
        profile.exclamation_frequency = len(exclamations) / len(sentences)

        # Detect rhetorical devices
        devices = []

        # Anaphora (repetition at start); whitespace-only sentences have no
        # first word and are skipped.
        sentence_starts = [
            parts[0].lower()
            for parts in (s.split() for s in sentences)
            if parts
        ]
        start_counts = Counter(sentence_starts)
        repeated_starts = [
            word for word, count in start_counts.items()
            if count >= 3 and word not in self.stopwords
        ]
        if repeated_starts:
            devices.append({
                "device": "anaphora",
                "examples": repeated_starts[:5],
            })

        # Lists (bullet points, numbered lists)
        list_pattern = re.compile(r"^[\s]*[-*•]\s+|^[\s]*\d+[.)\]]\s+", re.MULTILINE)
        if list_pattern.search(full_text):
            devices.append({
                "device": "enumeration",
                "description": "Uses bullet points or numbered lists",
            })

        # Rhetorical questions
        rhetorical_indicators = [
            "isn't it", "don't you think", "wouldn't you say",
            "right?", "correct?", "yes?", "no?",
        ]
        rhetorical_count = sum(
            1 for q in questions
            if any(ind in q.lower() for ind in rhetorical_indicators)
        )
        if rhetorical_count > 0:
            devices.append({
                "device": "rhetorical_questions",
                "count": rhetorical_count,
            })

        profile.rhetorical_devices = devices

    def _analyze_tone(self, words: list[str], profile: StyleProfile) -> None:
        """Analyze tone indicators (formality, confidence).

        Scores are ratios in [0, 1]; 0.5 is used as the neutral value when
        no marker of either kind occurs. Leaves ``profile`` untouched when
        ``words`` is empty.
        """
        if not words:
            return

        word_count = len(words)
        # Count every token once, then look the marker words up
        occurrences = Counter(words)

        # Formality score
        formal_count = sum(occurrences[w] for w in self.FORMAL_INDICATORS)
        informal_count = sum(occurrences[w] for w in self.INFORMAL_INDICATORS)

        if formal_count + informal_count > 0:
            profile.formality_score = formal_count / (formal_count + informal_count)
        else:
            profile.formality_score = 0.5  # Neutral

        # Confidence score
        confident_count = sum(occurrences[w] for w in self.CONFIDENT_MARKERS)
        hedging_count = sum(occurrences[w] for w in self.HEDGING_MARKERS)

        if confident_count + hedging_count > 0:
            profile.confidence_score = confident_count / (confident_count + hedging_count)
        else:
            profile.confidence_score = 0.5  # Neutral

        # Detailed tone indicators
        profile.tone_indicators = {
            "formal_words_per_1000": (formal_count / word_count) * 1000,
            "informal_words_per_1000": (informal_count / word_count) * 1000,
            "confident_words_per_1000": (confident_count / word_count) * 1000,
            "hedging_words_per_1000": (hedging_count / word_count) * 1000,
        }

    def _analyze_topics(self, posts: list, profile: StyleProfile) -> None:
        """Analyze topic categories from post titles and content.

        A post counts towards a category when its title or content contains
        at least one of the category's keywords as a whole word. Sets
        ``topic_categories`` (share of posts per category) and
        ``key_themes`` (categories ordered by number of matching posts).
        """
        categories = self.TOPIC_KEYWORDS

        if not posts:
            # Nothing to categorize; avoid dividing by zero below.
            profile.topic_categories = {cat: 0.0 for cat in categories}
            profile.key_themes = []
            return

        category_counts = {cat: 0 for cat in categories}

        for post in posts:
            text = (post.title + " " + post.content).lower()
            # Whole-word matching: substring tests would let "i" or "ai"
            # match almost any text (e.g. "said", "in").
            tokens = set(self._WORD_PATTERN.findall(text))
            for category, keywords in categories.items():
                if not tokens.isdisjoint(keywords):
                    category_counts[category] += 1

        total = len(posts)
        profile.topic_categories = {
            cat: count / total for cat, count in category_counts.items()
        }

        # Key themes (categories ranked by number of matching posts)
        profile.key_themes = [
            cat for cat, _ in sorted(
                category_counts.items(), key=lambda x: x[1], reverse=True
            )[:5]
        ]

    def _extract_unique_phrases(self, text: str, profile: StyleProfile) -> None:
        """Extract potentially unique or signature phrases.

        Sets ``unique_phrases`` (repeated quoted and capitalized phrases)
        and ``jargon_terms`` (long, repeated, non-stopword tokens) on
        ``profile``.
        """
        # Look for quoted phrases
        quoted = re.findall(r'"([^"]+)"', text)
        quoted_counts = Counter(quoted)

        # Look for repeated phrases (potential catchphrases)
        # Simple approach: find capitalized phrases
        capitalized = re.findall(r"[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+", text)
        cap_counts = Counter(capitalized)

        unique_phrases = []

        # Add repeated quotes
        for phrase, count in quoted_counts.most_common(10):
            if count >= 2 and 3 <= len(phrase.split()) <= 8:
                unique_phrases.append({
                    "phrase": phrase,
                    "count": count,
                    "type": "quoted",
                })

        # Add repeated capitalized phrases
        for phrase, count in cap_counts.most_common(10):
            if count >= 2:
                unique_phrases.append({
                    "phrase": phrase,
                    "count": count,
                    "type": "capitalized",
                })

        profile.unique_phrases = unique_phrases

        # Technical jargon detection (words not in common vocabulary)
        words = self._tokenize_words(text)
        word_counts = Counter(words)

        # Simple heuristic: long words used multiple times that aren't common
        potential_jargon = [
            word for word, count in word_counts.items()
            if len(word) > 7 and count >= 3 and word not in self.stopwords
        ]
        profile.jargon_terms = potential_jargon[:20]


def main():
    """CLI entry point for testing the analyzer."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Analyze writing style from text files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    python style_analyzer.py input.txt --output style_profile.json
    python style_analyzer.py input.txt --verbose
        """,
    )

    parser.add_argument("input", help="Input text file")
    parser.add_argument("--output", "-o", help="Output JSON file for style profile")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")

    args = parser.parse_args()

    analyzer = StyleAnalyzer()

    # Read input file
    with open(args.input, "r", encoding="utf-8") as f:
        text = f.read()

    # Create a mock post for analysis
    try:
        from .blog_parser import BlogPost
    except ImportError:
        # Run as a plain script (as the examples above show) there is no
        # parent package, so the relative import fails. The analyzer only
        # reads ``title`` and ``content``, so a simple stand-in suffices.
        mock_post = SimpleNamespace(title="Combined Content", content=text)
    else:
        mock_post = BlogPost(
            title="Combined Content",
            content=text,
            raw_content=text,
            word_count=len(text.split()),
            char_count=len(text),
            index=0,
        )

    profile = analyzer.analyze_posts([mock_post])

    # Print summary
    print("\n=== Style Analysis Summary ===")
    print(f"Total words: {profile.total_words:,}")
    print(f"Total sentences: {profile.total_sentences:,}")
    print(f"Vocabulary size: {profile.vocabulary_size:,}")
    print(f"\nAvg sentence length: {profile.avg_sentence_length:.1f} chars")
    print(f"Avg words/sentence: {profile.avg_words_per_sentence:.1f}")
    print(f"\nFormality score: {profile.formality_score:.2f} (0=informal, 1=formal)")
    print(f"Confidence score: {profile.confidence_score:.2f} (0=hedging, 1=confident)")
    print(f"\nQuestion frequency: {profile.question_frequency:.1%}")
    print(f"Exclamation frequency: {profile.exclamation_frequency:.1%}")

    if args.verbose:
        print("\n=== Top Words ===")
        for item in profile.top_words[:20]:
            print(f"  {item['word']}: {item['count']}")

    if args.output:
        profile.save(args.output)
        print(f"\nSaved profile to: {args.output}")


if __name__ == "__main__":
    main()