Spaces:
Paused
Paused
| """ | |
| Style Analyzer Module | |
| Analyze CEO's writing style to inform training and evaluation. | |
| Extracts vocabulary patterns, sentence structure, rhetorical devices, and tone markers. | |
| Example usage: | |
| analyzer = StyleAnalyzer() | |
| profile = analyzer.analyze_posts(blog_posts) | |
| profile.save("data/processed/style_profile.json") | |
| """ | |
import json
import re
import statistics
from collections import Counter
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

from loguru import logger
| try: | |
| import nltk | |
| from nltk.tokenize import sent_tokenize, word_tokenize | |
| from nltk.corpus import stopwords | |
| from nltk.util import ngrams | |
| NLTK_AVAILABLE = True | |
| except ImportError: | |
| NLTK_AVAILABLE = False | |
| logger.warning("nltk not available, using basic tokenization") | |
| class StyleProfile: | |
| """Represents the analyzed writing style profile.""" | |
| # Vocabulary analysis | |
| vocabulary_size: int = 0 | |
| top_words: list = field(default_factory=list) | |
| top_bigrams: list = field(default_factory=list) | |
| top_trigrams: list = field(default_factory=list) | |
| unique_phrases: list = field(default_factory=list) | |
| jargon_terms: list = field(default_factory=list) | |
| # Sentence structure | |
| avg_sentence_length: float = 0.0 | |
| sentence_length_std: float = 0.0 | |
| avg_words_per_sentence: float = 0.0 | |
| sentence_complexity_score: float = 0.0 | |
| # Rhetorical patterns | |
| question_frequency: float = 0.0 | |
| exclamation_frequency: float = 0.0 | |
| rhetorical_devices: list = field(default_factory=list) | |
| # Topic analysis | |
| topic_categories: dict = field(default_factory=dict) | |
| key_themes: list = field(default_factory=list) | |
| # Tone markers | |
| formality_score: float = 0.0 | |
| confidence_score: float = 0.0 | |
| tone_indicators: dict = field(default_factory=dict) | |
| # Raw statistics | |
| total_words: int = 0 | |
| total_sentences: int = 0 | |
| total_posts: int = 0 | |
| def to_dict(self) -> dict: | |
| """Convert to dictionary for serialization.""" | |
| return { | |
| "vocabulary": { | |
| "size": self.vocabulary_size, | |
| "top_words": self.top_words, | |
| "top_bigrams": self.top_bigrams, | |
| "top_trigrams": self.top_trigrams, | |
| "unique_phrases": self.unique_phrases, | |
| "jargon_terms": self.jargon_terms, | |
| }, | |
| "sentence_structure": { | |
| "avg_sentence_length": self.avg_sentence_length, | |
| "sentence_length_std": self.sentence_length_std, | |
| "avg_words_per_sentence": self.avg_words_per_sentence, | |
| "complexity_score": self.sentence_complexity_score, | |
| }, | |
| "rhetorical_patterns": { | |
| "question_frequency": self.question_frequency, | |
| "exclamation_frequency": self.exclamation_frequency, | |
| "devices": self.rhetorical_devices, | |
| }, | |
| "topics": { | |
| "categories": self.topic_categories, | |
| "key_themes": self.key_themes, | |
| }, | |
| "tone": { | |
| "formality_score": self.formality_score, | |
| "confidence_score": self.confidence_score, | |
| "indicators": self.tone_indicators, | |
| }, | |
| "statistics": { | |
| "total_words": self.total_words, | |
| "total_sentences": self.total_sentences, | |
| "total_posts": self.total_posts, | |
| }, | |
| } | |
| def save(self, path: str | Path) -> None: | |
| """Save profile to JSON file.""" | |
| with open(path, "w", encoding="utf-8") as f: | |
| json.dump(self.to_dict(), f, indent=2, ensure_ascii=False) | |
| logger.info(f"Saved style profile to: {path}") | |
| def load(cls, path: str | Path) -> "StyleProfile": | |
| """Load profile from JSON file.""" | |
| with open(path, "r", encoding="utf-8") as f: | |
| data = json.load(f) | |
| profile = cls() | |
| profile.vocabulary_size = data["vocabulary"]["size"] | |
| profile.top_words = data["vocabulary"]["top_words"] | |
| profile.top_bigrams = data["vocabulary"]["top_bigrams"] | |
| profile.top_trigrams = data["vocabulary"]["top_trigrams"] | |
| profile.unique_phrases = data["vocabulary"]["unique_phrases"] | |
| profile.jargon_terms = data["vocabulary"]["jargon_terms"] | |
| profile.avg_sentence_length = data["sentence_structure"]["avg_sentence_length"] | |
| profile.sentence_length_std = data["sentence_structure"]["sentence_length_std"] | |
| profile.avg_words_per_sentence = data["sentence_structure"]["avg_words_per_sentence"] | |
| profile.sentence_complexity_score = data["sentence_structure"]["complexity_score"] | |
| profile.question_frequency = data["rhetorical_patterns"]["question_frequency"] | |
| profile.exclamation_frequency = data["rhetorical_patterns"]["exclamation_frequency"] | |
| profile.rhetorical_devices = data["rhetorical_patterns"]["devices"] | |
| profile.topic_categories = data["topics"]["categories"] | |
| profile.key_themes = data["topics"]["key_themes"] | |
| profile.formality_score = data["tone"]["formality_score"] | |
| profile.confidence_score = data["tone"]["confidence_score"] | |
| profile.tone_indicators = data["tone"]["indicators"] | |
| profile.total_words = data["statistics"]["total_words"] | |
| profile.total_sentences = data["statistics"]["total_sentences"] | |
| profile.total_posts = data["statistics"]["total_posts"] | |
| return profile | |
class StyleAnalyzer:
    """
    Analyze writing style from blog posts.

    Extracts patterns useful for:
    - Training data generation
    - Evaluation metrics
    - System prompt design

    Example:
        >>> analyzer = StyleAnalyzer()
        >>> profile = analyzer.analyze_posts(blog_posts)
        >>> print(f"Vocabulary size: {profile.vocabulary_size}")
    """

    # Common English stopwords (fallback if NLTK unavailable)
    BASIC_STOPWORDS = {
        "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
        "of", "with", "by", "from", "as", "is", "was", "are", "were", "been",
        "be", "have", "has", "had", "do", "does", "did", "will", "would",
        "could", "should", "may", "might", "must", "shall", "can", "need",
        "this", "that", "these", "those", "i", "you", "he", "she", "it",
        "we", "they", "what", "which", "who", "when", "where", "why", "how",
        "all", "each", "every", "both", "few", "more", "most", "other",
        "some", "such", "no", "nor", "not", "only", "own", "same", "so",
        "than", "too", "very", "just", "also", "now", "here", "there",
    }

    # Formal language indicators
    FORMAL_INDICATORS = [
        "therefore", "however", "moreover", "furthermore", "consequently",
        "nevertheless", "accordingly", "thus", "hence", "whereas",
        "notwithstanding", "albeit", "hitherto", "whereby", "therein",
    ]

    # Informal language indicators
    INFORMAL_INDICATORS = [
        "gonna", "wanna", "gotta", "kinda", "sorta", "yeah", "yep",
        "nope", "okay", "ok", "cool", "awesome", "basically", "literally",
        "actually", "honestly", "seriously", "totally", "super",
    ]

    # Confidence markers
    CONFIDENT_MARKERS = [
        "certainly", "definitely", "absolutely", "clearly", "obviously",
        "undoubtedly", "surely", "indeed", "precisely", "exactly",
        "will", "must", "always", "never", "every",
    ]

    # Hedging markers
    HEDGING_MARKERS = [
        "maybe", "perhaps", "possibly", "probably", "might", "could",
        "seems", "appears", "suggests", "tends", "somewhat", "rather",
        "fairly", "quite", "relatively", "generally", "typically",
    ]

    def __init__(self, language: str = "english"):
        """
        Initialize the style analyzer.

        Args:
            language: Language for tokenization and stopwords
        """
        self.language = language
        # Lazily fetch the NLTK resources we depend on; fall back to the
        # built-in stopword list when NLTK is not installed.
        if NLTK_AVAILABLE:
            try:
                nltk.data.find("tokenizers/punkt")
            except LookupError:
                logger.info("Downloading NLTK punkt tokenizer...")
                nltk.download("punkt", quiet=True)
                nltk.download("punkt_tab", quiet=True)
            try:
                nltk.data.find("corpora/stopwords")
            except LookupError:
                logger.info("Downloading NLTK stopwords...")
                nltk.download("stopwords", quiet=True)
            self.stopwords = set(stopwords.words(language))
        else:
            self.stopwords = self.BASIC_STOPWORDS

    def analyze_posts(self, posts: list) -> StyleProfile:
        """
        Analyze multiple blog posts and create a style profile.

        Args:
            posts: List of BlogPost objects (must expose .title and .content)

        Returns:
            StyleProfile with analyzed patterns. An empty post list yields
            an all-defaults profile (previously this raised
            ZeroDivisionError in the topic analysis).
        """
        logger.info(f"Analyzing style from {len(posts)} posts")
        profile = StyleProfile()
        profile.total_posts = len(posts)
        if not posts:
            # Nothing to analyze; return the empty profile rather than
            # dividing by zero in the per-post frequency calculations.
            return profile
        # Collect all text once; each pass below mutates `profile` in place.
        all_text = "\n\n".join(post.content for post in posts)
        all_sentences = self._tokenize_sentences(all_text)
        all_words = self._tokenize_words(all_text)
        profile.total_sentences = len(all_sentences)
        profile.total_words = len(all_words)
        # Vocabulary analysis
        self._analyze_vocabulary(all_words, profile)
        # N-gram analysis
        self._analyze_ngrams(all_words, profile)
        # Sentence structure analysis
        self._analyze_sentence_structure(all_sentences, profile)
        # Rhetorical patterns
        self._analyze_rhetorical_patterns(all_sentences, all_text, profile)
        # Tone analysis
        self._analyze_tone(all_words, profile)
        # Topic analysis
        self._analyze_topics(posts, profile)
        # Extract unique phrases
        self._extract_unique_phrases(all_text, profile)
        logger.info(f"Style analysis complete: {profile.vocabulary_size} unique words")
        return profile

    def _tokenize_sentences(self, text: str) -> list[str]:
        """Tokenize text into sentences (NLTK if available, else regex split)."""
        if NLTK_AVAILABLE:
            return sent_tokenize(text, language=self.language)
        else:
            # Basic sentence splitting on terminal punctuation.
            sentences = re.split(r"[.!?]+", text)
            return [s.strip() for s in sentences if s.strip()]

    def _tokenize_words(self, text: str) -> list[str]:
        """Tokenize text into lowercased words (NLTK if available, else regex)."""
        if NLTK_AVAILABLE:
            return word_tokenize(text.lower(), language=self.language)
        else:
            # Basic word splitting on word boundaries.
            return re.findall(r"\b\w+\b", text.lower())

    def _analyze_vocabulary(self, words: list[str], profile: StyleProfile) -> None:
        """Populate vocabulary size and the top-100 content words."""
        # Filter out stopwords, short tokens, and non-alphabetic tokens
        # (punctuation, numbers) so only content-bearing words remain.
        content_words = [
            w for w in words
            if w not in self.stopwords and len(w) > 2 and w.isalpha()
        ]
        word_counts = Counter(content_words)
        profile.vocabulary_size = len(word_counts)
        # Top 100 most common words
        profile.top_words = [
            {"word": word, "count": count}
            for word, count in word_counts.most_common(100)
        ]

    def _analyze_ngrams(self, words: list[str], profile: StyleProfile) -> None:
        """Populate top bigrams and trigrams (requires NLTK's ngrams helper)."""
        # Only alphabetic tokens participate in n-grams.
        filtered_words = [w for w in words if w.isalpha()]
        if NLTK_AVAILABLE:
            # Bigrams — drop only those where BOTH words are stopwords,
            # keeping e.g. "the market" but not "of the".
            bigram_counts = Counter(ngrams(filtered_words, 2))
            meaningful_bigrams = Counter({
                bg: count for bg, count in bigram_counts.items()
                if bg[0] not in self.stopwords or bg[1] not in self.stopwords
            })
            profile.top_bigrams = [
                {"bigram": " ".join(bg), "count": count}
                for bg, count in meaningful_bigrams.most_common(50)
            ]
            # Trigrams (no stopword filtering; three-word phrases are
            # usually meaningful even with function words inside)
            trigram_counts = Counter(ngrams(filtered_words, 3))
            profile.top_trigrams = [
                {"trigram": " ".join(tg), "count": count}
                for tg, count in trigram_counts.most_common(30)
            ]
        else:
            # No NLTK: skip n-gram extraction rather than approximate it.
            profile.top_bigrams = []
            profile.top_trigrams = []

    def _analyze_sentence_structure(
        self, sentences: list[str], profile: StyleProfile
    ) -> None:
        """Populate sentence length statistics and a complexity score."""
        if not sentences:
            return
        sentence_lengths = [len(s) for s in sentences]  # character lengths
        word_counts = [len(s.split()) for s in sentences]
        profile.avg_sentence_length = statistics.mean(sentence_lengths)
        # stdev requires at least two data points
        profile.sentence_length_std = (
            statistics.stdev(sentence_lengths) if len(sentence_lengths) > 1 else 0
        )
        profile.avg_words_per_sentence = statistics.mean(word_counts)
        # Complexity approximated by the coefficient of variation of
        # sentence length (more variation -> "more complex" style).
        if profile.avg_sentence_length > 0:
            profile.sentence_complexity_score = (
                profile.sentence_length_std / profile.avg_sentence_length
            )

    def _analyze_rhetorical_patterns(
        self, sentences: list[str], full_text: str, profile: StyleProfile
    ) -> None:
        """Populate question/exclamation frequencies and detected devices."""
        if not sentences:
            return
        # Question frequency
        questions = [s for s in sentences if s.strip().endswith("?")]
        profile.question_frequency = len(questions) / len(sentences)
        # Exclamation frequency
        exclamations = [s for s in sentences if s.strip().endswith("!")]
        profile.exclamation_frequency = len(exclamations) / len(sentences)
        devices = []
        # Anaphora: the same non-stopword opening 3+ sentences. Skip
        # empty/whitespace-only sentences so "" is never reported.
        start_counts = Counter(
            words[0].lower() for words in (s.split() for s in sentences) if words
        )
        repeated_starts = [
            word for word, count in start_counts.items()
            if count >= 3 and word not in self.stopwords
        ]
        if repeated_starts:
            devices.append({
                "device": "anaphora",
                "examples": repeated_starts[:5],
            })
        # Lists (bullet points, numbered lists)
        list_pattern = re.compile(r"^[\s]*[-*•]\s+|^[\s]*\d+[.)\]]\s+", re.MULTILINE)
        if list_pattern.search(full_text):
            devices.append({
                "device": "enumeration",
                "description": "Uses bullet points or numbered lists",
            })
        # Rhetorical questions, detected by trailing tag phrases.
        rhetorical_indicators = [
            "isn't it", "don't you think", "wouldn't you say", "right?",
            "correct?", "yes?", "no?",
        ]
        rhetorical_count = sum(
            1 for q in questions
            if any(ind in q.lower() for ind in rhetorical_indicators)
        )
        if rhetorical_count > 0:
            devices.append({
                "device": "rhetorical_questions",
                "count": rhetorical_count,
            })
        profile.rhetorical_devices = devices

    def _analyze_tone(self, words: list[str], profile: StyleProfile) -> None:
        """Populate formality/confidence scores and per-1000-word rates."""
        if not words:
            return
        word_count = len(words)
        # Formality score: share of formal markers among all
        # formality-indicating words (0.5 = neutral / no markers seen).
        formal_count = sum(1 for w in words if w in self.FORMAL_INDICATORS)
        informal_count = sum(1 for w in words if w in self.INFORMAL_INDICATORS)
        if formal_count + informal_count > 0:
            profile.formality_score = formal_count / (formal_count + informal_count)
        else:
            profile.formality_score = 0.5  # Neutral
        # Confidence score: confident markers vs hedging markers.
        confident_count = sum(1 for w in words if w in self.CONFIDENT_MARKERS)
        hedging_count = sum(1 for w in words if w in self.HEDGING_MARKERS)
        if confident_count + hedging_count > 0:
            profile.confidence_score = confident_count / (confident_count + hedging_count)
        else:
            profile.confidence_score = 0.5  # Neutral
        # Detailed tone indicators, normalized per 1000 words.
        profile.tone_indicators = {
            "formal_words_per_1000": (formal_count / word_count) * 1000,
            "informal_words_per_1000": (informal_count / word_count) * 1000,
            "confident_words_per_1000": (confident_count / word_count) * 1000,
            "hedging_words_per_1000": (hedging_count / word_count) * 1000,
        }

    def _analyze_topics(self, posts: list, profile: StyleProfile) -> None:
        """Populate topic category frequencies and the top key themes."""
        if not posts:
            # Guard against division by zero below.
            return
        # Simple keyword-based categorization
        categories = {
            "technology": ["ai", "technology", "digital", "software", "data", "tech", "machine", "algorithm"],
            "business": ["business", "company", "market", "strategy", "growth", "revenue", "customer"],
            "leadership": ["leadership", "team", "culture", "management", "vision", "values"],
            "innovation": ["innovation", "future", "change", "disruption", "transform", "new"],
            "personal": ["i", "my", "journey", "experience", "learned", "believe"],
        }
        category_counts = {cat: 0 for cat in categories}
        for post in posts:
            text = (post.title + " " + post.content).lower()
            # Match whole words only: a substring check would make short
            # keywords like "i" or "ai" match almost any text ("said",
            # "rain", ...), inflating every category.
            post_words = set(re.findall(r"\b\w+\b", text))
            for category, keywords in categories.items():
                if any(kw in post_words for kw in keywords):
                    category_counts[category] += 1
        total = len(posts)
        # Fraction of posts touching each category.
        profile.topic_categories = {
            cat: count / total for cat, count in category_counts.items()
        }
        # Key themes: the five most frequent categories.
        profile.key_themes = [
            cat for cat, _ in sorted(
                category_counts.items(), key=lambda x: x[1], reverse=True
            )[:5]
        ]

    def _extract_unique_phrases(self, text: str, profile: StyleProfile) -> None:
        """Populate signature phrases and likely jargon terms."""
        # Look for quoted phrases (potential signature quotes).
        quoted_counts = Counter(re.findall(r'"([^"]+)"', text))
        # Look for repeated multi-word capitalized phrases (potential
        # catchphrases or proper nouns).
        cap_counts = Counter(re.findall(r"[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+", text))
        unique_phrases = []
        # Repeated quotes of a reasonable length (3-8 words).
        for phrase, count in quoted_counts.most_common(10):
            if count >= 2 and 3 <= len(phrase.split()) <= 8:
                unique_phrases.append({
                    "phrase": phrase,
                    "count": count,
                    "type": "quoted",
                })
        # Repeated capitalized phrases.
        for phrase, count in cap_counts.most_common(10):
            if count >= 2:
                unique_phrases.append({
                    "phrase": phrase,
                    "count": count,
                    "type": "capitalized",
                })
        profile.unique_phrases = unique_phrases
        # Technical jargon heuristic: long words (>7 chars) repeated at
        # least 3 times that aren't stopwords.
        word_counts = Counter(self._tokenize_words(text))
        potential_jargon = [
            word for word, count in word_counts.items()
            if len(word) > 7 and count >= 3 and word not in self.stopwords
        ]
        profile.jargon_terms = potential_jargon[:20]
def main():
    """CLI entry point for testing the analyzer.

    Reads the input file as one combined "post", runs StyleAnalyzer over
    it, prints headline statistics, and optionally saves the full profile
    to JSON.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Analyze writing style from text files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    python style_analyzer.py input.txt --output style_profile.json
    python style_analyzer.py input.txt --verbose
""",
    )
    parser.add_argument("input", help="Input text file")
    parser.add_argument("--output", "-o", help="Output JSON file for style profile")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    args = parser.parse_args()
    analyzer = StyleAnalyzer()
    # Read input file
    with open(args.input, "r", encoding="utf-8") as f:
        text = f.read()
    # The relative import only works when this module is imported as part
    # of its package; when run directly as a script (as the epilog above
    # suggests) it raises "attempted relative import with no known parent
    # package", so fall back to an absolute import.
    try:
        from .blog_parser import BlogPost
    except ImportError:
        from blog_parser import BlogPost
    # Wrap the whole file in a single mock post for analysis.
    mock_post = BlogPost(
        title="Combined Content",
        content=text,
        raw_content=text,
        word_count=len(text.split()),
        char_count=len(text),
        index=0,
    )
    profile = analyzer.analyze_posts([mock_post])
    # Print summary
    print("\n=== Style Analysis Summary ===")
    print(f"Total words: {profile.total_words:,}")
    print(f"Total sentences: {profile.total_sentences:,}")
    print(f"Vocabulary size: {profile.vocabulary_size:,}")
    print(f"\nAvg sentence length: {profile.avg_sentence_length:.1f} chars")
    print(f"Avg words/sentence: {profile.avg_words_per_sentence:.1f}")
    print(f"\nFormality score: {profile.formality_score:.2f} (0=informal, 1=formal)")
    print(f"Confidence score: {profile.confidence_score:.2f} (0=hedging, 1=confident)")
    print(f"\nQuestion frequency: {profile.question_frequency:.1%}")
    print(f"Exclamation frequency: {profile.exclamation_frequency:.1%}")
    if args.verbose:
        print("\n=== Top Words ===")
        for item in profile.top_words[:20]:
            print(f"  {item['word']}: {item['count']}")
    if args.output:
        profile.save(args.output)
        print(f"\nSaved profile to: {args.output}")
# Run the CLI entry point when executed directly as a script.
if __name__ == "__main__":
    main()