#!/usr/bin/env python3
"""
Process Blogs CLI

Parse raw blog content, segment into chunks, and analyze writing style.
This is the first step in the data processing pipeline.

Usage:
    python scripts/process_blogs.py --input data/raw/blogs.txt --output data/processed/

Output:
    - posts.json: Parsed blog posts with metadata
    - segments.json: Text segments for training
    - style_profile.json: Writing style analysis
"""

import argparse
import json
import sys
from pathlib import Path

# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn
from rich.table import Table

from src.data_processing.blog_parser import BlogParser
from src.data_processing.text_segmenter import TextSegmenter
from src.data_processing.style_analyzer import StyleAnalyzer

console = Console()


def main():
    """Run the blog processing pipeline: parse -> segment -> style-analyze.

    Reads the raw blog dump named by ``--input``, writes ``posts.json``,
    ``segments.json`` and (unless ``--skip-style-analysis``) a
    ``style_profile.json`` into the ``--output`` directory.

    Returns:
        int: process exit code — 0 on success, 1 on a missing input file
        or a parse failure.
    """
    parser = argparse.ArgumentParser(
        description="Process raw blog content into training-ready data",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic usage
  python scripts/process_blogs.py --input data/raw/blogs.txt --output data/processed/

  # Custom segmentation
  python scripts/process_blogs.py --input data/raw/blogs.txt --output data/processed/ \\
      --target-tokens 256 --overlap 30

  # Skip style analysis
  python scripts/process_blogs.py --input data/raw/blogs.txt --output data/processed/ \\
      --skip-style-analysis

Input format (blogs.txt):
  === BLOG START ===
  Title of first blog post

  Content of first blog post...
  === BLOG END ===

  === BLOG START ===
  Title of second blog post

  Content...
  === BLOG END ===
""",
    )
    parser.add_argument(
        "--input",
        "-i",
        default="data/raw/blogs.txt",
        help="Path to blogs.txt file (default: data/raw/blogs.txt)",
    )
    parser.add_argument(
        "--output",
        "-o",
        default="data/processed/",
        help="Output directory (default: data/processed/)",
    )
    parser.add_argument(
        "--target-tokens",
        type=int,
        default=384,
        help="Target tokens per segment (default: 384)",
    )
    parser.add_argument(
        "--min-tokens",
        type=int,
        default=100,
        help="Minimum tokens per segment (default: 100)",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=512,
        help="Maximum tokens per segment (default: 512)",
    )
    parser.add_argument(
        "--overlap",
        type=int,
        default=50,
        help="Overlap tokens between segments (default: 50)",
    )
    parser.add_argument(
        "--min-post-length",
        type=int,
        default=100,
        help="Minimum characters for a valid post (default: 100)",
    )
    parser.add_argument(
        "--skip-style-analysis",
        action="store_true",
        help="Skip writing style analysis",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Verbose output",
    )
    args = parser.parse_args()

    # Validate input file before doing any work.
    input_path = Path(args.input)
    if not input_path.exists():
        console.print(f"[red]Error:[/red] Input file not found: {input_path}")
        console.print("\nPlease create the file with your blog content in this format:")
        console.print(" === BLOG START ===")
        console.print(" [Title]")
        console.print(" [Content...]")
        console.print(" === BLOG END ===")
        return 1

    # Create output directory (idempotent).
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    console.print("\n[bold blue]AI Executive - Blog Processor[/bold blue]")
    console.print("=" * 50)

    # ---- Step 1: Parse blogs -------------------------------------------
    console.print("\n[yellow]Step 1:[/yellow] Parsing blog content...")
    blog_parser = BlogParser(min_content_length=args.min_post_length)
    try:
        posts = blog_parser.parse_file(input_path)
    except Exception as e:
        # Parser failures are user-facing (malformed input file), so report
        # and exit with a nonzero code rather than traceback.
        console.print(f"[red]Error parsing blogs:[/red] {e}")
        return 1

    # Save parsed posts as JSON.
    posts_data = [p.to_dict() for p in posts]
    posts_path = output_dir / "posts.json"
    with open(posts_path, "w", encoding="utf-8") as f:
        json.dump(posts_data, f, indent=2, ensure_ascii=False)

    console.print(f" [green]✓[/green] Parsed {len(posts)} blog posts")
    console.print(f" [green]✓[/green] Saved to: {posts_path}")

    # Optional per-post summary table (first 10 posts only).
    if args.verbose:
        table = Table(title="Parsed Posts")
        table.add_column("Index", justify="right", style="cyan")
        table.add_column("Title", style="white")
        table.add_column("Words", justify="right", style="green")
        for post in posts[:10]:  # Show first 10
            table.add_row(
                str(post.index),
                post.title[:50] + "..." if len(post.title) > 50 else post.title,
                f"{post.word_count:,}",
            )
        if len(posts) > 10:
            table.add_row("...", f"({len(posts) - 10} more)", "...")
        console.print(table)

    # ---- Step 2: Segment text ------------------------------------------
    console.print("\n[yellow]Step 2:[/yellow] Segmenting into chunks...")
    segmenter = TextSegmenter(
        target_tokens=args.target_tokens,
        min_tokens=args.min_tokens,
        max_tokens=args.max_tokens,
        overlap_tokens=args.overlap,
    )
    segments = segmenter.segment_posts(posts)

    # Save segments as JSON.
    segments_data = [s.to_dict() for s in segments]
    segments_path = output_dir / "segments.json"
    with open(segments_path, "w", encoding="utf-8") as f:
        json.dump(segments_data, f, indent=2, ensure_ascii=False)

    console.print(f" [green]✓[/green] Created {len(segments)} segments")
    console.print(f" [green]✓[/green] Saved to: {segments_path}")

    # Segment statistics. BUG FIX: min()/max() raise ValueError on an empty
    # sequence — only report the token range when at least one segment exists
    # (avg_tokens was already guarded, the range line was not).
    token_counts = [s.token_count for s in segments]
    avg_tokens = sum(token_counts) / len(token_counts) if token_counts else 0
    console.print(f" [dim]Average tokens/segment: {avg_tokens:.1f}[/dim]")
    if token_counts:
        console.print(f" [dim]Token range: {min(token_counts)} - {max(token_counts)}[/dim]")

    # ---- Step 3: Style analysis (optional) -----------------------------
    if not args.skip_style_analysis:
        console.print("\n[yellow]Step 3:[/yellow] Analyzing writing style...")
        analyzer = StyleAnalyzer()
        profile = analyzer.analyze_posts(posts)

        # Save profile.
        profile_path = output_dir / "style_profile.json"
        profile.save(profile_path)

        console.print(f" [green]✓[/green] Vocabulary size: {profile.vocabulary_size:,} words")
        console.print(f" [green]✓[/green] Avg sentence length: {profile.avg_words_per_sentence:.1f} words")
        console.print(f" [green]✓[/green] Formality score: {profile.formality_score:.2f}")
        console.print(f" [green]✓[/green] Saved to: {profile_path}")

        if args.verbose and profile.top_words:
            console.print("\n [dim]Top 10 words:[/dim]")
            for item in profile.top_words[:10]:
                console.print(f" {item['word']}: {item['count']}")
    else:
        console.print("\n[yellow]Step 3:[/yellow] [dim]Skipped style analysis[/dim]")

    # ---- Summary --------------------------------------------------------
    console.print("\n" + "=" * 50)
    console.print("[bold green]Processing complete![/bold green]")
    console.print(f"\nOutput files in: {output_dir}")
    console.print(f" - posts.json ({len(posts)} posts)")
    console.print(f" - segments.json ({len(segments)} segments)")
    if not args.skip_style_analysis:
        console.print(" - style_profile.json")

    console.print("\n[dim]Next step: Generate Q&A training pairs[/dim]")
    console.print(f"[dim] python scripts/generate_training_data.py --input {segments_path}[/dim]")

    return 0


if __name__ == "__main__":
    # BUG FIX: use sys.exit() — the bare exit() builtin is injected by the
    # `site` module and is not guaranteed to exist (e.g. under `python -S`
    # or in frozen executables).
    sys.exit(main())