Spaces:
Paused
Paused
#!/usr/bin/env python3
"""
Process Blogs CLI

Parse raw blog content, segment into chunks, and analyze writing style.
This is the first step in the data processing pipeline.

Usage:
    python scripts/process_blogs.py --input data/raw/blogs.txt --output data/processed/

Output:
    - posts.json: Parsed blog posts with metadata
    - segments.json: Text segments for training
    - style_profile.json: Writing style analysis
"""
import argparse
import json
import sys
from pathlib import Path

# Make the repository root importable so the `src.*` imports below resolve
# when this script is executed directly (e.g. `python scripts/process_blogs.py`).
sys.path.insert(0, str(Path(__file__).parent.parent))

from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn
from rich.table import Table

from src.data_processing.blog_parser import BlogParser
from src.data_processing.text_segmenter import TextSegmenter
from src.data_processing.style_analyzer import StyleAnalyzer

# Shared Rich console for all CLI output.
console = Console()
def main() -> int:
    """Run the blog-processing pipeline: parse, segment, and analyze style.

    Reads a delimited blogs.txt file, then writes posts.json and
    segments.json (plus style_profile.json unless --skip-style-analysis)
    into the output directory.

    Returns:
        Process exit code: 0 on success, 1 on a missing input file or a
        parse failure.
    """
    parser = argparse.ArgumentParser(
        description="Process raw blog content into training-ready data",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic usage
  python scripts/process_blogs.py --input data/raw/blogs.txt --output data/processed/

  # Custom segmentation
  python scripts/process_blogs.py --input data/raw/blogs.txt --output data/processed/ \\
      --target-tokens 256 --overlap 30

  # Skip style analysis
  python scripts/process_blogs.py --input data/raw/blogs.txt --output data/processed/ \\
      --skip-style-analysis

Input format (blogs.txt):
  === BLOG START ===
  Title of first blog post
  Content of first blog post...
  === BLOG END ===
  === BLOG START ===
  Title of second blog post
  Content...
  === BLOG END ===
""",
    )
    parser.add_argument(
        "--input", "-i",
        default="data/raw/blogs.txt",
        help="Path to blogs.txt file (default: data/raw/blogs.txt)",
    )
    parser.add_argument(
        "--output", "-o",
        default="data/processed/",
        help="Output directory (default: data/processed/)",
    )
    parser.add_argument(
        "--target-tokens",
        type=int,
        default=384,
        help="Target tokens per segment (default: 384)",
    )
    parser.add_argument(
        "--min-tokens",
        type=int,
        default=100,
        help="Minimum tokens per segment (default: 100)",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=512,
        help="Maximum tokens per segment (default: 512)",
    )
    parser.add_argument(
        "--overlap",
        type=int,
        default=50,
        help="Overlap tokens between segments (default: 50)",
    )
    parser.add_argument(
        "--min-post-length",
        type=int,
        default=100,
        help="Minimum characters for a valid post (default: 100)",
    )
    parser.add_argument(
        "--skip-style-analysis",
        action="store_true",
        help="Skip writing style analysis",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Verbose output",
    )
    args = parser.parse_args()

    # Validate the input file up front so we fail before creating output dirs.
    input_path = Path(args.input)
    if not input_path.exists():
        console.print(f"[red]Error:[/red] Input file not found: {input_path}")
        console.print("\nPlease create the file with your blog content in this format:")
        console.print("  === BLOG START ===")
        console.print("  [Title]")
        console.print("  [Content...]")
        console.print("  === BLOG END ===")
        return 1

    # Create output directory (idempotent).
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    console.print("\n[bold blue]AI Executive - Blog Processor[/bold blue]")
    console.print("=" * 50)

    # Step 1: Parse blogs
    console.print("\n[yellow]Step 1:[/yellow] Parsing blog content...")
    blog_parser = BlogParser(min_content_length=args.min_post_length)
    try:
        posts = blog_parser.parse_file(input_path)
    except Exception as e:
        # Parse failures are user-facing (malformed input file), so report
        # and exit with a nonzero code rather than tracebacking.
        console.print(f"[red]Error parsing blogs:[/red] {e}")
        return 1

    # Persist parsed posts as JSON (human-readable, non-ASCII preserved).
    posts_data = [p.to_dict() for p in posts]
    posts_path = output_dir / "posts.json"
    with open(posts_path, "w", encoding="utf-8") as f:
        json.dump(posts_data, f, indent=2, ensure_ascii=False)
    console.print(f" [green]✓[/green] Parsed {len(posts)} blog posts")
    console.print(f" [green]✓[/green] Saved to: {posts_path}")

    # Show a summary table of parsed posts (verbose mode only).
    if args.verbose:
        table = Table(title="Parsed Posts")
        table.add_column("Index", justify="right", style="cyan")
        table.add_column("Title", style="white")
        table.add_column("Words", justify="right", style="green")
        for post in posts[:10]:  # Show first 10
            table.add_row(
                str(post.index),
                post.title[:50] + "..." if len(post.title) > 50 else post.title,
                f"{post.word_count:,}",
            )
        if len(posts) > 10:
            table.add_row("...", f"({len(posts) - 10} more)", "...")
        console.print(table)

    # Step 2: Segment text
    console.print("\n[yellow]Step 2:[/yellow] Segmenting into chunks...")
    segmenter = TextSegmenter(
        target_tokens=args.target_tokens,
        min_tokens=args.min_tokens,
        max_tokens=args.max_tokens,
        overlap_tokens=args.overlap,
    )
    segments = segmenter.segment_posts(posts)

    # Save segments
    segments_data = [s.to_dict() for s in segments]
    segments_path = output_dir / "segments.json"
    with open(segments_path, "w", encoding="utf-8") as f:
        json.dump(segments_data, f, indent=2, ensure_ascii=False)
    console.print(f" [green]✓[/green] Created {len(segments)} segments")
    console.print(f" [green]✓[/green] Saved to: {segments_path}")

    # Show segment statistics. Bug fix: min()/max() raise ValueError on an
    # empty sequence, so guard the whole stats block — not just the average —
    # against a zero-segment result.
    token_counts = [s.token_count for s in segments]
    if token_counts:
        avg_tokens = sum(token_counts) / len(token_counts)
        console.print(f" [dim]Average tokens/segment: {avg_tokens:.1f}[/dim]")
        console.print(f" [dim]Token range: {min(token_counts)} - {max(token_counts)}[/dim]")
    else:
        console.print(" [dim]No segments produced; check segmentation settings.[/dim]")

    # Step 3: Style analysis (optional)
    if not args.skip_style_analysis:
        console.print("\n[yellow]Step 3:[/yellow] Analyzing writing style...")
        analyzer = StyleAnalyzer()
        profile = analyzer.analyze_posts(posts)

        # Save profile
        profile_path = output_dir / "style_profile.json"
        profile.save(profile_path)
        console.print(f" [green]✓[/green] Vocabulary size: {profile.vocabulary_size:,} words")
        console.print(f" [green]✓[/green] Avg sentence length: {profile.avg_words_per_sentence:.1f} words")
        console.print(f" [green]✓[/green] Formality score: {profile.formality_score:.2f}")
        console.print(f" [green]✓[/green] Saved to: {profile_path}")

        if args.verbose and profile.top_words:
            console.print("\n [dim]Top 10 words:[/dim]")
            for item in profile.top_words[:10]:
                console.print(f" {item['word']}: {item['count']}")
    else:
        console.print("\n[yellow]Step 3:[/yellow] [dim]Skipped style analysis[/dim]")

    # Summary
    console.print("\n" + "=" * 50)
    console.print("[bold green]Processing complete![/bold green]")
    console.print(f"\nOutput files in: {output_dir}")
    console.print(f" - posts.json ({len(posts)} posts)")
    console.print(f" - segments.json ({len(segments)} segments)")
    if not args.skip_style_analysis:
        console.print(" - style_profile.json")
    console.print("\n[dim]Next step: Generate Q&A training pairs[/dim]")
    console.print(f"[dim] python scripts/generate_training_data.py --input {segments_path}[/dim]")
    return 0
if __name__ == "__main__":
    # Use sys.exit rather than the interactive `exit` builtin: `exit` is
    # injected by the `site` module and is not guaranteed to exist when the
    # interpreter runs with -S; sys.exit is the supported way to propagate
    # the CLI exit code (sys is already imported at the top of this file).
    sys.exit(main())