#!/usr/bin/env python3
"""
Process Blogs CLI
Parse raw blog content, segment into chunks, and analyze writing style.
This is the first step in the data processing pipeline.
Usage:
python scripts/process_blogs.py --input data/raw/blogs.txt --output data/processed/
Output:
- posts.json: Parsed blog posts with metadata
- segments.json: Text segments for training
- style_profile.json: Writing style analysis
"""
import argparse
import json
import sys
from pathlib import Path
# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn
from rich.table import Table
from src.data_processing.blog_parser import BlogParser
from src.data_processing.text_segmenter import TextSegmenter
from src.data_processing.style_analyzer import StyleAnalyzer
console = Console()
def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser for the blog-processing pipeline."""
    parser = argparse.ArgumentParser(
        description="Process raw blog content into training-ready data",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic usage
  python scripts/process_blogs.py --input data/raw/blogs.txt --output data/processed/

  # Custom segmentation
  python scripts/process_blogs.py --input data/raw/blogs.txt --output data/processed/ \\
      --target-tokens 256 --overlap 30

  # Skip style analysis
  python scripts/process_blogs.py --input data/raw/blogs.txt --output data/processed/ \\
      --skip-style-analysis

Input format (blogs.txt):
  === BLOG START ===
  Title of first blog post

  Content of first blog post...
  === BLOG END ===

  === BLOG START ===
  Title of second blog post

  Content...
  === BLOG END ===
""",
    )
    parser.add_argument(
        "--input", "-i",
        default="data/raw/blogs.txt",
        help="Path to blogs.txt file (default: data/raw/blogs.txt)",
    )
    parser.add_argument(
        "--output", "-o",
        default="data/processed/",
        help="Output directory (default: data/processed/)",
    )
    parser.add_argument(
        "--target-tokens",
        type=int,
        default=384,
        help="Target tokens per segment (default: 384)",
    )
    parser.add_argument(
        "--min-tokens",
        type=int,
        default=100,
        help="Minimum tokens per segment (default: 100)",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=512,
        help="Maximum tokens per segment (default: 512)",
    )
    parser.add_argument(
        "--overlap",
        type=int,
        default=50,
        help="Overlap tokens between segments (default: 50)",
    )
    parser.add_argument(
        "--min-post-length",
        type=int,
        default=100,
        help="Minimum characters for a valid post (default: 100)",
    )
    parser.add_argument(
        "--skip-style-analysis",
        action="store_true",
        help="Skip writing style analysis",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Verbose output",
    )
    return parser


def _save_json(data, path: Path) -> None:
    """Write *data* to *path* as human-readable, non-ASCII-escaped JSON."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)


def _show_posts_table(posts) -> None:
    """Render a summary table of the first 10 parsed posts (verbose mode)."""
    table = Table(title="Parsed Posts")
    table.add_column("Index", justify="right", style="cyan")
    table.add_column("Title", style="white")
    table.add_column("Words", justify="right", style="green")
    for post in posts[:10]:  # Show first 10
        # Truncate long titles so the table stays narrow.
        title = post.title[:50] + "..." if len(post.title) > 50 else post.title
        table.add_row(str(post.index), title, f"{post.word_count:,}")
    if len(posts) > 10:
        table.add_row("...", f"({len(posts) - 10} more)", "...")
    console.print(table)


def _analyze_style(posts, output_dir: Path, verbose: bool) -> None:
    """Run the writing-style analyzer over *posts* and persist the profile."""
    console.print("\n[yellow]Step 3:[/yellow] Analyzing writing style...")
    analyzer = StyleAnalyzer()
    profile = analyzer.analyze_posts(posts)

    profile_path = output_dir / "style_profile.json"
    profile.save(profile_path)
    console.print(f" [green]✓[/green] Vocabulary size: {profile.vocabulary_size:,} words")
    console.print(f" [green]✓[/green] Avg sentence length: {profile.avg_words_per_sentence:.1f} words")
    console.print(f" [green]✓[/green] Formality score: {profile.formality_score:.2f}")
    console.print(f" [green]✓[/green] Saved to: {profile_path}")

    if verbose and profile.top_words:
        console.print("\n [dim]Top 10 words:[/dim]")
        for item in profile.top_words[:10]:
            console.print(f" {item['word']}: {item['count']}")


def main() -> int:
    """Entry point: parse blogs, segment them, and optionally analyze style.

    Returns:
        Process exit status: 0 on success, 1 when the input file is
        missing or the parser fails.
    """
    args = _build_arg_parser().parse_args()

    # Validate input file before touching the output directory.
    input_path = Path(args.input)
    if not input_path.exists():
        console.print(f"[red]Error:[/red] Input file not found: {input_path}")
        console.print("\nPlease create the file with your blog content in this format:")
        console.print(" === BLOG START ===")
        console.print(" [Title]")
        console.print(" [Content...]")
        console.print(" === BLOG END ===")
        return 1

    # Create output directory
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    console.print("\n[bold blue]AI Executive - Blog Processor[/bold blue]")
    console.print("=" * 50)

    # Step 1: Parse raw blog text into structured posts.
    console.print("\n[yellow]Step 1:[/yellow] Parsing blog content...")
    blog_parser = BlogParser(min_content_length=args.min_post_length)
    try:
        posts = blog_parser.parse_file(input_path)
    except Exception as e:
        # Any parser failure is fatal for the whole pipeline: report and bail.
        console.print(f"[red]Error parsing blogs:[/red] {e}")
        return 1

    # Save posts
    posts_path = output_dir / "posts.json"
    _save_json([p.to_dict() for p in posts], posts_path)
    console.print(f" [green]✓[/green] Parsed {len(posts)} blog posts")
    console.print(f" [green]✓[/green] Saved to: {posts_path}")

    if args.verbose:
        _show_posts_table(posts)

    # Step 2: Chunk posts into token-bounded training segments.
    console.print("\n[yellow]Step 2:[/yellow] Segmenting into chunks...")
    segmenter = TextSegmenter(
        target_tokens=args.target_tokens,
        min_tokens=args.min_tokens,
        max_tokens=args.max_tokens,
        overlap_tokens=args.overlap,
    )
    segments = segmenter.segment_posts(posts)

    segments_path = output_dir / "segments.json"
    _save_json([s.to_dict() for s in segments], segments_path)
    console.print(f" [green]✓[/green] Created {len(segments)} segments")
    console.print(f" [green]✓[/green] Saved to: {segments_path}")

    # Segment statistics. min()/max() raise ValueError on an empty sequence,
    # so the range line is guarded just like the average already was.
    token_counts = [s.token_count for s in segments]
    avg_tokens = sum(token_counts) / len(token_counts) if token_counts else 0
    console.print(f" [dim]Average tokens/segment: {avg_tokens:.1f}[/dim]")
    if token_counts:
        console.print(f" [dim]Token range: {min(token_counts)} - {max(token_counts)}[/dim]")

    # Step 3: Style analysis (optional).
    if not args.skip_style_analysis:
        _analyze_style(posts, output_dir, args.verbose)
    else:
        console.print("\n[yellow]Step 3:[/yellow] [dim]Skipped style analysis[/dim]")

    # Summary
    console.print("\n" + "=" * 50)
    console.print("[bold green]Processing complete![/bold green]")
    console.print(f"\nOutput files in: {output_dir}")
    console.print(f" - posts.json ({len(posts)} posts)")
    console.print(f" - segments.json ({len(segments)} segments)")
    if not args.skip_style_analysis:
        console.print(" - style_profile.json")
    console.print("\n[dim]Next step: Generate Q&A training pairs[/dim]")
    console.print(f"[dim] python scripts/generate_training_data.py --input {segments_path}[/dim]")
    return 0
if __name__ == "__main__":
    # Use sys.exit rather than the exit() builtin: exit() is injected by the
    # `site` module for interactive use and is not guaranteed to exist when
    # the interpreter runs with site initialization disabled (python -S).
    sys.exit(main())