Spaces:
Paused
Paused
#!/usr/bin/env python3
"""
Process Blogs CLI

Parse raw blog content, segment into chunks, and analyze writing style.
This is the first step in the data processing pipeline.

Usage:
    python scripts/process_blogs.py --input data/raw/blogs.txt --output data/processed/

Output:
    - posts.json: Parsed blog posts with metadata
    - segments.json: Text segments for training
    - style_profile.json: Writing style analysis
"""
import argparse
import json
import sys
from pathlib import Path

# Make the repository root importable so the `src.*` imports below resolve
# when this script is executed directly (e.g. `python scripts/process_blogs.py`).
sys.path.insert(0, str(Path(__file__).parent.parent))

from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn
from rich.table import Table

from src.data_processing.blog_parser import BlogParser
from src.data_processing.text_segmenter import TextSegmenter
from src.data_processing.style_analyzer import StyleAnalyzer

# Shared Rich console for all CLI output.
console = Console()
def main() -> int:
    """Run the blog-processing pipeline: parse, segment, and analyze style.

    Reads a delimited blogs.txt file, then writes posts.json and
    segments.json (plus style_profile.json unless --skip-style-analysis)
    into the output directory.

    Returns:
        Process exit code: 0 on success, 1 on a missing input file or a
        parse failure.
    """
    parser = argparse.ArgumentParser(
        description="Process raw blog content into training-ready data",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic usage
  python scripts/process_blogs.py --input data/raw/blogs.txt --output data/processed/

  # Custom segmentation
  python scripts/process_blogs.py --input data/raw/blogs.txt --output data/processed/ \\
      --target-tokens 256 --overlap 30

  # Skip style analysis
  python scripts/process_blogs.py --input data/raw/blogs.txt --output data/processed/ \\
      --skip-style-analysis

Input format (blogs.txt):
  === BLOG START ===
  Title of first blog post
  Content of first blog post...
  === BLOG END ===
  === BLOG START ===
  Title of second blog post
  Content...
  === BLOG END ===
""",
    )
    parser.add_argument(
        "--input", "-i",
        default="data/raw/blogs.txt",
        help="Path to blogs.txt file (default: data/raw/blogs.txt)",
    )
    parser.add_argument(
        "--output", "-o",
        default="data/processed/",
        help="Output directory (default: data/processed/)",
    )
    parser.add_argument(
        "--target-tokens",
        type=int,
        default=384,
        help="Target tokens per segment (default: 384)",
    )
    parser.add_argument(
        "--min-tokens",
        type=int,
        default=100,
        help="Minimum tokens per segment (default: 100)",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=512,
        help="Maximum tokens per segment (default: 512)",
    )
    parser.add_argument(
        "--overlap",
        type=int,
        default=50,
        help="Overlap tokens between segments (default: 50)",
    )
    parser.add_argument(
        "--min-post-length",
        type=int,
        default=100,
        help="Minimum characters for a valid post (default: 100)",
    )
    parser.add_argument(
        "--skip-style-analysis",
        action="store_true",
        help="Skip writing style analysis",
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Verbose output",
    )
    args = parser.parse_args()

    # Validate the input file up front so we fail before creating output dirs.
    input_path = Path(args.input)
    if not input_path.exists():
        console.print(f"[red]Error:[/red] Input file not found: {input_path}")
        console.print("\nPlease create the file with your blog content in this format:")
        console.print("  === BLOG START ===")
        console.print("  [Title]")
        console.print("  [Content...]")
        console.print("  === BLOG END ===")
        return 1

    # Create output directory (idempotent).
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    console.print("\n[bold blue]AI Executive - Blog Processor[/bold blue]")
    console.print("=" * 50)

    # Step 1: Parse blogs
    console.print("\n[yellow]Step 1:[/yellow] Parsing blog content...")
    blog_parser = BlogParser(min_content_length=args.min_post_length)
    try:
        posts = blog_parser.parse_file(input_path)
    except Exception as e:
        # Parse failures are user-facing (malformed input file), so report
        # and exit with a nonzero code rather than tracebacking.
        console.print(f"[red]Error parsing blogs:[/red] {e}")
        return 1

    # Persist parsed posts as JSON (human-readable, non-ASCII preserved).
    posts_data = [p.to_dict() for p in posts]
    posts_path = output_dir / "posts.json"
    with open(posts_path, "w", encoding="utf-8") as f:
        json.dump(posts_data, f, indent=2, ensure_ascii=False)
    console.print(f" [green]✓[/green] Parsed {len(posts)} blog posts")
    console.print(f" [green]✓[/green] Saved to: {posts_path}")

    # Show a summary table of parsed posts (verbose mode only).
    if args.verbose:
        table = Table(title="Parsed Posts")
        table.add_column("Index", justify="right", style="cyan")
        table.add_column("Title", style="white")
        table.add_column("Words", justify="right", style="green")
        for post in posts[:10]:  # Show first 10
            table.add_row(
                str(post.index),
                post.title[:50] + "..." if len(post.title) > 50 else post.title,
                f"{post.word_count:,}",
            )
        if len(posts) > 10:
            table.add_row("...", f"({len(posts) - 10} more)", "...")
        console.print(table)

    # Step 2: Segment text
    console.print("\n[yellow]Step 2:[/yellow] Segmenting into chunks...")
    segmenter = TextSegmenter(
        target_tokens=args.target_tokens,
        min_tokens=args.min_tokens,
        max_tokens=args.max_tokens,
        overlap_tokens=args.overlap,
    )
    segments = segmenter.segment_posts(posts)

    # Save segments
    segments_data = [s.to_dict() for s in segments]
    segments_path = output_dir / "segments.json"
    with open(segments_path, "w", encoding="utf-8") as f:
        json.dump(segments_data, f, indent=2, ensure_ascii=False)
    console.print(f" [green]✓[/green] Created {len(segments)} segments")
    console.print(f" [green]✓[/green] Saved to: {segments_path}")

    # Show segment statistics. Bug fix: min()/max() raise ValueError on an
    # empty sequence, so guard the whole stats block — not just the average —
    # against a zero-segment result.
    token_counts = [s.token_count for s in segments]
    if token_counts:
        avg_tokens = sum(token_counts) / len(token_counts)
        console.print(f" [dim]Average tokens/segment: {avg_tokens:.1f}[/dim]")
        console.print(f" [dim]Token range: {min(token_counts)} - {max(token_counts)}[/dim]")
    else:
        console.print(" [dim]No segments produced; check segmentation settings.[/dim]")

    # Step 3: Style analysis (optional)
    if not args.skip_style_analysis:
        console.print("\n[yellow]Step 3:[/yellow] Analyzing writing style...")
        analyzer = StyleAnalyzer()
        profile = analyzer.analyze_posts(posts)

        # Save profile
        profile_path = output_dir / "style_profile.json"
        profile.save(profile_path)
        console.print(f" [green]✓[/green] Vocabulary size: {profile.vocabulary_size:,} words")
        console.print(f" [green]✓[/green] Avg sentence length: {profile.avg_words_per_sentence:.1f} words")
        console.print(f" [green]✓[/green] Formality score: {profile.formality_score:.2f}")
        console.print(f" [green]✓[/green] Saved to: {profile_path}")

        if args.verbose and profile.top_words:
            console.print("\n [dim]Top 10 words:[/dim]")
            for item in profile.top_words[:10]:
                console.print(f" {item['word']}: {item['count']}")
    else:
        console.print("\n[yellow]Step 3:[/yellow] [dim]Skipped style analysis[/dim]")

    # Summary
    console.print("\n" + "=" * 50)
    console.print("[bold green]Processing complete![/bold green]")
    console.print(f"\nOutput files in: {output_dir}")
    console.print(f" - posts.json ({len(posts)} posts)")
    console.print(f" - segments.json ({len(segments)} segments)")
    if not args.skip_style_analysis:
        console.print(" - style_profile.json")
    console.print("\n[dim]Next step: Generate Q&A training pairs[/dim]")
    console.print(f"[dim] python scripts/generate_training_data.py --input {segments_path}[/dim]")
    return 0
if __name__ == "__main__":
    # Use sys.exit rather than the interactive `exit` builtin: `exit` is
    # injected by the `site` module and is not guaranteed to exist when the
    # interpreter runs with -S; sys.exit is the supported way to propagate
    # the CLI exit code (sys is already imported at the top of this file).
    sys.exit(main())