# AI-Agent-Book/utils/analyzer.py
# Cuong2004's picture
# init project
# ded29b0
import csv
import pandas as pd
import io
from collections import Counter, defaultdict
import logging
import sys
# Configure logging to stdout only.
# NOTE: this runs at import time and targets the root logger; per the
# logging docs, basicConfig() does nothing if the root logger already has
# handlers configured.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

# Column layouts are pinned so that empty input still yields DataFrames
# with the expected columns instead of column-less frames.
_REQUIRED_COLUMNS = ["Title", "Niche", "Subniches"]
_NICHE_COLUMNS = ["Rank", "Niche", "Count", "Top Subniches", "Example Titles"]
_SUBNICHE_COLUMNS = ["Subniche", "Count", "Related Niches"]


def _format_counts(pairs) -> str:
    """Render (name, count) pairs as 'name (count), name (count), ...'."""
    return ", ".join(f"{name} ({count})" for name, count in pairs)


def analyze_niches(input_data) -> tuple:
    """
    Analyze and rank niches in a categorized CSV file.

    Rows whose "Niche" cell is empty are skipped (with a warning) because
    they cannot be attributed to any niche. Empty "Subniches" cells count as
    "no subniches", and empty "Title" cells are left out of the example
    titles but the row is still counted for its niche.

    Args:
        input_data: File-like object (anything ``pandas.read_csv`` accepts)
            containing CSV data with "Title", "Niche" and "Subniches"
            columns. "Subniches" may hold several comma-separated values.

    Returns:
        tuple: (niche_df, subniche_df) DataFrames with analysis results.
            niche_df has columns Rank, Niche, Count, Top Subniches (up to 5)
            and Example Titles (up to 3), ordered by descending count.
            subniche_df has columns Subniche, Count and Related Niches,
            ordered by descending count.

    Raises:
        ValueError: If one of the required columns is missing.
    """
    logger.info("Starting niche analysis...")

    # Read CSV file into DataFrame
    df = pd.read_csv(input_data)

    # Ensure required columns exist
    for col in _REQUIRED_COLUMNS:
        if col not in df.columns:
            error_msg = f"Column '{col}' not found in CSV file!"
            logger.error(error_msg)
            raise ValueError(error_msg)

    # A missing niche is read as NaN; ranking it would produce a bogus "nan"
    # niche, so such rows are dropped up front.
    missing_niche = df["Niche"].isna()
    if missing_niche.any():
        logger.warning(
            "Skipping %d row(s) with no Niche value", int(missing_niche.sum())
        )
        df = df[~missing_niche]

    # Count occurrences of each niche (insertion order = first appearance)
    niche_counter = Counter(df["Niche"])

    # Per-niche details: subniche frequencies and the titles seen
    niche_details = defaultdict(lambda: {"subniches": Counter(), "titles": []})
    for niche, title, raw_subniches in zip(
        df["Niche"], df["Title"], df["Subniches"]
    ):
        details = niche_details[niche]
        if not pd.isna(title):
            # str() so numeric-looking titles can be joined later
            details["titles"].append(str(title))
        if pd.isna(raw_subniches):
            # Empty cell: str(NaN) would otherwise be counted as "nan"
            continue
        for part in str(raw_subniches).split(","):
            subniche = part.strip()
            if subniche:  # Skip empty subniches
                details["subniches"][subniche] += 1

    # Sort niches by occurrence count (descending); the sort is stable, so
    # ties keep first-appearance order.
    sorted_niches = niche_counter.most_common()

    # Log the ranking table
    logger.info("\n=== NICHE RANKING BY FREQUENCY ===")
    logger.info(f"{'Rank':<5}{'Niche':<25}{'Count':<10}{'Top Subniches':<50}")
    logger.info("-" * 90)
    for rank, (niche, count) in enumerate(sorted_niches, 1):
        top_three = _format_counts(niche_details[niche]["subniches"].most_common(3))
        logger.info(f"{rank:<5}{niche!s:<25}{count:<10}{top_three:<50}")

    # Create niche ranking DataFrame
    niche_data = [
        {
            "Rank": rank,
            "Niche": niche,
            "Count": count,
            "Top Subniches": _format_counts(
                niche_details[niche]["subniches"].most_common(5)
            ),
            "Example Titles": "; ".join(niche_details[niche]["titles"][:3]),
        }
        for rank, (niche, count) in enumerate(sorted_niches, 1)
    ]
    niche_df = pd.DataFrame(niche_data, columns=_NICHE_COLUMNS)
    logger.info("Niche ranking analysis complete")

    # Analyze subniches in more detail
    logger.info("\n=== DETAILED SUBNICHE ANALYSIS ===")
    all_subniches = Counter()
    # Inverse index built in the same pass: subniche -> ["niche (count)", ...]
    related_niches = defaultdict(list)
    for niche, details in niche_details.items():
        for subniche, count in details["subniches"].items():
            all_subniches[subniche] += count
            related_niches[subniche].append(f"{niche} ({count})")

    logger.info(f"Total unique subniches: {len(all_subniches)}")
    logger.info("Top 10 most common subniches:")
    for subniche, count in all_subniches.most_common(10):
        logger.info(f" - {subniche}: {count}")

    # Create subniche details DataFrame
    subniche_data = [
        {
            "Subniche": subniche,
            "Count": count,
            "Related Niches": ", ".join(related_niches[subniche]),
        }
        for subniche, count in all_subniches.most_common()
    ]
    subniche_df = pd.DataFrame(subniche_data, columns=_SUBNICHE_COLUMNS)
    logger.info("Subniche analysis complete")

    return niche_df, subniche_df