import csv
import io
import logging
import sys
from collections import Counter, defaultdict

import pandas as pd

# Configure logging to stdout only
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)

logger = logging.getLogger(__name__)

# Fixed output schemas, so that an input without data rows still yields
# DataFrames with the expected columns.
_NICHE_COLUMNS = ["Rank", "Niche", "Count", "Top Subniches", "Example Titles"]
_SUBNICHE_COLUMNS = ["Subniche", "Count", "Related Niches"]


def _split_subniches(raw_value) -> list:
    """Return the stripped, non-empty subniche names from one CSV cell.

    A missing cell (read by pandas as NaN) yields an empty list instead of
    the bogus subniche "nan" that ``str(NaN)`` would produce.
    """
    if pd.isna(raw_value):
        return []
    parts = (part.strip() for part in str(raw_value).split(","))
    return [part for part in parts if part]


def _format_counts(pairs) -> str:
    """Render ``(name, count)`` pairs as ``"name (count), name (count)"``."""
    return ", ".join(f"{name} ({count})" for name, count in pairs)


def analyze_niches(input_data) -> tuple:
    """
    Analyze and rank niches in a categorized CSV file

    Args:
        input_data: File-like object (or anything ``pandas.read_csv``
            accepts) containing the CSV data with categorized titles.
            The columns "Title", "Niche" and "Subniches" are required;
            "Subniches" may hold several comma-separated values.

    Returns:
        tuple: (niche_df, subniche_df) DataFrames with analysis results.
            niche_df has the columns Rank, Niche, Count, Top Subniches and
            Example Titles; subniche_df has Subniche, Count and
            Related Niches. Both keep these columns even when the input
            has no data rows.

    Raises:
        ValueError: If a required column is missing from the CSV.
    """
    logger.info("Starting niche analysis...")

    # Read CSV file into DataFrame
    df = pd.read_csv(input_data)

    # Ensure required columns exist
    required_columns = ["Title", "Niche", "Subniches"]
    for col in required_columns:
        if col not in df.columns:
            error_msg = f"Column '{col}' not found in CSV file!"
            logger.error(error_msg)
            raise ValueError(error_msg)

    # Detailed info about each niche, keyed in order of first appearance
    niche_details = defaultdict(
        lambda: {"count": 0, "subniches": Counter(), "titles": []}
    )

    for niche, title, raw_subniches in zip(df["Niche"], df["Title"], df["Subniches"]):
        details = niche_details[niche]
        details["count"] += 1
        # Titles are stored as text (and missing ones skipped) so that the
        # "; ".join below cannot fail on NaN or numeric titles.
        if not pd.isna(title):
            details["titles"].append(str(title))
        details["subniches"].update(_split_subniches(raw_subniches))

    # Sort niches by occurrence count (descending); the sort is stable, so
    # ties keep their first-appearance order.
    sorted_niches = sorted(
        ((niche, details["count"]) for niche, details in niche_details.items()),
        key=lambda item: item[1],
        reverse=True,
    )

    # Print results and build the niche ranking DataFrame in one pass
    logger.info("\n=== NICHE RANKING BY FREQUENCY ===")
    logger.info(f"{'Rank':<5}{'Niche':<25}{'Count':<10}{'Top Subniches':<50}")
    logger.info("-" * 90)

    niche_data = []
    for rank, (niche, count) in enumerate(sorted_niches, 1):
        details = niche_details[niche]

        # The log table shows the top 3 subniches, the DataFrame the top 5
        top_subniches_str = _format_counts(details["subniches"].most_common(3))
        logger.info(f"{rank:<5}{niche:<25}{count:<10}{top_subniches_str:<50}")

        niche_data.append({
            "Rank": rank,
            "Niche": niche,
            "Count": count,
            "Top Subniches": _format_counts(details["subniches"].most_common(5)),
            "Example Titles": "; ".join(details["titles"][:3]),
        })

    niche_df = pd.DataFrame(niche_data, columns=_NICHE_COLUMNS)
    logger.info("Niche ranking analysis complete")

    # Analyze subniches in more detail
    logger.info("\n=== DETAILED SUBNICHE ANALYSIS ===")

    # Aggregate global subniche counts and, in the same pass, remember which
    # niches each subniche occurs in (avoids rescanning every niche later).
    all_subniches = Counter()
    related_niches = defaultdict(list)
    for niche, details in niche_details.items():
        for subniche, count in details["subniches"].items():
            all_subniches[subniche] += count
            related_niches[subniche].append(f"{niche} ({count})")

    logger.info(f"Total unique subniches: {len(all_subniches)}")
    logger.info("Top 10 most common subniches:")
    for subniche, count in all_subniches.most_common(10):
        logger.info(f" - {subniche}: {count}")

    # Create subniche details DataFrame
    subniche_data = [
        {
            "Subniche": subniche,
            "Count": count,
            "Related Niches": ", ".join(related_niches[subniche]),
        }
        for subniche, count in all_subniches.most_common()
    ]

    subniche_df = pd.DataFrame(subniche_data, columns=_SUBNICHE_COLUMNS)
    logger.info("Subniche analysis complete")

    return niche_df, subniche_df