Spaces:
Sleeping
Sleeping
| import csv | |
| import pandas as pd | |
| import io | |
| from collections import Counter, defaultdict | |
| import logging | |
| import sys | |
# Send every log record to stdout; no file handler is installed.
logging.basicConfig(
    handlers=[logging.StreamHandler(stream=sys.stdout)],
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger used by the analysis function in this file.
logger = logging.getLogger(__name__)
# Column layout of the two result frames. Passing these explicitly keeps the
# columns present even when the input contains no usable rows.
_NICHE_COLUMNS = ["Rank", "Niche", "Count", "Top Subniches", "Example Titles"]
_SUBNICHE_COLUMNS = ["Subniche", "Count", "Related Niches"]


def _split_subniches(raw_value) -> list:
    """Return the stripped, non-empty subniche names found in one CSV cell."""
    # pandas reads an empty cell as NaN; str(NaN) would otherwise be counted
    # as a bogus subniche literally named "nan".
    if pd.isna(raw_value):
        return []
    pieces = (piece.strip() for piece in str(raw_value).split(","))
    return [piece for piece in pieces if piece]


def _format_counts(pairs) -> str:
    """Render (name, count) pairs as 'name (count), name (count)'."""
    return ", ".join(f"{name} ({count})" for name, count in pairs)


def analyze_niches(input_data) -> tuple:
    """
    Analyze and rank niches in a categorized CSV file.

    Rows whose "Niche" cell is empty are skipped (a warning is logged with
    the number of skipped rows). Empty "Subniches" cells contribute no
    subniches, and empty "Title" cells are left out of the example titles.

    Args:
        input_data: Path or file-like object accepted by ``pandas.read_csv``
            containing the CSV data with "Title", "Niche" and "Subniches"
            columns. "Subniches" may hold several comma-separated values.

    Returns:
        tuple: (niche_df, subniche_df) DataFrames with analysis results.
            niche_df has the columns Rank, Niche, Count, Top Subniches and
            Example Titles, ordered by descending count. subniche_df has the
            columns Subniche, Count and Related Niches, ordered by
            descending count. Both frames keep these columns even when they
            contain no rows.

    Raises:
        ValueError: If one of the required columns is missing.
    """
    logger.info("Starting niche analysis...")

    df = pd.read_csv(input_data)

    # Fail early, with a clear message, when the CSV is not in the expected
    # shape.
    for col in ("Title", "Niche", "Subniches"):
        if col not in df.columns:
            error_msg = f"Column '{col}' not found in CSV file!"
            logger.error(error_msg)
            raise ValueError(error_msg)

    # Per-niche aggregate: row count, subniche frequencies, titles seen.
    niche_details = defaultdict(
        lambda: {"count": 0, "subniches": Counter(), "titles": []}
    )
    skipped_rows = 0
    for title, niche, raw_subniches in zip(
        df["Title"], df["Niche"], df["Subniches"]
    ):
        if pd.isna(niche):
            # An uncategorized row cannot be ranked under any niche.
            skipped_rows += 1
            continue
        details = niche_details[niche]
        details["count"] += 1
        if not pd.isna(title):
            # Titles may be parsed as numbers; store text so they can be
            # joined into the "Example Titles" column later.
            details["titles"].append(str(title))
        details["subniches"].update(_split_subniches(raw_subniches))

    if skipped_rows:
        logger.warning("Skipped %d row(s) with no Niche value", skipped_rows)

    # sorted() is stable, so niches with equal counts keep the order in which
    # they first appeared in the file.
    sorted_niches = sorted(
        niche_details.items(), key=lambda item: item[1]["count"], reverse=True
    )

    logger.info("\n=== NICHE RANKING BY FREQUENCY ===")
    logger.info(f"{'Rank':<5}{'Niche':<25}{'Count':<10}{'Top Subniches':<50}")
    logger.info("-" * 90)

    niche_data = []
    for rank, (niche, details) in enumerate(sorted_niches, 1):
        count = details["count"]
        top_five = details["subniches"].most_common(5)
        # The log table shows the top 3 only; the DataFrame keeps the top 5.
        top_three_str = _format_counts(top_five[:3])
        logger.info(f"{rank:<5}{niche!s:<25}{count:<10}{top_three_str:<50}")
        niche_data.append({
            "Rank": rank,
            "Niche": niche,
            "Count": count,
            "Top Subniches": _format_counts(top_five),
            "Example Titles": "; ".join(details["titles"][:3]),
        })
    niche_df = pd.DataFrame(niche_data, columns=_NICHE_COLUMNS)
    logger.info("Niche ranking analysis complete")

    logger.info("\n=== DETAILED SUBNICHE ANALYSIS ===")
    # One pass builds both the global subniche totals and the
    # subniche -> niches index, instead of rescanning every niche for every
    # subniche.
    all_subniches = Counter()
    related_niches = defaultdict(list)
    for niche, details in niche_details.items():
        for subniche, count in details["subniches"].items():
            all_subniches[subniche] += count
            related_niches[subniche].append(f"{niche} ({count})")

    logger.info("Total unique subniches: %s", len(all_subniches))
    logger.info("Top 10 most common subniches:")
    for subniche, count in all_subniches.most_common(10):
        logger.info(" - %s: %s", subniche, count)

    subniche_data = [
        {
            "Subniche": subniche,
            "Count": count,
            "Related Niches": ", ".join(related_niches[subniche]),
        }
        for subniche, count in all_subniches.most_common()
    ]
    subniche_df = pd.DataFrame(subniche_data, columns=_SUBNICHE_COLUMNS)
    logger.info("Subniche analysis complete")

    return niche_df, subniche_df