File size: 4,488 Bytes
ded29b0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import csv
import pandas as pd
import io
from collections import Counter, defaultdict
import logging
import sys

# Logging setup: the only handler attached here writes to standard output
# (no log file is created by this module).
logging.basicConfig(
    handlers=[logging.StreamHandler(stream=sys.stdout)],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Logger used throughout this module, named after the module itself
logger = logging.getLogger(__name__)

def _split_subniches(raw_value) -> list:
    """Return the stripped, non-empty subniche labels found in one CSV cell."""
    # pandas reads an empty cell as NaN; str(NaN) would otherwise be counted
    # as a bogus subniche literally named "nan".
    if pd.isna(raw_value):
        return []
    labels = (part.strip() for part in str(raw_value).split(","))
    return [label for label in labels if label]


def analyze_niches(input_data) -> tuple:
    """
    Analyze and rank niches in a categorized CSV file

    The CSV must provide the columns "Title", "Niche" and "Subniches".
    "Subniches" may hold several comma-separated labels per row. Progress and
    the ranking tables are written to the module logger.

    Rows whose "Niche" cell is missing are skipped (a warning reports how
    many). Missing "Subniches" cells contribute no subniche, and missing
    titles are left out of the example titles.

    Args:
        input_data: File-like object (or anything ``pandas.read_csv`` accepts)
            containing the CSV data with categorized titles

    Returns:
        tuple: (niche_df, subniche_df) DataFrames with analysis results.
            niche_df has the columns Rank, Niche, Count, Top Subniches
            (up to 5) and Example Titles (up to 3), ordered by descending
            count. subniche_df has the columns Subniche, Count and
            Related Niches, ordered by descending count. Both frames keep
            these columns even when there are no rows.

    Raises:
        ValueError: If one of the required columns is absent from the CSV.
    """
    logger.info("Starting niche analysis...")

    # Read CSV file into DataFrame
    df = pd.read_csv(input_data)

    # Ensure required columns exist
    required_columns = ["Title", "Niche", "Subniches"]
    for col in required_columns:
        if col not in df.columns:
            error_msg = f"Column '{col}' not found in CSV file!"
            logger.error(error_msg)
            raise ValueError(error_msg)

    # Single pass over the rows: per-niche row count, subniche frequencies
    # and titles. Dict insertion order (first appearance of each niche) is
    # what breaks ties in the rankings below.
    niche_details = defaultdict(lambda: {"count": 0, "subniches": Counter(), "titles": []})
    skipped_rows = 0
    for niche, title, raw_subniches in zip(df["Niche"], df["Title"], df["Subniches"]):
        if pd.isna(niche):
            # NaN is not a usable dictionary key (NaN != NaN), so such rows
            # cannot be grouped reliably.
            skipped_rows += 1
            continue

        details = niche_details[niche]
        details["count"] += 1
        if not pd.isna(title):
            # Titles may be parsed as numbers; store text so they can be joined.
            details["titles"].append(str(title))
        details["subniches"].update(_split_subniches(raw_subniches))

    if skipped_rows:
        logger.warning("Skipped %d row(s) with a missing Niche value", skipped_rows)

    # Sort niches by occurrence count (descending); the sort is stable, so
    # equally frequent niches stay in order of first appearance.
    sorted_niches = sorted(
        ((niche, details["count"]) for niche, details in niche_details.items()),
        key=lambda item: item[1],
        reverse=True,
    )

    # Log the ranking table and build the ranking rows in the same loop
    logger.info("\n=== NICHE RANKING BY FREQUENCY ===")
    logger.info(f"{'Rank':<5}{'Niche':<25}{'Count':<10}{'Top Subniches':<50}")
    logger.info("-" * 90)

    niche_data = []
    for rank, (niche, count) in enumerate(sorted_niches, 1):
        details = niche_details[niche]
        top_five = details["subniches"].most_common(5)

        # The log line shows only the top 3, which is a prefix of the top 5
        top_three_str = ", ".join(f"{name} ({freq})" for name, freq in top_five[:3])
        logger.info(f"{rank:<5}{niche:<25}{count:<10}{top_three_str:<50}")

        niche_data.append({
            "Rank": rank,
            "Niche": niche,
            "Count": count,
            "Top Subniches": ", ".join(f"{name} ({freq})" for name, freq in top_five),
            "Example Titles": "; ".join(details["titles"][:3]),
        })

    niche_df = pd.DataFrame(
        niche_data,
        columns=["Rank", "Niche", "Count", "Top Subniches", "Example Titles"],
    )
    logger.info("Niche ranking analysis complete")

    # Analyze subniches in more detail: global totals plus, for each
    # subniche, the niches it occurs in (reverse index built in one pass).
    logger.info("\n=== DETAILED SUBNICHE ANALYSIS ===")
    all_subniches = Counter()
    related_niches = defaultdict(list)
    for niche, details in niche_details.items():
        for subniche, count in details["subniches"].items():
            all_subniches[subniche] += count
            related_niches[subniche].append(f"{niche} ({count})")

    logger.info("Total unique subniches: %s", len(all_subniches))
    logger.info("Top 10 most common subniches:")
    for subniche, count in all_subniches.most_common(10):
        logger.info("  - %s: %s", subniche, count)

    # Create subniche details DataFrame
    subniche_data = [
        {
            "Subniche": subniche,
            "Count": count,
            "Related Niches": ", ".join(related_niches[subniche]),
        }
        for subniche, count in all_subniches.most_common()
    ]

    subniche_df = pd.DataFrame(
        subniche_data,
        columns=["Subniche", "Count", "Related Niches"],
    )
    logger.info("Subniche analysis complete")

    return niche_df, subniche_df