File size: 5,498 Bytes
f3ed66a
2b474dd
f3ed66a
 
 
2b474dd
 
 
 
 
 
 
 
f3ed66a
2b474dd
 
e5de79f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f3ed66a
2b474dd
 
e5de79f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f3ed66a
2b474dd
 
 
e5de79f
2b474dd
 
 
 
e5de79f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b474dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e5de79f
 
2b474dd
 
e5de79f
 
 
 
2b474dd
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# utils.py
import re
import json
import streamlit as st

def update_progress(container, percentage, message=""):
    """Render an HTML progress bar into ``container``.

    Args:
        container: Object exposing ``markdown(body, unsafe_allow_html=...)``
            (presumably a Streamlit placeholder -- confirm against callers).
        percentage: Value interpolated as the bar's CSS width in percent.
        message: Text placed inside the bar; inserted into the markup as-is.
    """
    # Assembled line by line so the emitted markup (including its
    # indentation) is explicit rather than depending on source layout.
    markup = (
        "\n"
        '        <div class="progress-container">\n'
        f'            <div class="progress-bar" style="width: {percentage}%">{message}</div>\n'
        "        </div>\n"
        "    "
    )
    container.markdown(markup, unsafe_allow_html=True)

def extract_section(text, section_name):
    """Extract the content of a named section from free-form text.

    The section name is matched literally and case-insensitively. Several
    layouts are tried in order and the first one that matches wins:

    1. the name, the rest of its line, then everything up to the next
       blank line (or end of text);
    2. the name followed by ``:`` or whitespace, then everything up to the
       next blank line (or end of text);
    3. ``Name: value`` up to the end of the line;
    4. ``Name value`` up to the end of the line.

    Args:
        text: The text to search.
        section_name: Human-readable section title, e.g. "Market Size".

    Returns:
        The stripped section content; ``"No <name> information found"`` when
        no layout matches; ``"Error extracting <name>"`` when the search
        itself fails (e.g. ``text`` is not a string).
    """
    try:
        # Escape the title so characters such as "(", "+", "?" or "[" are
        # matched literally instead of being interpreted as regex syntax
        # (an unescaped "(USD)" would otherwise become a capture group).
        name = re.escape(section_name)

        # Raw f-strings keep "\s" / "\n" as regex escapes rather than
        # (invalid) Python string escapes.
        patterns = (
            rf"{name}.*?\n(.*?)(?=\n\n|$)",
            rf"{name}[:\s](.*?)(?=\n\n|$)",
            rf"{name}:\s*(.*?)(?=\n|$)",
            rf"{name}\s*(.*?)(?=\n|$)",
        )

        for pattern in patterns:
            match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
            if match:
                return match.group(1).strip()

        return f"No {section_name.lower()} information found"
    except (TypeError, re.error) as e:
        # Best-effort helper: report the problem and hand back a placeholder
        # instead of propagating to the caller.
        print(f"Error extracting {section_name}: {str(e)}")
        return f"Error extracting {section_name.lower()}"

def extract_sources(text):
    """Collect citation-like snippets from ``text``.

    Matches are gathered pattern by pattern (not in document order): first
    ``Source:`` lines, then bracketed spans, parenthesised http(s) URLs,
    ``Reference:`` lines and ``Retrieved from:`` lines. Each hit is stripped
    of surrounding whitespace.

    Returns:
        The list of hits; ``["Sources not explicitly mentioned"]`` when there
        are none; ``["Error extracting sources"]`` if the search raises.
    """
    source_patterns = (
        r"Source:.*?(?:\n|$)",
        r"\[.*?\]",
        r"\(https?://.*?\)",
        r"Reference:.*?(?:\n|$)",
        r"Retrieved from:.*?(?:\n|$)",
    )
    try:
        found = [
            hit.group().strip()
            for source_pattern in source_patterns
            for hit in re.finditer(source_pattern, text, re.MULTILINE)
        ]
    except Exception as e:
        print(f"Error extracting sources: {str(e)}")
        return ["Error extracting sources"]

    if found:
        return found
    return ["Sources not explicitly mentioned"]

def _empty_metrics():
    """Return a fresh, empty metrics mapping (new containers on every call)."""
    return {
        "market_size_data": [],
        "growth_rates": [],
        "market_shares": {}
    }


def _parse_embedded_json(raw_text):
    """Return the JSON object embedded in ``raw_text``, or None if absent or invalid."""
    # Greedy match: from the first "{" to the last "}" in the text.
    match = re.search(r"\{[\s\S]*\}", raw_text)
    if not match:
        return None

    json_str = match.group()
    try:
        parsed_json = json.loads(json_str)
    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {str(e)}")  # Debug print
        return None

    print("Successfully parsed JSON:", json_str[:500])  # Debug print
    return parsed_json if isinstance(parsed_json, dict) else None


def _build_exec_summary(parsed_summary, raw_text):
    """Fill the executive-summary fields, preferring values from parsed JSON."""
    # A non-dict value (string, null, list, ...) carries no usable fields;
    # treat it as empty so every field falls back to text extraction instead
    # of raising AttributeError and discarding the whole report.
    if not isinstance(parsed_summary, dict):
        parsed_summary = {}

    headings = (
        ("summary", "Executive Summary"),
        ("market_size", "Market Size"),
        ("growth_rate", "Growth Rate"),
        ("key_players", "Key Players"),
    )
    # The regex fallback is evaluated only for fields that are really missing.
    return {
        key: parsed_summary[key] if key in parsed_summary
        else extract_section(raw_text, heading)
        for key, heading in headings
    }


def format_json_output(raw_output):
    """Format CrewOutput into proper JSON structure.

    The text is taken from ``raw_output.raw_output`` when that attribute
    exists, otherwise from ``str(raw_output)``. If the text contains a JSON
    object, its ``exec_summary``, ``detailed_report``, ``sources`` and
    ``metrics`` entries are used; anything missing is derived from the raw
    text (section/source extraction) or filled with empty defaults.

    Args:
        raw_output: The crew result object, or anything convertible with
            ``str()``.

    Returns:
        dict with the keys ``exec_summary`` (``summary``, ``market_size``,
        ``growth_rate``, ``key_players``), ``detailed_report``, ``sources``
        and ``metrics``. On an unexpected error a placeholder structure with
        the same keys is returned instead of raising.
    """
    raw_text = None
    try:
        # Get raw text from output
        if hasattr(raw_output, 'raw_output'):
            raw_text = str(raw_output.raw_output)
        else:
            raw_text = str(raw_output)

        print("Raw text received:", raw_text[:500])  # Debug print

        parsed_json = _parse_embedded_json(raw_text)
        if parsed_json is None:
            # No usable JSON: every field below falls back to the raw text.
            print("Creating structured format from raw text")  # Debug print
            parsed_json = {}

        return {
            "exec_summary": _build_exec_summary(
                parsed_json.get('exec_summary'), raw_text
            ),
            "detailed_report": parsed_json.get('detailed_report', raw_text),
            "sources": (
                parsed_json['sources'] if 'sources' in parsed_json
                else extract_sources(raw_text)
            ),
            "metrics": parsed_json.get('metrics', _empty_metrics()),
        }
    except Exception as e:
        # Boundary handler: callers always receive a well-formed structure.
        print(f"Error in format_json_output: {str(e)}")  # Debug print
        # Return a safe default structure
        return {
            "exec_summary": {
                "summary": "Error processing report",
                "market_size": "Data not available",
                "growth_rate": "Data not available",
                "key_players": "Data not available"
            },
            "detailed_report": raw_text if raw_text is not None else str(raw_output),
            "sources": [],
            "metrics": _empty_metrics()
        }