# cryogenic22's picture
# Update utils.py
# e5de79f verified
# utils.py
import re
import json
import streamlit as st
def update_progress(container, percentage, message=""):
    """Render a progress bar as raw HTML inside the given container.

    Args:
        container: Any object exposing ``markdown(body, unsafe_allow_html=...)``.
        percentage: Fill level of the bar. Numeric values are clamped to the
            0-100 range so the generated CSS width is always valid.
        message: Text shown inside the bar. It is HTML-escaped before being
            embedded, because the markup is rendered with
            ``unsafe_allow_html=True``.
    """
    import html  # local import keeps this block self-contained

    if isinstance(percentage, (int, float)):
        width = max(0, min(100, percentage))
    else:
        # Non-numeric input is passed through as text, but made safe for use
        # inside the quoted style attribute.
        width = html.escape(str(percentage), quote=True)
    label = html.escape(str(message))

    # Lines are kept flush-left: indented HTML can be interpreted as a code
    # block by Markdown renderers.
    progress_html = (
        "\n"
        '<div class="progress-container">\n'
        f'<div class="progress-bar" style="width: {width}%">{label}</div>\n'
        "</div>\n"
    )
    container.markdown(progress_html, unsafe_allow_html=True)
def extract_section(text, section_name):
    """Extract the content that follows a named section in free-form text.

    Matching is case-insensitive and ``section_name`` is treated as literal
    text (regex metacharacters in it are escaped). Patterns are tried in
    order and the first one that yields non-empty content wins:

    1. the name alone on a heading line (only punctuation may follow it),
       with the content being the following paragraph;
    2. the name followed by ``:`` or whitespace, content up to the next
       blank line;
    3. ``name:`` with content up to the end of the line;
    4. the name with content up to the end of the line.

    Args:
        text: The text to search.
        section_name: Heading/label to look for, e.g. ``"Market Size"``.

    Returns:
        The stripped section content, ``"No <name> information found"`` when
        nothing matches, or ``"Error extracting <name>"`` when the inputs
        cannot be searched (for example ``text`` is not a string).
    """
    try:
        # Escape the name so characters such as "(" or "+" are matched
        # literally instead of being interpreted as regex syntax.
        name = re.escape(section_name)
        patterns = [
            # Restricting the rest of the heading line to punctuation keeps
            # inline values ("Market Size: $5B") from being skipped over.
            rf"{name}[^\w\n]*\n(.*?)(?=\n\n|$)",
            rf"{name}[:\s](.*?)(?=\n\n|$)",
            rf"{name}:\s*(.*?)(?=\n|$)",
            rf"{name}\s*(.*?)(?=\n|$)",
        ]
        flags = re.DOTALL | re.IGNORECASE
        for pattern in patterns:
            match = re.search(pattern, text, flags)
            if not match:
                continue
            content = match.group(1).strip()
            # An empty capture is not useful; let a later pattern try.
            if content:
                return content
        return f"No {section_name.lower()} information found"
    except (re.error, TypeError) as e:
        print(f"Error extracting {section_name}: {str(e)}")
        return f"Error extracting {section_name.lower()}"
def extract_sources(text):
    """Collect source/citation snippets from free-form text.

    The text is scanned for ``Source:``, ``Reference:`` and
    ``Retrieved from:`` lines, for bracketed fragments (``[...]``) and for
    parenthesised http(s) URLs. Results are grouped by pattern in that
    declaration order (``Source:`` lines, bracketed fragments, URLs,
    ``Reference:`` lines, ``Retrieved from:`` lines) and repeated snippets
    are reported only once.

    Args:
        text: The text to scan.

    Returns:
        A list of stripped snippets; ``["Sources not explicitly mentioned"]``
        when nothing is found; ``["Error extracting sources"]`` when ``text``
        cannot be searched (for example it is not a string).
    """
    patterns = (
        r"Source:.*?(?:\n|$)",
        r"\[.*?\]",
        r"\(https?://.*?\)",
        r"Reference:.*?(?:\n|$)",
        r"Retrieved from:.*?(?:\n|$)",
    )
    try:
        found = []
        for pattern in patterns:
            found.extend(
                match.group().strip()
                for match in re.finditer(pattern, text, re.MULTILINE)
            )
    except TypeError as e:
        print(f"Error extracting sources: {str(e)}")
        return ["Error extracting sources"]

    # dict.fromkeys drops duplicates (e.g. "[1]" cited several times) while
    # keeping first-seen order.
    unique_sources = list(dict.fromkeys(found))
    return unique_sources if unique_sources else ["Sources not explicitly mentioned"]
def format_json_output(raw_output):
    """Format a crew output object (or plain text) into a fixed report dict.

    The text is taken from ``raw_output.raw_output`` when that attribute
    exists, otherwise from ``str(raw_output)``. If the text contains a JSON
    object (first ``{`` to last ``}``), its fields are used; any field that
    is missing is derived from the raw text via ``extract_section`` /
    ``extract_sources``. If no JSON object can be parsed, every field is
    derived from the raw text.

    Args:
        raw_output: Result object or string to format.

    Returns:
        A dict with the keys ``exec_summary`` (itself holding ``summary``,
        ``market_size``, ``growth_rate`` and ``key_players``),
        ``detailed_report``, ``sources`` and ``metrics``. On an unexpected
        failure a placeholder structure with the same keys is returned
        instead of raising.
    """
    raw_text = None
    try:
        # Get raw text from output
        if hasattr(raw_output, 'raw_output'):
            raw_text = str(raw_output.raw_output)
        else:
            raw_text = str(raw_output)
        print("Raw text received:", raw_text[:500])  # Debug print

        # Try to find and parse a JSON structure embedded in the text
        parsed_json = None
        try:
            match = re.search(r"\{[\s\S]*\}", raw_text)
            if match:
                json_str = match.group()
                parsed_json = json.loads(json_str)
                print("Successfully parsed JSON:", json_str[:500])  # Debug print
        except json.JSONDecodeError as e:
            print(f"JSON parsing error: {str(e)}")  # Debug print

        if not isinstance(parsed_json, dict):
            # No usable JSON: an empty mapping makes every field below fall
            # back to extraction from the raw text.
            print("Creating structured format from raw text")  # Debug print
            parsed_json = {}

        # A non-dict exec_summary (e.g. a plain string or null) must not
        # discard the rest of the parsed payload.
        exec_summary = parsed_json.get('exec_summary')
        if not isinstance(exec_summary, dict):
            exec_summary = {}

        def summary_field(key, heading):
            # Only run the text extraction when the JSON lacks the field.
            if key in exec_summary:
                return exec_summary[key]
            return extract_section(raw_text, heading)

        if 'sources' in parsed_json:
            sources = parsed_json['sources']
        else:
            sources = extract_sources(raw_text)

        return {
            "exec_summary": {
                "summary": summary_field('summary', "Executive Summary"),
                "market_size": summary_field('market_size', "Market Size"),
                "growth_rate": summary_field('growth_rate', "Growth Rate"),
                "key_players": summary_field('key_players', "Key Players"),
            },
            "detailed_report": parsed_json.get('detailed_report', raw_text),
            "sources": sources,
            "metrics": parsed_json.get('metrics', {
                "market_size_data": [],
                "growth_rates": [],
                "market_shares": {},
            }),
        }
    except Exception as e:
        # Boundary handler: callers always receive a complete structure.
        print(f"Error in format_json_output: {str(e)}")  # Debug print
        # Return a safe default structure
        return {
            "exec_summary": {
                "summary": "Error processing report",
                "market_size": "Data not available",
                "growth_rate": "Data not available",
                "key_players": "Data not available",
            },
            "detailed_report": raw_text if raw_text is not None else str(raw_output),
            "sources": [],
            "metrics": {
                "market_size_data": [],
                "growth_rates": [],
                "market_shares": {},
            },
        }