Spaces:
Sleeping
Sleeping
Update utils.py
Browse files
utils.py
CHANGED
|
@@ -14,49 +14,92 @@ def update_progress(container, percentage, message=""):
|
|
| 14 |
|
| 15 |
def extract_section(text, section_name):
|
| 16 |
"""Extract a section from the text"""
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
def extract_sources(text):
|
| 27 |
"""Extract sources from the text"""
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
def format_json_output(raw_output):
|
| 43 |
"""Format CrewOutput into proper JSON structure"""
|
| 44 |
try:
|
|
|
|
| 45 |
if hasattr(raw_output, 'raw_output'):
|
| 46 |
raw_text = str(raw_output.raw_output)
|
| 47 |
else:
|
| 48 |
raw_text = str(raw_output)
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
return {
|
| 61 |
"exec_summary": {
|
| 62 |
"summary": extract_section(raw_text, "Executive Summary"),
|
|
@@ -73,13 +116,14 @@ def format_json_output(raw_output):
|
|
| 73 |
}
|
| 74 |
}
|
| 75 |
except Exception as e:
|
| 76 |
-
|
|
|
|
| 77 |
return {
|
| 78 |
"exec_summary": {
|
| 79 |
-
"summary": "Error
|
| 80 |
-
"market_size": "
|
| 81 |
-
"growth_rate": "
|
| 82 |
-
"key_players": "
|
| 83 |
},
|
| 84 |
"detailed_report": raw_text if 'raw_text' in locals() else str(raw_output),
|
| 85 |
"sources": [],
|
|
|
|
| 14 |
|
| 15 |
def extract_section(text, section_name):
    """Extract the body of a named section from free-form report text.

    Several heading layouts are tried in order and the first one that
    matches wins:

    1. heading on its own line, body on the following lines up to the
       next blank line (or end of text);
    2. heading followed by a colon or whitespace, body up to the next
       blank line (or end of text);
    3. ``heading: value`` on a single line;
    4. ``heading value`` on a single line.

    Matching is case-insensitive. ``section_name`` is treated as literal
    text, so names containing regex metacharacters such as ``(``, ``[``
    or ``+`` are matched verbatim.

    Args:
        text: The report text to search.
        section_name: The heading to look for.

    Returns:
        The stripped section body, or ``"No <name> information found"``
        when no layout matches, or ``"Error extracting <name>"`` when the
        search itself fails (for example when ``text`` is not a string).
    """
    try:
        # Escape the heading so it is matched literally; without this a
        # name like "Market Size (USD)" is parsed as a regex group.
        heading = re.escape(section_name)

        # Raw f-strings keep "\s" and "\n" as regex escapes instead of
        # (invalid) Python string escapes.
        patterns = [
            rf"{heading}.*?\n(.*?)(?=\n\n|$)",
            rf"{heading}[:\s](.*?)(?=\n\n|$)",
            rf"{heading}:\s*(.*?)(?=\n|$)",
            rf"{heading}\s*(.*?)(?=\n|$)",
        ]

        for pattern in patterns:
            match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
            if match:
                return match.group(1).strip()

        return f"No {section_name.lower()} information found"
    except Exception as e:
        # Best-effort helper: callers receive a placeholder string
        # rather than an exception.
        print(f"Error extracting {section_name}: {str(e)}")
        return f"Error extracting {section_name.lower()}"
|
| 35 |
|
| 36 |
def extract_sources(text):
    """Collect source/citation snippets from free-form report text.

    The text is scanned for ``Source:``, ``Reference:`` and
    ``Retrieved from:`` lines, for bracketed spans such as ``[1]``, and
    for parenthesised http(s) URLs. Results are grouped by pattern (in
    the order listed below) and, within a pattern, by position in the
    text. Exact repeats are reported only once.

    Args:
        text: The report text to search.

    Returns:
        A list of stripped matches; ``["Sources not explicitly mentioned"]``
        when nothing matches; ``["Error extracting sources"]`` when the
        search itself fails (for example when ``text`` is not a string).
    """
    try:
        patterns = [
            r"Source:.*?(?:\n|$)",
            r"\[.*?\]",
            r"\(https?://.*?\)",
            r"Reference:.*?(?:\n|$)",
            r"Retrieved from:.*?(?:\n|$)",
        ]

        found = []
        for pattern in patterns:
            found.extend(
                match.group().strip()
                for match in re.finditer(pattern, text, re.MULTILINE)
            )

        # A citation marker reused in the text (e.g. "[1]" twice) should
        # be listed once; dict.fromkeys keeps first-seen order.
        sources = list(dict.fromkeys(found))

        return sources if sources else ["Sources not explicitly mentioned"]
    except Exception as e:
        # Best-effort helper: callers receive a placeholder list rather
        # than an exception.
        print(f"Error extracting sources: {str(e)}")
        return ["Error extracting sources"]
|
| 56 |
|
| 57 |
def format_json_output(raw_output):
|
| 58 |
"""Format CrewOutput into proper JSON structure"""
|
| 59 |
try:
|
| 60 |
+
# Get raw text from output
|
| 61 |
if hasattr(raw_output, 'raw_output'):
|
| 62 |
raw_text = str(raw_output.raw_output)
|
| 63 |
else:
|
| 64 |
raw_text = str(raw_output)
|
| 65 |
+
|
| 66 |
+
print("Raw text received:", raw_text[:500]) # Debug print
|
| 67 |
+
|
| 68 |
+
# Try to find and parse JSON structure
|
| 69 |
+
try:
|
| 70 |
+
json_pattern = r"\{[\s\S]*\}"
|
| 71 |
+
match = re.search(json_pattern, raw_text)
|
| 72 |
+
if match:
|
| 73 |
+
json_str = match.group()
|
| 74 |
+
parsed_json = json.loads(json_str)
|
| 75 |
+
print("Successfully parsed JSON:", json_str[:500]) # Debug print
|
| 76 |
+
|
| 77 |
+
# Ensure all required fields exist
|
| 78 |
+
if isinstance(parsed_json, dict):
|
| 79 |
+
return {
|
| 80 |
+
"exec_summary": {
|
| 81 |
+
"summary": parsed_json.get('exec_summary', {}).get('summary',
|
| 82 |
+
extract_section(raw_text, "Executive Summary")),
|
| 83 |
+
"market_size": parsed_json.get('exec_summary', {}).get('market_size',
|
| 84 |
+
extract_section(raw_text, "Market Size")),
|
| 85 |
+
"growth_rate": parsed_json.get('exec_summary', {}).get('growth_rate',
|
| 86 |
+
extract_section(raw_text, "Growth Rate")),
|
| 87 |
+
"key_players": parsed_json.get('exec_summary', {}).get('key_players',
|
| 88 |
+
extract_section(raw_text, "Key Players"))
|
| 89 |
+
},
|
| 90 |
+
"detailed_report": parsed_json.get('detailed_report', raw_text),
|
| 91 |
+
"sources": parsed_json.get('sources', extract_sources(raw_text)),
|
| 92 |
+
"metrics": parsed_json.get('metrics', {
|
| 93 |
+
"market_size_data": [],
|
| 94 |
+
"growth_rates": [],
|
| 95 |
+
"market_shares": {}
|
| 96 |
+
})
|
| 97 |
+
}
|
| 98 |
+
except json.JSONDecodeError as e:
|
| 99 |
+
print(f"JSON parsing error: {str(e)}") # Debug print
|
| 100 |
+
|
| 101 |
+
# If JSON parsing fails, create structured format from raw text
|
| 102 |
+
print("Creating structured format from raw text") # Debug print
|
| 103 |
return {
|
| 104 |
"exec_summary": {
|
| 105 |
"summary": extract_section(raw_text, "Executive Summary"),
|
|
|
|
| 116 |
}
|
| 117 |
}
|
| 118 |
except Exception as e:
|
| 119 |
+
print(f"Error in format_json_output: {str(e)}") # Debug print
|
| 120 |
+
# Return a safe default structure
|
| 121 |
return {
|
| 122 |
"exec_summary": {
|
| 123 |
+
"summary": "Error processing report",
|
| 124 |
+
"market_size": "Data not available",
|
| 125 |
+
"growth_rate": "Data not available",
|
| 126 |
+
"key_players": "Data not available"
|
| 127 |
},
|
| 128 |
"detailed_report": raw_text if 'raw_text' in locals() else str(raw_output),
|
| 129 |
"sources": [],
|