Spaces:
Sleeping
Sleeping
Stephen Zweibel committed on
Commit ·
ec0096e
1
Parent(s): 4435587
Update app for Hugging Face
Browse files - rule_extractor.py +25 -10
rule_extractor.py
CHANGED
|
@@ -202,21 +202,36 @@ def get_rules_from_url(url: str) -> str:
|
|
| 202 |
return f"Failed to extract rules from {url}. Both browser and HTTP extraction failed. Error: {str(e)}"
|
| 203 |
|
| 204 |
if result.success and result.extracted_content:
|
| 205 |
-
#
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
else:
|
| 211 |
-
|
| 212 |
-
return
|
| 213 |
-
|
| 214 |
# Store the raw data for debugging
|
| 215 |
-
logger.info(f"
|
| 216 |
|
| 217 |
# Format the rules for display
|
| 218 |
formatted_rules = format_rules_for_display(rules_data)
|
| 219 |
-
|
|
|
|
|
|
|
|
|
|
| 220 |
return formatted_rules
|
| 221 |
elif result.success and result.markdown:
|
| 222 |
# Fallback to markdown if structured extraction fails
|
|
|
|
| 202 |
return f"Failed to extract rules from {url}. Both browser and HTTP extraction failed. Error: {str(e)}"
|
| 203 |
|
| 204 |
if result.success and result.extracted_content:
|
| 205 |
+
# The extracted content is often a list containing a JSON string.
|
| 206 |
+
raw_data = result.extracted_content
|
| 207 |
+
if isinstance(raw_data, list) and len(raw_data) > 0:
|
| 208 |
+
raw_data = raw_data[0]
|
| 209 |
+
|
| 210 |
+
# Ensure we have a dictionary to work with
|
| 211 |
+
if isinstance(raw_data, str):
|
| 212 |
+
try:
|
| 213 |
+
rules_data = json.loads(raw_data)
|
| 214 |
+
# If the parsed data is a list, take the first element
|
| 215 |
+
if isinstance(rules_data, list) and len(rules_data) > 0:
|
| 216 |
+
rules_data = rules_data[0]
|
| 217 |
+
except json.JSONDecodeError:
|
| 218 |
+
logger.error(f"Failed to parse JSON from extracted content: {raw_data}")
|
| 219 |
+
return "Failed to parse the extracted formatting rules."
|
| 220 |
+
elif isinstance(raw_data, dict):
|
| 221 |
+
rules_data = raw_data
|
| 222 |
else:
|
| 223 |
+
logger.warning(f"Unexpected type for extracted content: {type(raw_data)}")
|
| 224 |
+
return "Could not process the extracted formatting rules."
|
| 225 |
+
|
| 226 |
# Store the raw data for debugging
|
| 227 |
+
logger.info(f"Parsed rules data: {json.dumps(rules_data, indent=2)}")
|
| 228 |
|
| 229 |
# Format the rules for display
|
| 230 |
formatted_rules = format_rules_for_display(rules_data)
|
| 231 |
+
if not formatted_rules:
|
| 232 |
+
return "Failed to format the extracted rules."
|
| 233 |
+
|
| 234 |
+
logger.info(f"Formatted rules: {formatted_rules[:100]}...")
|
| 235 |
return formatted_rules
|
| 236 |
elif result.success and result.markdown:
|
| 237 |
# Fallback to markdown if structured extraction fails
|