Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
- rule_extractor.py +39 -39
rule_extractor.py
CHANGED
|
@@ -200,46 +200,46 @@ def get_rules_from_url(url: str) -> str:
|
|
| 200 |
except Exception as fallback_e:
|
| 201 |
logger.error(f"Fallback HTTP request also failed: {fallback_e}")
|
| 202 |
return f"Failed to extract rules from {url}. Both browser and HTTP extraction failed. Error: {str(e)}"
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
else:
|
| 223 |
-
logger.warning(f"Unexpected type for extracted content: {type(raw_data)}")
|
| 224 |
-
return "Could not process the extracted formatting rules."
|
| 225 |
-
|
| 226 |
-
# Store the raw data for debugging
|
| 227 |
-
logger.info(f"Parsed rules data: {json.dumps(rules_data, indent=2)}")
|
| 228 |
-
|
| 229 |
-
# Format the rules for display
|
| 230 |
-
formatted_rules = format_rules_for_display(rules_data)
|
| 231 |
-
if not formatted_rules:
|
| 232 |
-
return "Failed to format the extracted rules."
|
| 233 |
-
|
| 234 |
-
logger.info(f"Formatted rules: {formatted_rules[:100]}...")
|
| 235 |
-
return formatted_rules
|
| 236 |
-
elif result.success and result.markdown:
|
| 237 |
-
# Fallback to markdown if structured extraction fails
|
| 238 |
-
logger.info(f"Extraction failed, falling back to markdown for {url}")
|
| 239 |
-
return result.markdown
|
| 240 |
else:
|
| 241 |
-
logger.warning(f"
|
| 242 |
-
return "Could not
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
|
| 244 |
# Run the async function using the patched event loop
|
| 245 |
return asyncio.run(_extract_rules_async(url))
|
|
|
|
| 200 |
except Exception as fallback_e:
|
| 201 |
logger.error(f"Fallback HTTP request also failed: {fallback_e}")
|
| 202 |
return f"Failed to extract rules from {url}. Both browser and HTTP extraction failed. Error: {str(e)}"
|
| 203 |
+
|
| 204 |
+
if result.success and result.extracted_content:
|
| 205 |
+
# The extracted content is often a list containing a JSON string.
|
| 206 |
+
raw_data = result.extracted_content
|
| 207 |
+
if isinstance(raw_data, list) and len(raw_data) > 0:
|
| 208 |
+
raw_data = raw_data[0]
|
| 209 |
+
|
| 210 |
+
# Ensure we have a dictionary to work with
|
| 211 |
+
if isinstance(raw_data, str):
|
| 212 |
+
try:
|
| 213 |
+
rules_data = json.loads(raw_data)
|
| 214 |
+
# If the parsed data is a list, take the first element
|
| 215 |
+
if isinstance(rules_data, list) and len(rules_data) > 0:
|
| 216 |
+
rules_data = rules_data[0]
|
| 217 |
+
except json.JSONDecodeError:
|
| 218 |
+
logger.error(f"Failed to parse JSON from extracted content: {raw_data}")
|
| 219 |
+
return "Failed to parse the extracted formatting rules."
|
| 220 |
+
elif isinstance(raw_data, dict):
|
| 221 |
+
rules_data = raw_data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
else:
|
| 223 |
+
logger.warning(f"Unexpected type for extracted content: {type(raw_data)}")
|
| 224 |
+
return "Could not process the extracted formatting rules."
|
| 225 |
+
|
| 226 |
+
# Store the raw data for debugging
|
| 227 |
+
logger.info(f"Parsed rules data: {json.dumps(rules_data, indent=2)}")
|
| 228 |
+
|
| 229 |
+
# Format the rules for display
|
| 230 |
+
formatted_rules = format_rules_for_display(rules_data)
|
| 231 |
+
if not formatted_rules:
|
| 232 |
+
return "Failed to format the extracted rules."
|
| 233 |
+
|
| 234 |
+
logger.info(f"Formatted rules: {formatted_rules[:100]}...")
|
| 235 |
+
return formatted_rules
|
| 236 |
+
elif result.success and result.markdown:
|
| 237 |
+
# Fallback to markdown if structured extraction fails
|
| 238 |
+
logger.info(f"Extraction failed, falling back to markdown for {url}")
|
| 239 |
+
return result.markdown
|
| 240 |
+
else:
|
| 241 |
+
logger.warning(f"Failed to extract rules or markdown for {url}. Crawler success: {result.success}")
|
| 242 |
+
return "Could not extract formatting rules from the provided URL. The crawler did not return any content."
|
| 243 |
|
| 244 |
# Run the async function using the patched event loop
|
| 245 |
return asyncio.run(_extract_rules_async(url))
|