Spaces:
Sleeping
Sleeping
Stephen Zweibel committed on
Commit ·
23943da
1
Parent(s): 95bad73
Update app for Hugging Face
Browse files- rule_extractor.py +23 -15
rule_extractor.py
CHANGED
|
@@ -20,7 +20,7 @@ def ensure_playwright_installed():
|
|
| 20 |
|
| 21 |
# Try to install Playwright browsers
|
| 22 |
result = subprocess.run([
|
| 23 |
-
sys.executable, "-m", "playwright", "install", "chromium"
|
| 24 |
], capture_output=True, text=True, timeout=300)
|
| 25 |
|
| 26 |
if result.returncode == 0:
|
|
@@ -92,15 +92,6 @@ def get_rules_from_url(url: str) -> str:
|
|
| 92 |
|
| 93 |
# Ensure Playwright is installed (especially for Hugging Face)
|
| 94 |
playwright_available = ensure_playwright_installed()
|
| 95 |
-
if not playwright_available:
|
| 96 |
-
logger.warning("Playwright installation failed, falling back to simple HTTP request")
|
| 97 |
-
try:
|
| 98 |
-
with httpx.Client() as client:
|
| 99 |
-
response = client.get(url, follow_redirects=True)
|
| 100 |
-
response.raise_for_status()
|
| 101 |
-
return f"# Formatting Rules (Simple Extraction)\n\nExtracted from: {url}\n\n{response.text[:2000]}...\n\n*Note: Advanced extraction failed, showing raw content. Please review manually.*"
|
| 102 |
-
except Exception as e:
|
| 103 |
-
return f"Failed to extract rules from {url}. Error: {str(e)}"
|
| 104 |
|
| 105 |
# Apply nest_asyncio here, when the function is called
|
| 106 |
nest_asyncio.apply()
|
|
@@ -112,13 +103,29 @@ def get_rules_from_url(url: str) -> str:
|
|
| 112 |
except ImportError as e:
|
| 113 |
logger.error(f"Failed to import crawl4ai: {e}")
|
| 114 |
return f"Failed to import required modules for web crawling. Error: {str(e)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
-
async def _extract_rules_async(url: str) -> str:
|
| 117 |
"""
|
| 118 |
Asynchronously extracts formatting rules from a given URL using crawl4ai.
|
|
|
|
| 119 |
"""
|
| 120 |
-
# Configure the browser
|
| 121 |
-
browser_config = BrowserConfig(verbose=True)
|
| 122 |
|
| 123 |
# Configure the LLM extraction
|
| 124 |
extraction_strategy = LLMExtractionStrategy(
|
|
@@ -152,7 +159,8 @@ def get_rules_from_url(url: str) -> str:
|
|
| 152 |
|
| 153 |
# Initialize the crawler and run
|
| 154 |
try:
|
| 155 |
-
|
|
|
|
| 156 |
try:
|
| 157 |
result = await crawler.arun(
|
| 158 |
url=url,
|
|
@@ -225,4 +233,4 @@ def get_rules_from_url(url: str) -> str:
|
|
| 225 |
return "Could not extract formatting rules from the provided URL. The crawler did not return any content."
|
| 226 |
|
| 227 |
# Run the async function using the patched event loop
|
| 228 |
-
return asyncio.run(_extract_rules_async(url))
|
|
|
|
| 20 |
|
| 21 |
# Try to install Playwright browsers
|
| 22 |
result = subprocess.run([
|
| 23 |
+
sys.executable, "-m", "playwright", "install", "chromium"
|
| 24 |
], capture_output=True, text=True, timeout=300)
|
| 25 |
|
| 26 |
if result.returncode == 0:
|
|
|
|
| 92 |
|
| 93 |
# Ensure Playwright is installed (especially for Hugging Face)
|
| 94 |
playwright_available = ensure_playwright_installed()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
# Apply nest_asyncio here, when the function is called
|
| 97 |
nest_asyncio.apply()
|
|
|
|
| 103 |
except ImportError as e:
|
| 104 |
logger.error(f"Failed to import crawl4ai: {e}")
|
| 105 |
return f"Failed to import required modules for web crawling. Error: {str(e)}"
|
| 106 |
+
|
| 107 |
+
if not playwright_available:
|
| 108 |
+
logger.warning("Playwright installation failed, falling back to simple HTTP request and raw HTML processing")
|
| 109 |
+
try:
|
| 110 |
+
with httpx.Client() as client:
|
| 111 |
+
response = client.get(url, follow_redirects=True)
|
| 112 |
+
response.raise_for_status()
|
| 113 |
+
raw_html = response.text
|
| 114 |
+
# Use crawl4ai to process the raw HTML
|
| 115 |
+
raw_html_url = f"raw:{raw_html}"
|
| 116 |
+
# We can reuse the async extraction logic here
|
| 117 |
+
return asyncio.run(_extract_rules_async(raw_html_url, use_browser=False))
|
| 118 |
+
except Exception as e:
|
| 119 |
+
return f"Failed to extract rules from {url} after Playwright failure. Error: {str(e)}"
|
| 120 |
+
|
| 121 |
|
| 122 |
+
async def _extract_rules_async(url: str, use_browser: bool = True) -> str:
|
| 123 |
"""
|
| 124 |
Asynchronously extracts formatting rules from a given URL using crawl4ai.
|
| 125 |
+
If use_browser is False, it will process raw HTML directly.
|
| 126 |
"""
|
| 127 |
+
# Configure the browser only if needed
|
| 128 |
+
browser_config = BrowserConfig(verbose=True) if use_browser else None
|
| 129 |
|
| 130 |
# Configure the LLM extraction
|
| 131 |
extraction_strategy = LLMExtractionStrategy(
|
|
|
|
| 159 |
|
| 160 |
# Initialize the crawler and run
|
| 161 |
try:
|
| 162 |
+
# Pass the browser_config to the crawler
|
| 163 |
+
async with AsyncWebCrawler(browser_config=browser_config) as crawler:
|
| 164 |
try:
|
| 165 |
result = await crawler.arun(
|
| 166 |
url=url,
|
|
|
|
| 233 |
return "Could not extract formatting rules from the provided URL. The crawler did not return any content."
|
| 234 |
|
| 235 |
# Run the async function using the patched event loop
|
| 236 |
+
return asyncio.run(_extract_rules_async(url, use_browser=True))
|