Spaces:
Sleeping
Sleeping
Stephen Zweibel committed on
Commit ·
23943da
1
Parent(s): 95bad73
Update app for Hugging Face
Browse files- rule_extractor.py +23 -15
rule_extractor.py
CHANGED
|
@@ -20,7 +20,7 @@ def ensure_playwright_installed():
|
|
| 20 |
|
| 21 |
# Try to install Playwright browsers
|
| 22 |
result = subprocess.run([
|
| 23 |
-
sys.executable, "-m", "playwright", "install", "chromium"
|
| 24 |
], capture_output=True, text=True, timeout=300)
|
| 25 |
|
| 26 |
if result.returncode == 0:
|
|
@@ -92,15 +92,6 @@ def get_rules_from_url(url: str) -> str:
|
|
| 92 |
|
| 93 |
# Ensure Playwright is installed (especially for Hugging Face)
|
| 94 |
playwright_available = ensure_playwright_installed()
|
| 95 |
-
if not playwright_available:
|
| 96 |
-
logger.warning("Playwright installation failed, falling back to simple HTTP request")
|
| 97 |
-
try:
|
| 98 |
-
with httpx.Client() as client:
|
| 99 |
-
response = client.get(url, follow_redirects=True)
|
| 100 |
-
response.raise_for_status()
|
| 101 |
-
return f"# Formatting Rules (Simple Extraction)\n\nExtracted from: {url}\n\n{response.text[:2000]}...\n\n*Note: Advanced extraction failed, showing raw content. Please review manually.*"
|
| 102 |
-
except Exception as e:
|
| 103 |
-
return f"Failed to extract rules from {url}. Error: {str(e)}"
|
| 104 |
|
| 105 |
# Apply nest_asyncio here, when the function is called
|
| 106 |
nest_asyncio.apply()
|
|
@@ -112,13 +103,29 @@ def get_rules_from_url(url: str) -> str:
|
|
| 112 |
except ImportError as e:
|
| 113 |
logger.error(f"Failed to import crawl4ai: {e}")
|
| 114 |
return f"Failed to import required modules for web crawling. Error: {str(e)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
-
async def _extract_rules_async(url: str) -> str:
|
| 117 |
"""
|
| 118 |
Asynchronously extracts formatting rules from a given URL using crawl4ai.
|
|
|
|
| 119 |
"""
|
| 120 |
-
# Configure the browser
|
| 121 |
-
browser_config = BrowserConfig(verbose=True)
|
| 122 |
|
| 123 |
# Configure the LLM extraction
|
| 124 |
extraction_strategy = LLMExtractionStrategy(
|
|
@@ -152,7 +159,8 @@ def get_rules_from_url(url: str) -> str:
|
|
| 152 |
|
| 153 |
# Initialize the crawler and run
|
| 154 |
try:
|
| 155 |
-
|
|
|
|
| 156 |
try:
|
| 157 |
result = await crawler.arun(
|
| 158 |
url=url,
|
|
@@ -225,4 +233,4 @@ def get_rules_from_url(url: str) -> str:
|
|
| 225 |
return "Could not extract formatting rules from the provided URL. The crawler did not return any content."
|
| 226 |
|
| 227 |
# Run the async function using the patched event loop
|
| 228 |
-
return asyncio.run(_extract_rules_async(url))
|
|
|
|
| 20 |
|
| 21 |
# Try to install Playwright browsers
|
| 22 |
result = subprocess.run([
|
| 23 |
+
sys.executable, "-m", "playwright", "install", "chromium"
|
| 24 |
], capture_output=True, text=True, timeout=300)
|
| 25 |
|
| 26 |
if result.returncode == 0:
|
|
|
|
| 92 |
|
| 93 |
# Ensure Playwright is installed (especially for Hugging Face)
|
| 94 |
playwright_available = ensure_playwright_installed()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
# Apply nest_asyncio here, when the function is called
|
| 97 |
nest_asyncio.apply()
|
|
|
|
| 103 |
except ImportError as e:
|
| 104 |
logger.error(f"Failed to import crawl4ai: {e}")
|
| 105 |
return f"Failed to import required modules for web crawling. Error: {str(e)}"
|
| 106 |
+
|
| 107 |
+
if not playwright_available:
|
| 108 |
+
logger.warning("Playwright installation failed, falling back to simple HTTP request and raw HTML processing")
|
| 109 |
+
try:
|
| 110 |
+
with httpx.Client() as client:
|
| 111 |
+
response = client.get(url, follow_redirects=True)
|
| 112 |
+
response.raise_for_status()
|
| 113 |
+
raw_html = response.text
|
| 114 |
+
# Use crawl4ai to process the raw HTML
|
| 115 |
+
raw_html_url = f"raw:{raw_html}"
|
| 116 |
+
# We can reuse the async extraction logic here
|
| 117 |
+
return asyncio.run(_extract_rules_async(raw_html_url, use_browser=False))
|
| 118 |
+
except Exception as e:
|
| 119 |
+
return f"Failed to extract rules from {url} after Playwright failure. Error: {str(e)}"
|
| 120 |
+
|
| 121 |
|
| 122 |
+
async def _extract_rules_async(url: str, use_browser: bool = True) -> str:
|
| 123 |
"""
|
| 124 |
Asynchronously extracts formatting rules from a given URL using crawl4ai.
|
| 125 |
+
If use_browser is False, it will process raw HTML directly.
|
| 126 |
"""
|
| 127 |
+
# Configure the browser only if needed
|
| 128 |
+
browser_config = BrowserConfig(verbose=True) if use_browser else None
|
| 129 |
|
| 130 |
# Configure the LLM extraction
|
| 131 |
extraction_strategy = LLMExtractionStrategy(
|
|
|
|
| 159 |
|
| 160 |
# Initialize the crawler and run
|
| 161 |
try:
|
| 162 |
+
# Pass the browser_config to the crawler
|
| 163 |
+
async with AsyncWebCrawler(browser_config=browser_config) as crawler:
|
| 164 |
try:
|
| 165 |
result = await crawler.arun(
|
| 166 |
url=url,
|
|
|
|
| 233 |
return "Could not extract formatting rules from the provided URL. The crawler did not return any content."
|
| 234 |
|
| 235 |
# Run the async function using the patched event loop
|
| 236 |
+
return asyncio.run(_extract_rules_async(url, use_browser=True))
|