Stephen Zweibel committed on
Commit
23943da
·
1 Parent(s): 95bad73

Update app for Hugging Face

Browse files
Files changed (1) hide show
  1. rule_extractor.py +23 -15
rule_extractor.py CHANGED
@@ -20,7 +20,7 @@ def ensure_playwright_installed():
20
 
21
  # Try to install Playwright browsers
22
  result = subprocess.run([
23
- sys.executable, "-m", "playwright", "install", "chromium", "--with-deps"
24
  ], capture_output=True, text=True, timeout=300)
25
 
26
  if result.returncode == 0:
@@ -92,15 +92,6 @@ def get_rules_from_url(url: str) -> str:
92
 
93
  # Ensure Playwright is installed (especially for Hugging Face)
94
  playwright_available = ensure_playwright_installed()
95
- if not playwright_available:
96
- logger.warning("Playwright installation failed, falling back to simple HTTP request")
97
- try:
98
- with httpx.Client() as client:
99
- response = client.get(url, follow_redirects=True)
100
- response.raise_for_status()
101
- return f"# Formatting Rules (Simple Extraction)\n\nExtracted from: {url}\n\n{response.text[:2000]}...\n\n*Note: Advanced extraction failed, showing raw content. Please review manually.*"
102
- except Exception as e:
103
- return f"Failed to extract rules from {url}. Error: {str(e)}"
104
 
105
  # Apply nest_asyncio here, when the function is called
106
  nest_asyncio.apply()
@@ -112,13 +103,29 @@ def get_rules_from_url(url: str) -> str:
112
  except ImportError as e:
113
  logger.error(f"Failed to import crawl4ai: {e}")
114
  return f"Failed to import required modules for web crawling. Error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
- async def _extract_rules_async(url: str) -> str:
117
  """
118
  Asynchronously extracts formatting rules from a given URL using crawl4ai.
 
119
  """
120
- # Configure the browser
121
- browser_config = BrowserConfig(verbose=True)
122
 
123
  # Configure the LLM extraction
124
  extraction_strategy = LLMExtractionStrategy(
@@ -152,7 +159,8 @@ def get_rules_from_url(url: str) -> str:
152
 
153
  # Initialize the crawler and run
154
  try:
155
- async with AsyncWebCrawler() as crawler:
 
156
  try:
157
  result = await crawler.arun(
158
  url=url,
@@ -225,4 +233,4 @@ def get_rules_from_url(url: str) -> str:
225
  return "Could not extract formatting rules from the provided URL. The crawler did not return any content."
226
 
227
  # Run the async function using the patched event loop
228
- return asyncio.run(_extract_rules_async(url))
 
20
 
21
  # Try to install Playwright browsers
22
  result = subprocess.run([
23
+ sys.executable, "-m", "playwright", "install", "chromium"
24
  ], capture_output=True, text=True, timeout=300)
25
 
26
  if result.returncode == 0:
 
92
 
93
  # Ensure Playwright is installed (especially for Hugging Face)
94
  playwright_available = ensure_playwright_installed()
 
 
 
 
 
 
 
 
 
95
 
96
  # Apply nest_asyncio here, when the function is called
97
  nest_asyncio.apply()
 
103
  except ImportError as e:
104
  logger.error(f"Failed to import crawl4ai: {e}")
105
  return f"Failed to import required modules for web crawling. Error: {str(e)}"
106
+
107
+ if not playwright_available:
108
+ logger.warning("Playwright installation failed, falling back to simple HTTP request and raw HTML processing")
109
+ try:
110
+ with httpx.Client() as client:
111
+ response = client.get(url, follow_redirects=True)
112
+ response.raise_for_status()
113
+ raw_html = response.text
114
+ # Use crawl4ai to process the raw HTML
115
+ raw_html_url = f"raw:{raw_html}"
116
+ # We can reuse the async extraction logic here
117
+ return asyncio.run(_extract_rules_async(raw_html_url, use_browser=False))
118
+ except Exception as e:
119
+ return f"Failed to extract rules from {url} after Playwright failure. Error: {str(e)}"
120
+
121
 
122
+ async def _extract_rules_async(url: str, use_browser: bool = True) -> str:
123
  """
124
  Asynchronously extracts formatting rules from a given URL using crawl4ai.
125
+ If use_browser is False, it will process raw HTML directly.
126
  """
127
+ # Configure the browser only if needed
128
+ browser_config = BrowserConfig(verbose=True) if use_browser else None
129
 
130
  # Configure the LLM extraction
131
  extraction_strategy = LLMExtractionStrategy(
 
159
 
160
  # Initialize the crawler and run
161
  try:
162
+ # Pass the browser_config to the crawler
163
+ async with AsyncWebCrawler(browser_config=browser_config) as crawler:
164
  try:
165
  result = await crawler.arun(
166
  url=url,
 
233
  return "Could not extract formatting rules from the provided URL. The crawler did not return any content."
234
 
235
  # Run the async function using the patched event loop
236
+ return asyncio.run(_extract_rules_async(url, use_browser=True))