ocr_api

Paused

Arafath10 committed on May 20, 2024

Commit

beb1e33

verified ·

1 Parent(s): 1949a87

Update main.py

Files changed (1) hide show

main.py CHANGED Viewed

@@ -36,10 +36,11 @@ async def power_scrapper(url):
         # Extract all links
         links = await page.query_selector_all('a')
-        result = []
         for link in links:
             href = await link.get_attribute('href')
-            result.append({'href': href})
         # Extract all text content
         elements = await page.query_selector_all('body *')
@@ -47,10 +48,10 @@ async def power_scrapper(url):
         for element in elements:
             text_content = await element.text_content()
             if text_content and text_content.strip():
-                result.append({'text': text_content.strip()})
         await browser.close()
-        return result
 def get_links(soup):
@@ -89,6 +90,6 @@ async def get_data(url: str):
     if links==[]:
         print("running alternative scrapper")
-        links = await power_scrapper(url)
-    return ({"title": title , "contend":links+text_content})

         # Extract all links
         links = await page.query_selector_all('a')
+        page_url = []
+        page_content = []
         for link in links:
             href = await link.get_attribute('href')
+            result.append(href)
         # Extract all text content
         elements = await page.query_selector_all('body *')
         for element in elements:
             text_content = await element.text_content()
             if text_content and text_content.strip():
+                page_content.append(text_content.strip())
         await browser.close()
+        return page_url,page_content
 def get_links(soup):
     if links==[]:
         print("running alternative scrapper")
+        links,text_content = await power_scrapper(url)
+    return ({"title": title ,"URL":links,"Content":text_content})