Update main.py
main.py
CHANGED
@@ -7,12 +7,6 @@ from pydantic import BaseModel
 from io import StringIO
 import os
 
-from llmlingua import PromptCompressor
-llm_lingua = PromptCompressor(
-    model_name="microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank",
-    use_llmlingua2=True,  # Whether to use llmlingua-2
-    device_map="cpu"
-)
 
 
 app = FastAPI()
@@ -24,11 +18,51 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
-
-
-
-
-
-
-
+import nest_asyncio
+import asyncio
+from playwright.async_api import async_playwright
+
+# Apply nest_asyncio to allow nested asyncio.run() calls
+nest_asyncio.apply()
+
+async def scrape_links():
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        page = await browser.new_page()
+
+        # Block unnecessary resources to speed up loading
+        await page.route("**/*", lambda route: route.continue_() if route.request.resource_type in ["document", "script"] else route.abort())
+
+        # Open the target website
+        await page.goto('https://www.fool.com/earnings/call-transcripts/2024/01/24/tesla-tsla-q4-2023-earnings-call-transcript/', wait_until='domcontentloaded')
+
+        # Wait for a short time to ensure dynamic content is loaded
+        await page.wait_for_timeout(10)
+
+        # Extract all links
+        links = await page.query_selector_all('a')
+        result = []
+        for link in links:
+            href = await link.get_attribute('href')
+            result.append({'href': href})
+
+        # Extract all text content
+        elements = await page.query_selector_all('body *')
+
+        for element in elements:
+            text_content = await element.text_content()
+            if text_content and text_content.strip():
+                result.append({'text': text_content.strip()})
+
+        await browser.close()
+        return result
+
+
+
+@app.post("/get_webscrapet_data")
+async def get_webscrapet_data(url):
+    # Run the scraping function
+    results = asyncio.run(scrape_links())
+    print(results)
+    return results
 
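A few things worth noting about the added endpoint: get_webscrapet_data accepts a url parameter but never uses it, since scrape_links() hardcodes the Motley Fool transcript URL; page.wait_for_timeout(10) pauses for only 10 milliseconds (Playwright timeouts are in milliseconds), which is unlikely to be long enough for dynamic content; and the asyncio.run() call inside an async endpoint is what forces the nest_asyncio.apply() patch. Below is a minimal sketch, not part of this commit, of how the URL could be passed through and the coroutine awaited directly; the parameterized scrape_links(url) signature is an assumption for illustration.

from fastapi import FastAPI
from playwright.async_api import async_playwright

app = FastAPI()

async def scrape_links(url: str):
    # Hypothetical variant: the target URL is a parameter, not a constant.
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        # Same resource filter as the commit: keep documents and scripts only.
        await page.route("**/*", lambda route: route.continue_() if route.request.resource_type in ["document", "script"] else route.abort())
        await page.goto(url, wait_until='domcontentloaded')
        # Collect anchor hrefs; text extraction would follow the same pattern.
        result = []
        for a in await page.query_selector_all('a'):
            result.append({'href': await a.get_attribute('href')})
        await browser.close()
        return result

@app.post("/get_webscrapet_data")
async def get_webscrapet_data(url: str):
    # FastAPI endpoints already run inside an event loop, so the coroutine
    # can be awaited directly; no asyncio.run()/nest_asyncio round-trip.
    return await scrape_links(url)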
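Assuming the app is served locally (for example uvicorn main:app --port 8000), the endpoint can be exercised with the URL as a query string, since FastAPI treats a bare scalar argument like url as a required query parameter even on a POST route:

curl -X POST 'http://localhost:8000/get_webscrapet_data?url=https://example.com'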