Spaces:
Runtime error
Runtime error
import asyncio
import concurrent.futures
import os
import threading
import time
from datetime import datetime, timedelta

from llama_cpp import Llama
from openai import OpenAI
from selenium import webdriver
from selenium.common import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager

from btts import generateAudioFile
from GenerateAIPodcast import generateMp3
| # client = OpenAI(base_url="http://localhost:8080/v1", api_key="lm-studio") | |
| '''def make_request(link): | |
| print("-----------------------------------------------------------------------------------------") | |
| print("Make Request is called") | |
| try: | |
| completion = client.chat.completions.create( | |
| model="model-identifier", | |
| messages=[ | |
| {"role": "system", | |
| "content": "Always answer short and most detailled and dont use * in your answers. It should be good to hear as a Podcast"}, | |
| {"role": "user", "content": f"Please summarize this website: {link}."} | |
| ], | |
| temperature=0.7, | |
| ) | |
| # print(f"Thread: {completion.choices[0].message}") | |
| # print("TEST:", completion.choices[0].message) | |
| message = completion.choices[0].message.content | |
| return message | |
| except Exception as e: | |
| print(f"Thread encountered an error: {e}^") | |
| ''' | |
# Location of the GGUF weights passed to Llama.from_pretrained
# (presumably a Hugging Face Hub repo id plus a file inside it).
_MODEL_REPO_ID = "hugging-quants/Llama-3.2-1B-Instruct-Q8_0-GGUF"
_MODEL_FILENAME = "llama-3.2-1b-instruct-q8_0.gguf"

# Single model instance, created once when this module is imported and
# shared by every call to generate().
llm = Llama.from_pretrained(repo_id=_MODEL_REPO_ID, filename=_MODEL_FILENAME)
# generate() is submitted to a thread pool by run_tldr_crawler, and every
# worker talks to the same module-level ``llm``. A llama.cpp model context is
# not documented as safe for concurrent calls, so inference is serialized.
_LLM_LOCK = threading.Lock()


def generate(link: str) -> str:
    """Ask the local Llama model for a short, podcast-style summary of a site.

    Only the URL text is placed into the prompt; this function does not fetch
    the page itself.

    Args:
        link: URL of the website to summarize.

    Returns:
        The message content of the first choice in the chat completion.
    """
    messages = [
        {"role": "system",
         "content": "Always answer short and most detailled and dont use * in your answers. It should be good to hear as a Podcast"},
        {"role": "user", "content": f"Please summarize this website: {link}."},
    ]
    # Hold the lock only for the model call so concurrent callers queue up
    # instead of entering the shared model at the same time.
    with _LLM_LOCK:
        output = llm.create_chat_completion(messages=messages)
    return output['choices'][0]['message']['content']
def _latest_issue_date(today=None):
    """Return the most recent weekday on or before *today* as 'YYYY-MM-DD'."""
    day = today or datetime.today()
    # weekday(): Monday is 0, Saturday is 5, Sunday is 6.
    # NOTE(review): the previous code pinned a fixed date "for weekends",
    # presumably because no issue is published then - confirm.
    while day.weekday() >= 5:
        day -= timedelta(days=1)
    return day.strftime('%Y-%m-%d')


def _extract_article_links(driver, url):
    """Load *url* and return the hrefs of all anchors that pass the filter."""
    driver.get(url)
    # Fixed pause to give the page time to load before anchors are read.
    time.sleep(2)
    anchors = driver.find_elements(By.TAG_NAME, 'a')
    # Read each href exactly once; every get_attribute call is a round trip
    # to the browser.
    hrefs = (anchor.get_attribute('href') for anchor in anchors)
    return [
        href
        for href in hrefs
        if href is not None
        and not href.startswith(("https://tldr.tech", "https://jobs"))
        and "advertise" not in href
    ]


def run_tldr_crawler(date=None, *, headless=False, max_threads=4):
    """Crawl one TLDR tech issue, summarize each linked page, and voice it.

    The issue page ``https://tldr.tech/tech/<date>`` is opened in Chrome, its
    outbound links are collected, each link is passed to ``generate`` on a
    thread pool, and every summary is handed to ``generateAudioFile`` together
    with its 1-based position.

    Args:
        date: Issue date as 'YYYY-MM-DD'. When omitted, the most recent
            weekday (today included) is used.
        headless: Start Chrome with ``--headless`` (no browser window).
            Off by default, matching the previous behavior.
        max_threads: Maximum number of ``generate`` calls in flight at once.

    Returns:
        None. If the browser raises ``WebDriverException`` the error is
        printed and the function returns without producing audio.
    """
    # Set up the Selenium WebDriver.
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    try:
        if date is None:
            date = _latest_issue_date()
        print(date)
        extracted_links = _extract_article_links(
            driver, f"https://tldr.tech/tech/{date}"
        )
    except WebDriverException as e:
        print(f"Fehler beim Laden der Seite: {e}")
        return
    finally:
        # The browser is only needed for link extraction; release it before
        # the (much longer) summarization phase starts.
        driver.quit()

    # Report what was found.
    print("Extracted Links:")
    print(len(extracted_links))
    for idx, link in enumerate(extracted_links, start=1):
        print(f"{idx}. {link}")

    # Run at most max_threads summarizations at the same time.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
        futures = [executor.submit(generate, link) for link in extracted_links]
        # Consume results in submission order so the index passed to
        # generateAudioFile matches the printed link number.
        for idx, future in enumerate(futures, start=1):
            asyncio.run(generateAudioFile(future.result(), idx))