Spaces:

Defender117
/

tldr_crawlre

Runtime error

App Files Files Community

tldr_crawlre / crawl_archive.py

Defender117

Upload 5 files

b6204d2 verified 9 months ago

raw

history blame contribute delete

4.91 kB

	import asyncio

	from llama_cpp import Llama
	from openai import OpenAI
	from selenium import webdriver
	from selenium.common import WebDriverException
	from selenium.webdriver.common.by import By
	from selenium.webdriver.common.keys import Keys
	from selenium.webdriver.chrome.service import Service
	import concurrent.futures
	from webdriver_manager.chrome import ChromeDriverManager
	import time
	from datetime import datetime
	import os
	from GenerateAIPodcast import generateMp3
	from btts import generateAudioFile

	# Disabled alternative backend: an OpenAI-compatible client pointed at a local
	# server (see the URL below). It is kept commented out; generate() uses the
	# in-process `llm` object instead.
	# client = OpenAI(base_url="http://localhost:8080/v1", api_key="lm-studio")


	# The triple-quoted string below is a disabled copy of make_request(), the
	# variant that relied on `client`. As a bare string expression it is evaluated
	# and discarded at import time; none of the code inside it ever runs.
	'''def make_request(link):
	print("-----------------------------------------------------------------------------------------")
	print("Make Request is called")
	try:

	completion = client.chat.completions.create(
	model="model-identifier",

	messages=[
	{"role": "system",
	"content": "Always answer short and most detailled and dont use * in your answers. It should be good to hear as a Podcast"},
	{"role": "user", "content": f"Please summarize this website: {link}."}
	],
	temperature=0.7,
	)
	# print(f"Thread: {completion.choices[0].message}")
	# print("TEST:", completion.choices[0].message)
	message = completion.choices[0].message.content
	return message
	except Exception as e:
	print(f"Thread encountered an error: {e}^")
	'''



	# Module-level model handle used by generate(). This statement runs at import
	# time and obtains the Q8_0 GGUF build of Llama-3.2-1B-Instruct from the named
	# repository via Llama.from_pretrained.
	# NOTE(review): from_pretrained presumably downloads/caches the file on first
	# use, so importing this module may need network access — confirm.
	llm = Llama.from_pretrained(
	repo_id="hugging-quants/Llama-3.2-1B-Instruct-Q8_0-GGUF",
	filename="llama-3.2-1b-instruct-q8_0.gguf",
	)

	def generate(link:str):
	    """Ask the shared ``llm`` for a short, podcast-style summary of ``link``.

	    Only the URL text is placed in the prompt; this function does not fetch
	    the page itself.

	    Args:
	        link: URL to mention in the user prompt.

	    Returns:
	        The ``content`` field of the first choice in the chat completion.
	    """
	    # Fixed instruction that shapes the tone of every answer.
	    system_message = {
	        "role": "system",
	        "content": "Always answer short and most detailled and dont use * in your answers. It should be good to hear as a Podcast",
	    }
	    # Per-call request carrying the link to summarize.
	    user_message = {
	        "role": "user",
	        "content": f"Please summarize this website: {link}.",
	    }

	    completion = llm.create_chat_completion(
	        messages=[system_message, user_message]
	    )

	    # The completion is indexed like a JSON response: choices -> message -> content.
	    first_choice = completion['choices'][0]
	    return first_choice['message']['content']



	def _latest_issue_date(today):
	    """Return the most recent weekday on or before ``today`` as 'YYYY-MM-DD'."""
	    # Local import: the file's import block only brings in ``datetime``.
	    from datetime import timedelta

	    # weekday(): Monday == 0 ... Sunday == 6. On Saturday/Sunday step back to
	    # Friday, because the original code notes that weekends need a different date.
	    days_past_friday = max(0, today.weekday() - 4)
	    return (today - timedelta(days=days_past_friday)).strftime('%Y-%m-%d')


	def _is_article_link(href):
	    """Return True when ``href`` is an outbound link that should be summarized."""
	    if href is None:
	        return False
	    # Skip the newsletter's own pages and job-board links.
	    if href.startswith(("https://tldr.tech", "https://jobs")):
	        return False
	    # Skip sponsorship / advertising links.
	    return "advertise" not in href


	def run_tldr_crawler(date=None):
	    """Crawl one TLDR tech issue, summarize its links and create audio files.

	    Opens ``https://tldr.tech/tech/<date>`` in Chrome, collects the ``href`` of
	    every anchor that passes the link filter, submits each link to
	    ``generate`` on a pool of four threads, and passes every summary together
	    with its 1-based index to ``generateAudioFile``.

	    Args:
	        date: Issue date as 'YYYY-MM-DD'. When omitted, the most recent weekday
	            (today, or the preceding Friday on a weekend) is used. Pass an
	            explicit date to crawl a specific issue, e.g. '2025-03-07'.

	    Returns:
	        None. Progress is printed to stdout. A ``WebDriverException`` is
	        caught and reported; the browser is always closed.
	    """
	    if date is None:
	        date = _latest_issue_date(datetime.today())
	    print(date)

	    # Set up the Selenium WebDriver.
	    options = webdriver.ChromeOptions()
	    # options.add_argument() # Run in headless mode (no browser UI)
	    options.add_argument('--disable-gpu')
	    options.add_argument('--no-sandbox')

	    # Initialize the WebDriver.
	    service = Service(ChromeDriverManager().install())
	    driver = webdriver.Chrome(service=service, options=options)
	    try:
	        # Navigate to the archive page of the requested issue.
	        url = f"https://tldr.tech/tech/{date}"
	        driver.get(url)

	        # Wait for the page to load.
	        time.sleep(2)

	        # Read each anchor's href exactly once (every get_attribute call is a
	        # round trip to the browser), then keep only the article links.
	        anchors = driver.find_elements(By.TAG_NAME, 'a')
	        hrefs = (anchor.get_attribute('href') for anchor in anchors)
	        extracted_links = [href for href in hrefs if _is_article_link(href)]

	        # Output the extracted links.
	        print("Extracted Links:")
	        print(len(extracted_links))
	        for idx, link in enumerate(extracted_links, start=1):
	            print(f"{idx}. {link}")

	        # Maximum number of threads that may run at the same time.
	        max_threads = 4

	        # NOTE(review): generate() is called from several threads at once —
	        # confirm that the model object it uses tolerates concurrent calls.
	        with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
	            # Submit one generate() call per link, remembering its index.
	            futures = []
	            for idx, link in enumerate(extracted_links, start=1):
	                future = executor.submit(generate, link)
	                futures.append((idx, link, future))

	            # Consume the results in submission order so that the audio file
	            # index matches the printed link index.
	            for idx, link, future in futures:
	                result = future.result()
	                asyncio.run(generateAudioFile(result, idx))

	    except WebDriverException as e:
	        print(f"Fehler beim Laden der Seite: {e}")

	    finally:
	        # Close the WebDriver.
	        driver.quit()