finchat / scrape_data.py
Monsia's picture
first commit
c4331f2
raw
history blame
4.16 kB
import os
from datetime import date, timedelta
import bs4
from langchain.indexes import SQLRecordManager, index
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import config
# Landing page listing BRVM stock-market news on sikafinance.com.
DATA_URL = "https://www.sikafinance.com/marches/actualites_bourse_brvm"
# Embedding model used to vectorize scraped articles; model name from config.
embeddings_model = GoogleGenerativeAIEmbeddings(
    model=config.GOOGLE_EMBEDDING_MODEL
) # type: ignore
# Headless Chrome configured for containerized environments:
# --no-sandbox and --disable-dev-shm-usage avoid sandbox and /dev/shm limits.
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
# NOTE(review): the browser is started at import time and is never quit in
# this module — the Chrome process outlives the script; consider an explicit
# shutdown (driver.quit()) when scraping is done.
driver = webdriver.Chrome(options=options)
def scrap_articles(
    url="https://www.sikafinance.com/marches/actualites_bourse_brvm", num_days_past=5
):
    """Scrape BRVM news headlines for today and the previous `num_days_past` days.

    Uses the module-level Selenium ``driver``: for each day, fills the date
    filter on the page and submits it, then collects every listed article.

    Args:
        url: news listing page to scrape.
        num_days_past: how many days before today to include (inclusive).

    Returns:
        list[dict]: one dict per article with keys "title", "date", "link".
    """
    today = date.today()
    driver.get(url)
    all_articles = []
    for day_offset in range(num_days_past + 1):
        past_date = today - timedelta(days=day_offset)
        date_str = past_date.strftime("%Y-%m-%d")
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "dateActu"))
        )
        text_box = driver.find_element(By.ID, "dateActu")
        # BUGFIX: send_keys() appends to existing content, so without clear()
        # every iteration after the first submitted a garbled date string.
        text_box.clear()
        text_box.send_keys(date_str)
        submit_btn = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, "btn"))
        )
        submit_btn.click()
        dates = driver.find_elements(By.CLASS_NAME, "sp1")
        titles = driver.find_elements(By.XPATH, "//td/a")
        # zip() pairs title/date nodes positionally and guards against a
        # length mismatch (the original indexed dates[i] and could raise
        # IndexError if fewer date nodes than title nodes were found).
        for title_el, date_el in zip(titles, dates):
            all_articles.append(
                {
                    "title": title_el.text.strip(),
                    "date": date_el.text,
                    "link": title_el.get_attribute("href"),
                }
            )
    # The shared driver is intentionally left open for reuse across calls.
    return all_articles
def set_metadata(documents, metadatas):
    """Overwrite each document's metadata with the dict at the same position.

    Args:
        documents: sequence of LangChain ``Document``-like objects; each one's
            ``metadata`` attribute is replaced in place.
        metadatas: sequence of metadata dicts, parallel to ``documents``.
    """
    # BUGFIX: the original used documents.index(doc) inside the loop, which is
    # O(n^2) and resolves to the FIRST equal document — wrong pairing whenever
    # two documents compare equal. zip() pairs strictly by position.
    for doc, meta in zip(documents, metadatas):
        doc.metadata = meta
    print("Metadata successfully changed")
    # Guard: the original unconditionally printed documents[0].metadata and
    # raised IndexError on an empty list.
    if documents:
        print(documents[0].metadata)
def process_docs(
    articles, persist_directory, embeddings_model, chunk_size=1000, chunk_overlap=100
):
    """Fetch each article page, split it into chunks and persist to Chroma.

    Args:
        articles: list of dicts with at least a "link" key (as produced by
            scrap_articles()); each dict becomes its document's metadata.
        persist_directory: on-disk directory for the Chroma store.
        embeddings_model: embedding function handed to Chroma.
        chunk_size: maximum characters per text chunk.
        chunk_overlap: characters of overlap between consecutive chunks.

    Returns:
        The populated Chroma vector store.
    """
    article_urls = [a["link"] for a in articles]
    print("Starting to scrap ..")
    # Only parse the page regions holding the article body/date/header —
    # presumably these CSS classes match sikafinance.com's article template;
    # verify if the site layout changes.
    loader = WebBaseLoader(
        web_paths=article_urls,
        bs_kwargs=dict(
            parse_only=bs4.SoupStrainer(
                class_=("inarticle txtbig", "dt_sign", "innerUp")
            )
        ),
    )
    print("After scraping Loading ..")
    docs = loader.load()
    # Replace each document's metadata with its article dict so "link" is
    # available as the source_id_key used by index() below.
    set_metadata(documents=docs, metadatas=articles)
    print("Successfully loaded to document")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=["\n"]
    )
    splits = text_splitter.split_documents(docs)
    # Create the storage path if it doesn't exist
    if not os.path.exists(persist_directory):
        os.makedirs(persist_directory)
    doc_search = Chroma.from_documents(
        documents=splits,
        embedding=embeddings_model,
        persist_directory=persist_directory,
    )
    # Indexing data
    namespace = "chromadb/my_documents"
    # Record manager tracks document hashes in a local SQLite file so that
    # cleanup="incremental" can skip/replace already-indexed sources.
    record_manager = SQLRecordManager(
        namespace, db_url="sqlite:///record_manager_cache.sql"
    )
    record_manager.create_schema()
    # NOTE(review): the *unsplit* docs are indexed here while the *splits*
    # were already written via Chroma.from_documents above — this looks like
    # content is stored twice; confirm whether index(splits, ...) was meant.
    index_result = index(
        docs,
        record_manager,
        doc_search,
        cleanup="incremental",
        source_id_key="link",
    )
    print(f"Indexing stats: {index_result}")
    return doc_search
if __name__ == "__main__":
    # Scrape the last 2 days of headlines and build/refresh the vector store.
    data = scrap_articles(DATA_URL, num_days_past=2)
    vectordb = process_docs(data, config.STORAGE_PATH, embeddings_model)
    # NOTE(review): `ret` is never used afterwards — presumably left over from
    # interactive testing; confirm whether this line can be removed.
    ret = vectordb.as_retriever()