|
|
import os |
|
|
from datetime import date, timedelta |
|
|
|
|
|
import bs4 |
|
|
from langchain.indexes import SQLRecordManager, index |
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
from langchain.vectorstores.chroma import Chroma |
|
|
from langchain_community.document_loaders import WebBaseLoader |
|
|
from langchain_google_genai import GoogleGenerativeAIEmbeddings |
|
|
from selenium import webdriver |
|
|
from selenium.webdriver.common.by import By |
|
|
from selenium.webdriver.support import expected_conditions as EC |
|
|
from selenium.webdriver.support.ui import WebDriverWait |
|
|
|
|
|
import config |
|
|
|
|
|
# Listing page for BRVM stock-market news on sikafinance.com.
DATA_URL = "https://www.sikafinance.com/marches/actualites_bourse_brvm"


# Embedding model used to vectorize the scraped articles; the model name
# comes from the project-local config module.
embeddings_model = GoogleGenerativeAIEmbeddings(
    model=config.GOOGLE_EMBEDDING_MODEL
)


# Shared headless Chrome instance used by scrap_articles().
# NOTE(review): the browser is launched at import time as a module-level
# side effect and driver.quit() is never called — consider creating it
# inside the scraping function (or a context manager) instead.
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--no-sandbox")  # needed when running as root in containers/CI
options.add_argument("--disable-dev-shm-usage")  # avoid /dev/shm exhaustion in Docker
driver = webdriver.Chrome(options=options)
|
|
|
|
|
|
|
|
def scrap_articles(
    url="https://www.sikafinance.com/marches/actualites_bourse_brvm", num_days_past=5
):
    """
    Scrape article headlines from the news listing page for today and each
    of the previous *num_days_past* days.

    Uses the module-level Selenium ``driver``: for every day, the date
    filter form on the page is filled in and submitted, then the visible
    rows are collected.

    Args:
        url: listing page to scrape.
        num_days_past: how many days before today to include (today is
            always included, so ``num_days_past + 1`` days are scraped).

    Returns:
        A list of dicts with keys "title", "date" and "link".
    """
    today = date.today()

    driver.get(url)

    all_articles = []
    for day_offset in range(num_days_past + 1):
        past_date = today - timedelta(days=day_offset)
        date_str = past_date.strftime("%Y-%m-%d")

        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "dateActu"))
        )
        text_box = driver.find_element(By.ID, "dateActu")
        # Clear first: without this, each iteration *appends* to the date
        # already typed in the previous iteration instead of replacing it.
        text_box.clear()
        text_box.send_keys(date_str)

        submit_btn = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, "btn"))
        )
        submit_btn.click()

        dates = driver.find_elements(By.CLASS_NAME, "sp1")
        titles = driver.find_elements(By.XPATH, "//td/a")

        # zip() pairs each title with its date and tolerates a length
        # mismatch between the two element lists (the original indexed
        # dates[i] and could raise IndexError).
        all_articles += [
            {
                "title": title_el.text.strip(),
                "date": date_el.text,
                "link": title_el.get_attribute("href"),
            }
            for title_el, date_el in zip(titles, dates)
        ]

    return all_articles
|
|
|
|
|
|
|
|
def set_metadata(documents, metadatas):
    """
    Overwrite the metadata of each LangChain Document in *documents*.

    Documents and metadatas are paired positionally: ``documents[i]`` gets
    ``metadatas[i]``. With lists of unequal length the extra tail is
    ignored (zip semantics).

    Args:
        documents: objects with a writable ``metadata`` attribute.
        metadatas: replacement metadata values, one per document.
    """
    # zip() replaces the original's documents.index(doc) lookup, which was
    # O(n^2) and wrong when two documents compare equal (index() returns
    # the first match, assigning the same metadata to both).
    for doc, meta in zip(documents, metadatas):
        doc.metadata = meta
    print("Metadata successfully changed")
    # Guard: the original crashed with IndexError on an empty list.
    if documents:
        print(documents[0].metadata)
|
|
|
|
|
|
|
|
def process_docs(
    articles, persist_directory, embeddings_model, chunk_size=1000, chunk_overlap=100
):
    """
    Download the full text of every article, split it into chunks and
    index the chunks into a persistent Chroma vector store.

    Args:
        articles: dicts with "title", "date" and "link" keys, as returned
            by scrap_articles(); "link" must be a fetchable URL.
        persist_directory: directory holding the Chroma DB (created if
            missing).
        embeddings_model: embedding model used to vectorize the chunks.
        chunk_size: maximum chunk length, in characters.
        chunk_overlap: overlap between consecutive chunks, in characters.

    Returns:
        The populated Chroma vector store.
    """
    article_urls = [a["link"] for a in articles]

    print("Starting to scrap ..")

    # Only the article body, date and header containers are parsed; the
    # rest of each page is discarded by the SoupStrainer at parse time.
    loader = WebBaseLoader(
        web_paths=article_urls,
        bs_kwargs=dict(
            parse_only=bs4.SoupStrainer(
                class_=("inarticle txtbig", "dt_sign", "innerUp")
            )
        ),
    )

    print("After scraping Loading ..")
    docs = loader.load()

    # Replace the loader's default metadata with the scraped title/date/link
    # so that "link" is available as the source id for incremental indexing.
    set_metadata(documents=docs, metadatas=articles)

    print("Successfully loaded to document")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=["\n"]
    )
    splits = text_splitter.split_documents(docs)

    os.makedirs(persist_directory, exist_ok=True)

    # Open the persistent store empty and let index() write into it.
    # BUG FIX: the original called Chroma.from_documents(splits) and then
    # index(docs, ...), which stored every article twice — once as chunks
    # the record manager never tracked, and once as whole un-split
    # documents — defeating both chunking and incremental cleanup.
    doc_search = Chroma(
        embedding_function=embeddings_model,
        persist_directory=persist_directory,
    )

    namespace = "chromadb/my_documents"
    record_manager = SQLRecordManager(
        namespace, db_url="sqlite:///record_manager_cache.sql"
    )
    record_manager.create_schema()

    # Incremental cleanup: re-running only re-embeds changed articles and
    # drops stale chunks whose source ("link") was re-scraped.
    index_result = index(
        splits,
        record_manager,
        doc_search,
        cleanup="incremental",
        source_id_key="link",
    )

    print(f"Indexing stats: {index_result}")

    return doc_search
|
|
|
|
|
|
|
|
if __name__ == "__main__":

    # Scrape headline metadata for today and the two previous days, then
    # download, chunk and index the full articles into the vector store.
    data = scrap_articles(DATA_URL, num_days_past=2)

    vectordb = process_docs(data, config.STORAGE_PATH, embeddings_model)

    # NOTE(review): `ret` is never used afterwards — presumably kept as a
    # smoke test that the populated store can produce a retriever.
    ret = vectordb.as_retriever()
|
|
|