# sifars-chatbot-demo / src/crawler/_database_updater.py
# Author: Aryan Jain
# Commit 46d624e: add main.py for server initialization; update file handling
# in DatabaseUpdater and improve context retrieval in ToolCall
import os
import aiofiles
import httpx
from src.utils import PineconeClient, SharepointClient
from src.crawler import WebCrawler
import tiktoken
import pymupdf4llm
from docx import Document
from src.utils import logger
class DatabaseUpdater:
    """Refresh the Pinecone vector database from SharePoint files and crawled pages.

    Workflow: download source documents from SharePoint, normalize them into
    local ``knowledge_base.*`` text files, chunk the text with token overlap,
    and upsert the chunks to Pinecone in batches.
    """

    def __init__(self):
        # Model name used only to pick the matching tiktoken encoding.
        self.model_name = 'gpt-3.5-turbo'
        # NOTE: these attributes hold the client *classes*, not instances;
        # instances are created per-operation via ``async with self.xxx() as ...``.
        self.pinecone_client = PineconeClient
        self.web_crawler = WebCrawler
        self.sharepoint_client = SharepointClient

    async def __aenter__(self):
        # Tokenizer construction is deferred to context entry; methods that
        # chunk text assume the updater is used as an async context manager.
        self.tokenizer = tiktoken.encoding_for_model(self.model_name)
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        # Nothing to release: all clients are opened and closed per call.
        pass

    async def _split_text_with_overlap(self, text, token_limit, overlap_size):
        """Split ``text`` into chunks of at most ``token_limit`` tokens, with
        consecutive chunks sharing ``overlap_size`` tokens.

        Returns a list of decoded text chunks ([] for empty input).
        Raises ValueError when ``overlap_size >= token_limit`` — the previous
        implementation looped forever in that case (step of 0 or negative).
        """
        step = token_limit - overlap_size
        if step <= 0:
            raise ValueError("overlap_size must be smaller than token_limit")
        tokens = self.tokenizer.encode(text)
        chunks = []
        for start in range(0, len(tokens), step):
            chunks.append(tokens[start:start + token_limit])
            # Stop once the end of the text is covered; continuing would
            # emit trailing chunks already fully contained in this one.
            if start + token_limit >= len(tokens):
                break
        return [self.tokenizer.decode(chunk) for chunk in chunks]

    async def _update_database(self, sentences, batch_size=95):
        """Upsert ``sentences`` into Pinecone in batches.

        ``batch_size`` defaults to 95 to stay safely under Pinecone's
        per-request upsert limit (~100 vectors).
        """
        async with self.pinecone_client() as pinecone_client:
            for i in range(0, len(sentences), batch_size):
                await pinecone_client._upsert(sentences[i:i + batch_size])
        return

    async def _delete_old_database(self):
        """Drop the existing Pinecone index ahead of a full refresh."""
        async with self.pinecone_client() as pinecone_client:
            await pinecone_client._delete_index()
        return

    async def process_text_file(self, file_path):
        """Append the contents of a .txt/.md file to knowledge_base.txt."""
        async with aiofiles.open(file_path, 'r', encoding='utf-8') as f:
            content = await f.read()
        async with aiofiles.open("knowledge_base.txt", 'a+', encoding='utf-8') as f:
            await f.write(content)
            await f.write("\n")
        return

    async def process_pdf_file(self, file_path):
        """Convert a PDF to markdown per page and append it to knowledge_base.pdf."""
        # NOTE(review): pymupdf4llm.to_markdown is synchronous and will block
        # the event loop for large PDFs — consider run_in_executor.
        docs = pymupdf4llm.to_markdown(file_path, page_chunks=True)
        async with aiofiles.open("knowledge_base.pdf", 'a+', encoding='utf-8') as f:
            for doc in docs:
                await f.write(doc.get("text"))
                await f.write("\n")
        return

    async def process_docx_file(self, file_path):
        """Append each paragraph of a .docx file to knowledge_base.docx."""
        doc = Document(file_path)
        async with aiofiles.open("knowledge_base.docx", 'a+', encoding='utf-8') as f:
            for paragraph in doc.paragraphs:
                await f.write(paragraph.text)
                await f.write("\n")
        return

    async def _extract_scraped_data(self, url):
        """Crawl ``url``; persistence of scraped pages is handled by WebCrawler."""
        async with self.web_crawler(url) as crawler:
            await crawler.crawl(url)
        return

    async def _extract_knowledge_base(self):
        """Download every SharePoint file and route it to the matching processor.

        Files with unsupported extensions are logged and skipped.
        """
        async with self.sharepoint_client() as sharepoint_client:
            files = await sharepoint_client.get_files()
            async with aiofiles.tempfile.TemporaryDirectory() as temp_dir:
                # Reuse one HTTP client for all downloads instead of
                # constructing a new client per file.
                async with httpx.AsyncClient() as client:
                    for file in files["value"]:
                        name = file["name"]
                        file_path = os.path.join(temp_dir, name)
                        response = await client.get(file['@microsoft.graph.downloadUrl'])
                        async with aiofiles.open(file_path, 'wb') as f:
                            await f.write(response.content)
                        if name.endswith((".txt", ".md")):
                            await self.process_text_file(file_path)
                        elif name.endswith(".pdf"):
                            await self.process_pdf_file(file_path)
                        # Was endswith("docx") — missing dot also matched
                        # names like "foodocx".
                        elif name.endswith(".docx"):
                            await self.process_docx_file(file_path)
                        else:
                            logger.error(f"Unsupported file format: {name}")
        return

    async def _update_database_from_file(self, file_paths):
        """Read each knowledge-base file, chunk it, and upsert the chunks."""
        for file_path in file_paths:
            async with aiofiles.open(file_path, 'r', encoding='utf-8') as f:
                content = await f.read()
            sentences = await self._split_text_with_overlap(content, 1024, 200)
            await self._update_database(sentences)
        return

    async def _clear_old_files(self, file_paths):
        """Truncate each knowledge-base file ahead of a fresh build."""
        for file_path in file_paths:
            async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
                await f.write("")
        return