# sifars-chatbot-demo / src/crawler/_database_updater.py
# Author: Aryan Jain
# Commit 46d624e: add main.py for server initialization; update file handling
# in DatabaseUpdater and improve context retrieval in ToolCall
import os
import aiofiles
import httpx
from src.utils import PineconeClient, SharepointClient
from src.crawler import WebCrawler
import tiktoken
import pymupdf4llm
from docx import Document
from src.utils import logger
class DatabaseUpdater:
    """Refresh the Pinecone vector database from SharePoint files and crawled pages.

    Workflow: download source documents from SharePoint, normalize them into
    local ``knowledge_base.*`` text files, chunk the text with token overlap,
    and upsert the chunks to Pinecone in batches.
    """

    def __init__(self):
        # Model name used only to pick the matching tiktoken encoding.
        self.model_name = 'gpt-3.5-turbo'
        # NOTE: these attributes hold the client *classes*, not instances;
        # instances are created per-operation via ``async with self.xxx() as ...``.
        self.pinecone_client = PineconeClient
        self.web_crawler = WebCrawler
        self.sharepoint_client = SharepointClient

    async def __aenter__(self):
        # Tokenizer construction is deferred to context entry; methods that
        # chunk text assume the updater is used as an async context manager.
        self.tokenizer = tiktoken.encoding_for_model(self.model_name)
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        # Nothing to release: all clients are opened and closed per call.
        pass

    async def _split_text_with_overlap(self, text, token_limit, overlap_size):
        """Split ``text`` into chunks of at most ``token_limit`` tokens, with
        consecutive chunks sharing ``overlap_size`` tokens.

        Returns a list of decoded text chunks ([] for empty input).
        Raises ValueError when ``overlap_size >= token_limit`` — the previous
        implementation looped forever in that case (step of 0 or negative).
        """
        step = token_limit - overlap_size
        if step <= 0:
            raise ValueError("overlap_size must be smaller than token_limit")
        tokens = self.tokenizer.encode(text)
        chunks = []
        for start in range(0, len(tokens), step):
            chunks.append(tokens[start:start + token_limit])
            # Stop once the end of the text is covered; continuing would
            # emit trailing chunks already fully contained in this one.
            if start + token_limit >= len(tokens):
                break
        return [self.tokenizer.decode(chunk) for chunk in chunks]

    async def _update_database(self, sentences, batch_size=95):
        """Upsert ``sentences`` into Pinecone in batches.

        ``batch_size`` defaults to 95 to stay safely under Pinecone's
        per-request upsert limit (~100 vectors).
        """
        async with self.pinecone_client() as pinecone_client:
            for i in range(0, len(sentences), batch_size):
                await pinecone_client._upsert(sentences[i:i + batch_size])
        return

    async def _delete_old_database(self):
        """Drop the existing Pinecone index ahead of a full refresh."""
        async with self.pinecone_client() as pinecone_client:
            await pinecone_client._delete_index()
        return

    async def process_text_file(self, file_path):
        """Append the contents of a .txt/.md file to knowledge_base.txt."""
        async with aiofiles.open(file_path, 'r', encoding='utf-8') as f:
            content = await f.read()
        async with aiofiles.open("knowledge_base.txt", 'a+', encoding='utf-8') as f:
            await f.write(content)
            await f.write("\n")
        return

    async def process_pdf_file(self, file_path):
        """Convert a PDF to markdown per page and append it to knowledge_base.pdf."""
        # NOTE(review): pymupdf4llm.to_markdown is synchronous and will block
        # the event loop for large PDFs — consider run_in_executor.
        docs = pymupdf4llm.to_markdown(file_path, page_chunks=True)
        async with aiofiles.open("knowledge_base.pdf", 'a+', encoding='utf-8') as f:
            for doc in docs:
                await f.write(doc.get("text"))
                await f.write("\n")
        return

    async def process_docx_file(self, file_path):
        """Append each paragraph of a .docx file to knowledge_base.docx."""
        doc = Document(file_path)
        async with aiofiles.open("knowledge_base.docx", 'a+', encoding='utf-8') as f:
            for paragraph in doc.paragraphs:
                await f.write(paragraph.text)
                await f.write("\n")
        return

    async def _extract_scraped_data(self, url):
        """Crawl ``url``; persistence of scraped pages is handled by WebCrawler."""
        async with self.web_crawler(url) as crawler:
            await crawler.crawl(url)
        return

    async def _extract_knowledge_base(self):
        """Download every SharePoint file and route it to the matching processor.

        Files with unsupported extensions are logged and skipped.
        """
        async with self.sharepoint_client() as sharepoint_client:
            files = await sharepoint_client.get_files()
            async with aiofiles.tempfile.TemporaryDirectory() as temp_dir:
                # Reuse one HTTP client for all downloads instead of
                # constructing a new client per file.
                async with httpx.AsyncClient() as client:
                    for file in files["value"]:
                        name = file["name"]
                        file_path = os.path.join(temp_dir, name)
                        response = await client.get(file['@microsoft.graph.downloadUrl'])
                        async with aiofiles.open(file_path, 'wb') as f:
                            await f.write(response.content)
                        if name.endswith((".txt", ".md")):
                            await self.process_text_file(file_path)
                        elif name.endswith(".pdf"):
                            await self.process_pdf_file(file_path)
                        # Was endswith("docx") — missing dot also matched
                        # names like "foodocx".
                        elif name.endswith(".docx"):
                            await self.process_docx_file(file_path)
                        else:
                            logger.error(f"Unsupported file format: {name}")
        return

    async def _update_database_from_file(self, file_paths):
        """Read each knowledge-base file, chunk it, and upsert the chunks."""
        for file_path in file_paths:
            async with aiofiles.open(file_path, 'r', encoding='utf-8') as f:
                content = await f.read()
            sentences = await self._split_text_with_overlap(content, 1024, 200)
            await self._update_database(sentences)
        return

    async def _clear_old_files(self, file_paths):
        """Truncate each knowledge-base file ahead of a fresh build."""
        for file_path in file_paths:
            async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
                await f.write("")
        return