Spaces:
Sleeping
Sleeping
Aryan Jain
add main.py for server initialization; update file handling in DatabaseUpdater and improve context retrieval in ToolCall
46d624e | import os | |
| import aiofiles | |
| import httpx | |
| from src.utils import PineconeClient, SharepointClient | |
| from src.crawler import WebCrawler | |
| import tiktoken | |
| import pymupdf4llm | |
| from docx import Document | |
| from src.utils import logger | |
class DatabaseUpdater:
    """Rebuild the Pinecone vector index from SharePoint documents and crawled pages.

    Workflow: download each SharePoint file, convert it to text, append it to a
    per-format "knowledge_base" file, then chunk those files with token overlap
    and upsert the chunks to Pinecone in batches.
    """

    # Pinecone upsert batch size — kept just under the API's per-request limit.
    UPSERT_BATCH_SIZE = 95

    def __init__(self):
        self.model_name = 'gpt-3.5-turbo'
        # Stored as factories (classes, not instances): every operation opens a
        # fresh async context so connections are not shared across calls.
        self.pinecone_client = PineconeClient
        self.web_crawler = WebCrawler
        self.sharepoint_client = SharepointClient

    async def __aenter__(self):
        # Tokenizer load is deferred to context entry (first load can be slow).
        self.tokenizer = tiktoken.encoding_for_model(self.model_name)
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        # Nothing to release: each client is opened and closed per operation.
        pass

    async def _split_text_with_overlap(self, text, token_limit, overlap_size):
        """Split *text* into chunks of at most *token_limit* tokens, with
        *overlap_size* tokens shared between consecutive chunks.

        Returns a list of decoded text chunks ([] for empty input).

        Raises:
            ValueError: if overlap_size >= token_limit — the original code
                looped forever in that case because the step was <= 0.
        """
        if overlap_size >= token_limit:
            raise ValueError("overlap_size must be smaller than token_limit")
        tokens = self.tokenizer.encode(text)
        step = token_limit - overlap_size
        chunks = [tokens[i:i + token_limit] for i in range(0, len(tokens), step)]
        return [self.tokenizer.decode(chunk) for chunk in chunks]

    async def _update_database(self, sentences):
        """Upsert *sentences* (text chunks) to Pinecone in bounded batches."""
        async with self.pinecone_client() as pinecone_client:
            for i in range(0, len(sentences), self.UPSERT_BATCH_SIZE):
                await pinecone_client._upsert(sentences[i:i + self.UPSERT_BATCH_SIZE])

    async def _delete_old_database(self):
        """Drop the existing Pinecone index ahead of a full rebuild."""
        async with self.pinecone_client() as pinecone_client:
            await pinecone_client._delete_index()

    async def process_text_file(self, file_path):
        """Append the contents of a .txt/.md file to knowledge_base.txt."""
        async with aiofiles.open(file_path, 'r', encoding='utf-8') as f:
            content = await f.read()
        async with aiofiles.open("knowledge_base.txt", 'a+', encoding='utf-8') as f:
            await f.write(content)
            await f.write("\n")

    async def process_pdf_file(self, file_path):
        """Convert a PDF to markdown and append it to knowledge_base.pdf.

        NOTE(review): despite the .pdf name, the output file contains markdown
        text, not PDF data — consider renaming it.
        """
        docs = pymupdf4llm.to_markdown(file_path, page_chunks=True)
        async with aiofiles.open("knowledge_base.pdf", 'a+', encoding='utf-8') as f:
            for doc in docs:
                # Pages with no extractable text yield None; write "" instead
                # of crashing f.write with a non-string.
                await f.write(doc.get("text") or "")
                await f.write("\n")

    async def process_docx_file(self, file_path):
        """Extract paragraph text from a .docx file into knowledge_base.docx."""
        doc = Document(file_path)
        async with aiofiles.open("knowledge_base.docx", 'a+', encoding='utf-8') as f:
            for paragraph in doc.paragraphs:
                await f.write(paragraph.text)
                await f.write("\n")

    async def _extract_scraped_data(self, url):
        """Crawl *url*; the crawler is responsible for persisting its output."""
        async with self.web_crawler(url) as crawler:
            await crawler.crawl(url)

    async def _extract_knowledge_base(self):
        """Download every SharePoint file and route it to a per-format processor.

        Unsupported formats are logged and skipped, never raised.
        """
        async with self.sharepoint_client() as sharepoint_client:
            files = await sharepoint_client.get_files()
            async with aiofiles.tempfile.TemporaryDirectory() as temp_dir:
                # One HTTP client for all downloads (was re-created per file).
                async with httpx.AsyncClient() as client:
                    for file in files["value"]:
                        name = file["name"]
                        file_path = os.path.join(temp_dir, name)
                        response = await client.get(file['@microsoft.graph.downloadUrl'])
                        async with aiofiles.open(file_path, 'wb') as f:
                            await f.write(response.content)
                        if name.endswith((".txt", ".md")):
                            await self.process_text_file(file_path)
                        elif name.endswith(".pdf"):
                            await self.process_pdf_file(file_path)
                        # BUG FIX: was endswith("docx") (no dot), which also
                        # matched names like "notadocx" with no extension.
                        elif name.endswith(".docx"):
                            await self.process_docx_file(file_path)
                        else:
                            logger.error(f"Unsupported file format: {file['name']}")

    async def _update_database_from_file(self, file_paths):
        """Chunk each knowledge-base file and upsert its chunks to Pinecone."""
        for file_path in file_paths:
            async with aiofiles.open(file_path, 'r', encoding='utf-8') as f:
                content = await f.read()
            sentences = await self._split_text_with_overlap(content, 1024, 200)
            await self._update_database(sentences)

    async def _clear_old_files(self, file_paths):
        """Truncate the accumulated knowledge-base files for the next run."""
        for file_path in file_paths:
            async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
                await f.write("")