"""Utility helpers: JSON persistence in SQLite, file IDs, and token counting."""

import hashlib
import json
import re
import sqlite3
from contextlib import closing
from typing import List

import tiktoken

# Single source of truth for the database location used by the helpers below.
_DATABASE_PATH = "utils/information.db"


def save_to_database(_id, data):
    """Store *data* as JSON under *_id*, replacing any existing row.

    The ``json_data`` table is created on first use.

    Args:
        _id: Primary-key value for the row.
        data: Any object accepted by ``json.dumps``.

    Raises:
        TypeError: If *data* is not JSON-serializable.
        sqlite3.Error: If the database cannot be opened or written.
    """
    # Serialize first so an unserializable payload fails before the
    # database is touched.
    payload = json.dumps(data)

    # ``closing`` guarantees the connection is released even on error; the
    # inner ``with conn`` commits on success and rolls back on exception
    # (it does not close the connection by itself).
    with closing(sqlite3.connect(_DATABASE_PATH)) as conn:
        with conn:
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS json_data (
                    id TEXT PRIMARY KEY,
                    data TEXT
                )
                """
            )
            conn.execute(
                """
                INSERT OR REPLACE INTO json_data (id, data)
                VALUES (?, ?)
                """,
                (_id, payload),
            )


def retrieve_from_database(_id):
    """Return the decoded JSON stored under *_id*, or ``None`` if absent.

    ``None`` is also returned when the ``json_data`` table has not been
    created yet (i.e. nothing has ever been saved).

    Args:
        _id: Primary-key value to look up.

    Raises:
        sqlite3.Error: For database failures other than a missing table.
    """
    with closing(sqlite3.connect(_DATABASE_PATH)) as conn:
        try:
            row = conn.execute(
                "SELECT data FROM json_data WHERE id = ?", (_id,)
            ).fetchone()
        except sqlite3.OperationalError as exc:
            # A fresh database has no table yet: treat that as "not found"
            # instead of crashing; anything else is a real error.
            if "no such table" not in str(exc):
                raise
            return None

    if row:
        return json.loads(row[0])
    return None


def generate_file_id(file_bytes: bytes) -> str:
    """Generate a file ID from the leading bytes of a file.

    Only the first 4096 bytes are hashed (SHA-256), and the hex digest is
    truncated to 63 characters, so files sharing the same first 4096 bytes
    receive the same ID.

    Args:
        file_bytes: Raw file contents.

    Returns:
        A 63-character lowercase hexadecimal string.
    """
    return hashlib.sha256(file_bytes[:4096]).hexdigest()[:63]


def extract_content(text):
    """Return the first match of the extraction pattern in *text*.

    NOTE(review): as written, the pattern ``(.*?)`` has no delimiters and
    matches the empty string at position 0, so this function always returns
    ``""``. The surrounding delimiters (e.g. opening/closing tags) appear to
    be missing - TODO confirm the intended pattern and restore it.

    Args:
        text: String to search.

    Returns:
        The first captured group found in *text*.
    """
    pattern = r"(.*?)"
    matches = re.findall(pattern, text, re.DOTALL)
    return matches[0]


def CountTokens(texts: List[str], model: str = "gpt-3.5-turbo") -> List[int]:
    """Calculate the number of tokens in a batch of strings.

    Args:
        texts: Strings to tokenize.
        model: Model name passed to ``tiktoken.encoding_for_model``.
            Defaults to ``"gpt-3.5-turbo"``.

    Returns:
        One token count per input string, in the same order as *texts*.
    """
    encoder = tiktoken.encoding_for_model(model)
    return [len(tokens) for tokens in encoder.encode_batch(texts)]


def web_search_result_processor(output):
    """Return the content extracted from a web-search *output* string.

    This is a thin wrapper around ``extract_content``.

    Args:
        output: Text to extract from; it is passed to ``re.findall`` and so
            must be a string.

    Returns:
        Whatever ``extract_content(output)`` returns.
    """
    # NOTE: a disabled legacy implementation used to live here as a string
    # literal. It read output["report"] and output["references"], converted
    # them to Markdown with html2text (which this module does not import),
    # and concatenated the results. It was never executed.
    return extract_content(output)