File size: 2,738 Bytes
206ef5f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import tiktoken
from typing import List
import hashlib
import re
import sqlite3

import sqlite3
import json


def save_to_database(_id, data, *, db_path="utils/information.db"):
    """Store ``data`` as JSON text under ``_id`` in the ``json_data`` table.

    The table is created on first use. An existing row with the same
    ``_id`` is overwritten (``INSERT OR REPLACE``).

    Args:
        _id: Primary-key value for the row (the column is declared TEXT).
        data: Any object that ``json.dumps`` can serialize.
        db_path: Path of the SQLite file. Defaults to the location this
            function has always used, which is resolved relative to the
            current working directory.

    Raises:
        TypeError, ValueError: Propagated from ``json.dumps`` when ``data``
            cannot be serialized.
        sqlite3.Error: If the database cannot be opened or written.
    """
    # Serialize first so unserializable input fails before a connection
    # is ever opened.
    payload = json.dumps(data)

    conn = sqlite3.connect(db_path)
    try:
        # ``with conn`` commits on success and rolls back if a statement
        # raises; it does not close the connection, hence the ``finally``.
        with conn:
            conn.execute(
                """
    CREATE TABLE IF NOT EXISTS json_data (
        id TEXT PRIMARY KEY,
        data TEXT
    )
    """
            )
            conn.execute(
                """
    INSERT OR REPLACE INTO json_data (id, data)
    VALUES (?, ?)
    """,
                (_id, payload),
            )
    finally:
        conn.close()


def retrieve_from_database(_id, *, db_path="utils/information.db"):
    """Return the decoded JSON value stored under ``_id``, or ``None``.

    Args:
        _id: Primary-key value to look up in the ``json_data`` table.
        db_path: Path of the SQLite file. Defaults to the location this
            function has always used, which is resolved relative to the
            current working directory.

    Returns:
        The result of ``json.loads`` on the stored text, or ``None`` when
        no row has that id or the ``json_data`` table does not exist yet.
        A stored JSON ``null`` also decodes to ``None``.

    Raises:
        sqlite3.Error: For database failures other than a missing table.
    """
    conn = sqlite3.connect(db_path)
    try:
        row = conn.execute(
            "SELECT data FROM json_data WHERE id = ?", (_id,)
        ).fetchone()
    except sqlite3.OperationalError as exc:
        # A database that has never been written to has no table yet;
        # that is "not found", not an error. Anything else is re-raised.
        if "no such table" not in str(exc):
            raise
        row = None
    finally:
        # Close on every path, including when the query raises.
        conn.close()

    if row is None:
        return None
    return json.loads(row[0])


def generate_file_id(file_bytes: bytes) -> str:
    """Derive a 63-character hexadecimal ID from the start of a file.

    Only the first 4096 bytes are hashed (SHA-256), so two inputs that
    share that prefix get the same ID regardless of what follows.
    """
    head = file_bytes[:4096]
    digest = hashlib.sha256(head).hexdigest()
    # Drop the last of the 64 hex digits.
    return digest[:63]


def extract_content(text):
    """Return the text inside the first ``<report-chart>`` element of ``text``.

    The match is non-greedy and spans newlines, so the result is everything
    between the first opening tag and the nearest closing tag after it.

    Args:
        text: String to search.

    Returns:
        The enclosed text, without the surrounding tags.

    Raises:
        IndexError: If ``text`` contains no complete
            ``<report-chart>...</report-chart>`` pair.
    """
    # Only the first occurrence is needed, so stop at it instead of
    # collecting every match.
    found = re.search(r"<report-chart>(.*?)</report-chart>", text, re.DOTALL)
    if found is None:
        # Same exception type as before, now with an explanatory message.
        raise IndexError("no <report-chart>...</report-chart> block found in text")
    return found.group(1)


def CountTokens(texts: List[str]) -> List[int]:
    """
    Return the token count of each string in ``texts``, in input order.

    The strings are encoded as one batch with the tiktoken encoding
    registered for the "gpt-3.5-turbo" model.
    """
    encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
    return [len(token_ids) for token_ids in encoder.encode_batch(texts)]


def web_search_result_processor(output):
    """Return the ``<report-chart>`` payload embedded in a web-search result.

    This is a thin wrapper around ``extract_content``. An earlier version
    that rendered the report and its references from HTML to Markdown was
    disabled and has been removed from this docstring slot.

    Args:
        output: String expected to contain a
            ``<report-chart>...</report-chart>`` block.

    Returns:
        The text between the first pair of ``<report-chart>`` tags.

    Raises:
        IndexError: Propagated from ``extract_content`` when no such block
            is present.
    """
    return extract_content(output)