|
|
import os |
|
|
import requests |
|
|
|
|
|
import cohere |
|
|
|
|
|
|
|
|
|
|
|
def generate_embeddings(texts: "str | list[str]"):
    """Embed one or more texts with Cohere's ``embed-multilingual-v3.0`` model.

    The Cohere client is created on every call using the ``COHERE_API_KEY``
    environment variable, and the request asks for ``float`` embeddings with
    ``input_type='classification'``.

    Args:
        texts: The texts to embed. The value is forwarded as the ``texts``
            argument of ``co.embed``, which is a collection of documents; a
            single string is accepted for convenience and is embedded as one
            document.

    Returns:
        The ``float`` field of the response's ``embeddings`` object
        (presumably one vector per input text -- confirm against the
        installed cohere SDK version).
    """
    # The original signature advertised ``str`` but passed the value straight
    # through as the documents collection; wrap a lone string so it is sent as
    # a single document rather than as a bare string.
    if isinstance(texts, str):
        texts = [texts]

    co = cohere.Client(os.getenv('COHERE_API_KEY'))
    response = co.embed(
        texts=texts,
        input_type='classification',
        embedding_types=['float'],
        model='embed-multilingual-v3.0',
    )
    # NOTE(review): some cohere SDK releases expose this field as ``float_``
    # (aliased to "float") -- confirm against the pinned SDK version.
    return response.embeddings.float
|
|
|
|
|
|
|
|
def call_spellbook_api(endpoint: str, payload: dict):
    """POST ``payload`` as JSON to a Spellbook endpoint and return the parsed reply.

    The target URL is ``SPELLBOOK_BASE_URL`` concatenated with ``endpoint``;
    when that environment variable is unset or empty, ``endpoint`` is used as
    the complete URL. ``SPELLBOOK_API_KEY`` is sent as a bearer token.

    Args:
        endpoint: Path appended to the base URL (or the full URL when no base
            URL is configured).
        payload: Body serialised as JSON by ``requests``.

    Returns:
        Whatever ``response.json()`` decodes from the reply body.
    """
    base_url = os.environ.get("SPELLBOOK_BASE_URL")
    api_key = os.environ.get("SPELLBOOK_API_KEY")

    # Plain string concatenation: the caller is responsible for the slash
    # between the base URL and the endpoint.
    if base_url:
        target = base_url + endpoint
    else:
        target = endpoint

    request_headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "authorization": f"Bearer {api_key}",
    }

    # NOTE(review): no timeout is passed and the HTTP status is not checked,
    # so error replies are returned to the caller as decoded JSON.
    reply = requests.request(
        "POST",
        target,
        json=payload,
        headers=request_headers,
    )
    return reply.json()
|
|
|
|
|
|
|
|
def format_page(page):
    """Rebuild a page's text line by line, ordering each line's words in descending horizontal order.

    Words come from ``page.get_text("words", sort=True)`` (presumably a
    PyMuPDF page -- confirm against the caller). Each word is indexed as a
    sequence: index 0 is the horizontal sort key, index 3 the coordinate used
    to decide whether two words share a line, and index 4 the word text.

    Consecutive words whose index-3 values differ by at most 3 are grouped
    into one line. Every line -- including the last one -- is emitted with
    its words sorted by index 0 in descending order (right-to-left reading
    order), joined by single spaces and terminated by a newline. A form feed
    (``chr(12)``) followed by a newline closes the page.

    Args:
        page: Object exposing ``get_text("words", sort=True)``.

    Returns:
        ``(True, "")`` when the page has no words, otherwise
        ``(False, page_content)``.
    """
    words = page.get_text("words", sort=True)
    if not words:
        return True, ""

    def render_line(line_words):
        # Stable sort, so words sharing the same index-0 value keep their
        # incoming order.
        ordered = sorted(line_words, key=lambda w: w[0], reverse=True)
        return " ".join(w[4] for w in ordered)

    rendered_lines = []
    current_line = [words[0]]
    for word in words[1:]:
        previous = current_line[-1]
        if abs(previous[3] - word[3]) <= 3:
            # Close enough vertically to belong to the line being built.
            current_line.append(word)
        else:
            rendered_lines.append(render_line(current_line))
            current_line = [word]

    # The trailing line goes through the same ordering as every other line.
    # It used to be emitted as a plain reversal of arrival order, which is
    # only equivalent when the words already arrive in ascending order.
    rendered_lines.append(render_line(current_line))

    page_content = "".join(text + "\n" for text in rendered_lines)
    page_content += chr(12) + "\n"
    return False, page_content
|
|
|