File size: 2,061 Bytes
b5acc22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import os
import requests

import cohere
# from openai import OpenAI


def generate_embeddings(texts: "str | list[str]"):
    """Embed one or more texts with Cohere's ``embed-multilingual-v3.0`` model.

    The request asks for ``float`` embeddings with
    ``input_type='classification'``. The API key is read from the
    ``COHERE_API_KEY`` environment variable on every call.

    Args:
        texts: The texts to embed. A single string is accepted for
            convenience and is sent as a one-element batch, because the
            Cohere client expects a sequence of strings rather than a bare
            string.

    Returns:
        ``response.embeddings.float`` from the Cohere response. Per the
        Cohere embed documentation this is a list of float vectors, one per
        input text — confirm against the SDK version in use.
    """
    if isinstance(texts, str):
        # A bare string is itself a sequence of characters; wrap it so the
        # client receives exactly one document instead of a malformed batch.
        texts = [texts]

    co = cohere.Client(os.getenv("COHERE_API_KEY"))
    response = co.embed(
        texts=texts,
        input_type="classification",
        embedding_types=["float"],
        model="embed-multilingual-v3.0",
    )
    return response.embeddings.float


def call_spellbook_api(endpoint: str, payload: dict, *, timeout=None):
    """POST ``payload`` as JSON to a Spellbook endpoint and return the JSON reply.

    The target URL is ``SPELLBOOK_BASE_URL + endpoint`` when that environment
    variable is set; otherwise ``endpoint`` is used as the full URL. The
    request carries a bearer token taken from ``SPELLBOOK_API_KEY``.

    Args:
        endpoint: Path appended to the base URL, or a complete URL when no
            base URL is configured.
        payload: JSON-serialisable request body.
        timeout: Optional timeout forwarded to ``requests`` (seconds, or a
            ``(connect, read)`` tuple). The default ``None`` keeps the
            previous behaviour of waiting indefinitely; pass a value to stop
            a stalled server from blocking the caller forever.

    Returns:
        The response body decoded with ``response.json()``.

    Note:
        The HTTP status code is not checked: the body is decoded and returned
        for error responses as well, and ``response.json()`` raises if the
        body is not valid JSON.
    """
    base_url = os.environ.get("SPELLBOOK_BASE_URL")
    api_key = os.environ.get("SPELLBOOK_API_KEY")

    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "authorization": f"Bearer {api_key}",
    }
    url = base_url + endpoint if base_url else endpoint
    response = requests.post(url, json=payload, headers=headers, timeout=timeout)
    return response.json()


def format_page(page):
    """Rebuild a page's text line by line with words ordered right-to-left.

    Adapted for Arabic from:
    https://stackoverflow.com/questions/78200728/how-to-avoid-pymupdf-fitz-interpreting-large-gaps-between-words-as-a-newline-c

    Words are grouped into one visual line while each word's bottom
    coordinate (index 3) stays within a small tolerance of the previous
    word's. Each line is then emitted with its words sorted by descending
    left coordinate (index 0), joined by single spaces.

    Args:
        page: Object exposing ``get_text("words", sort=True)`` that returns
            word tuples where index 0 is the left x coordinate, index 3 the
            bottom y coordinate and index 4 the word text (presumably a
            PyMuPDF page — confirm with the caller).

    Returns:
        A ``(is_empty, text)`` tuple. ``(True, "")`` when the page has no
        words; otherwise ``(False, text)`` where ``text`` holds one line per
        visual line followed by a form-feed line (``chr(12)``).
    """
    words = page.get_text("words", sort=True)  # sorted vertically, then horizontally
    if not words:
        return True, ""

    # Maximum difference in bottom coordinate for two consecutive words to
    # still be considered part of the same visual line.
    same_line_tolerance = 3

    def render(line_words):
        # Right-to-left reading order: largest left coordinate first.
        ordered = sorted(line_words, key=lambda w: w[0], reverse=True)
        return " ".join(w[4] for w in ordered)

    rendered = []
    line = [words[0]]  # words collected for the current visual line
    for word in words[1:]:
        if abs(line[-1][3] - word[3]) <= same_line_tolerance:
            line.append(word)
        else:
            rendered.append(render(line))
            line = [word]

    # The final line gets the same x-based ordering as every other line.
    # Merely reversing it is wrong when its words have slightly different
    # bottom coordinates, since the input is then not in x order.
    rendered.append(render(line))
    rendered.append(chr(12))  # form feed closes the page
    return False, "\n".join(rendered) + "\n"