AraRAG / utils.py
Yaser Abdelaziz
Initial commit
b5acc22
import os
import requests
import cohere
# from openai import OpenAI
def generate_embeddings(texts: "str | list[str]", input_type: str = 'classification'):
    """Embed one or more texts with Cohere's ``embed-multilingual-v3.0`` model.

    A new ``cohere.Client`` is created on every call using the
    ``COHERE_API_KEY`` environment variable.

    Args:
        texts: The strings to embed. A single string is accepted and treated
            as a one-element batch; any other iterable is converted to a list.
        input_type: Forwarded unchanged as the ``input_type`` argument of
            ``co.embed``. Defaults to ``'classification'``, the value this
            function previously hard-coded.
            NOTE(review): for retrieval use cases Cohere documents dedicated
            input types -- confirm which one callers actually need.

    Returns:
        ``response.embeddings.float`` from the Cohere response (presumably one
        float vector per input text, in input order -- confirm against the
        Cohere SDK). An empty input returns ``[]`` without calling the API.
    """
    # A bare string must be wrapped, otherwise it is sent where a list of
    # strings is expected.
    batch = [texts] if isinstance(texts, str) else list(texts)
    if not batch:
        # Nothing to embed: skip the network round trip.
        return []

    co = cohere.Client(os.getenv('COHERE_API_KEY'))
    response = co.embed(
        texts=batch,
        input_type=input_type,
        embedding_types=['float'],
        model='embed-multilingual-v3.0',
    )
    return response.embeddings.float
def call_spellbook_api(endpoint: str, payload: dict, timeout: "float | None" = 120):
    """POST ``payload`` as JSON to a Spellbook endpoint and return the JSON reply.

    The target URL is ``SPELLBOOK_BASE_URL + endpoint`` when the
    ``SPELLBOOK_BASE_URL`` environment variable is set and non-empty;
    otherwise ``endpoint`` is used as the full URL.

    Args:
        endpoint: Path appended to the base URL, or a complete URL when no
            base URL is configured.
        payload: JSON-serialisable request body.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Pass ``None`` to wait indefinitely (the behaviour
            before this parameter existed).

    Returns:
        The response body decoded with ``response.json()``. The HTTP status
        code is not checked, so an error body is returned like any other.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
        ValueError: If the response body is not valid JSON (raised by
            ``response.json()``).
    """
    base_url = os.environ.get("SPELLBOOK_BASE_URL")
    api_key = os.environ.get("SPELLBOOK_API_KEY")

    # NOTE: if SPELLBOOK_API_KEY is unset the header value is the literal
    # string "Bearer None"; the request is still sent.
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "authorization": f"Bearer {api_key}",
    }

    url = base_url + endpoint if base_url else endpoint
    # Without an explicit timeout a stalled server would block the caller forever.
    response = requests.post(url, json=payload, headers=headers, timeout=timeout)
    return response.json()
def _join_right_to_left(line_words):
    """Join the text of ``line_words`` ordered by descending left x-coordinate."""
    ordered = sorted(line_words, key=lambda word: word[0], reverse=True)
    return " ".join(word[4] for word in ordered)


def format_page(page, line_tolerance=3):
    """Rebuild the text of a page line by line in right-to-left word order.

    reference (modified version for Arabic): https://stackoverflow.com/questions/78200728/how-to-avoid-pymupdf-fitz-interpreting-large-gaps-between-words-as-a-newline-c

    Words come from ``page.get_text("words", sort=True)``. Each word is
    indexed as a sequence where item 0 is the left x-coordinate, item 3 the
    bottom coordinate and item 4 the word text. Consecutive words whose bottom
    coordinates differ by at most ``line_tolerance`` are put on the same line,
    so large horizontal gaps never start a new line.

    Args:
        page: Object exposing ``get_text("words", sort=True)`` (a PyMuPDF page
            in the referenced answer).
        line_tolerance: Maximum difference between the bottom coordinates of
            two consecutive words for them to share a line. Defaults to 3,
            the previously hard-coded value.

    Returns:
        A tuple ``(is_empty, page_content)``. ``is_empty`` is ``True`` and
        ``page_content`` is ``""`` when the page has no words. Otherwise
        ``is_empty`` is ``False`` and ``page_content`` holds one
        newline-terminated line per detected text line, followed by a form
        feed (``chr(12)``) and a final newline.
    """
    words = page.get_text("words", sort=True)  # words sorted vertical, then horizontal
    if not words:
        return True, ""

    rendered_lines = []
    current_line = [words[0]]  # words collected for the line being built
    for word in words[1:]:
        previous = current_line[-1]
        if abs(previous[3] - word[3]) <= line_tolerance:
            # Approximately the same bottom coordinate: still the same line.
            current_line.append(word)
        else:
            rendered_lines.append(_join_right_to_left(current_line))
            current_line = [word]

    # The final line is ordered exactly like the others. Merely reversing it
    # is wrong when its words have slightly different bottom coordinates,
    # because then the incoming order is not left-to-right.
    rendered_lines.append(_join_right_to_left(current_line))

    page_content = "".join(text + "\n" for text in rendered_lines)
    page_content += chr(12) + "\n"
    return False, page_content