""" Contains Utility functions for LLM and Database module. Along with some other misllaneous functions. """ from pymupdf import pymupdf from docx import Document from langchain_text_splitters import RecursiveCharacterTextSplitter import tiktoken import base64 import hashlib import ollama from typing import List from openai import OpenAI import os TOGETHER_API = str(os.getenv("TOGETHER_API_KEY")) def get_preview_pdf(file_bytes: bytes): """Returns first 3 pages of a PDF file.""" doc = pymupdf.open(stream=file_bytes, filetype="pdf") sliced_doc = pymupdf.open() sliced_doc.insert_pdf(doc, from_page=0, to_page=2) return sliced_doc.tobytes() def count_tokens(string: str) -> int: """Returns number of tokens in inputted string.""" tokenizer = tiktoken.get_encoding("cl100k_base") return len(tokenizer.encode(text=string)) def create_refrences(retrieved_docs): """Create a refrences of chunks/pecies used in generating reponse, in markdown format""" refrences = "" for doc in retrieved_docs: try: chunk_imgs = eval(doc["metadata"]["images"]) except: chunk_imgs = None chunk = doc["document"] if chunk_imgs: chunk_split = chunk.split("\n" + chunk_split[i + 1][3:] ) else: chunk_with_img = chunk refrences += ( f"###### {doc['metadata']['file_name']}\n\n{chunk_with_img}\n\n" ) else: chunk = doc["document"] refrences += f"###### {doc['metadata']['file_name']}\n\n{chunk}\n\n**Distance : {doc['distance']}**\n\n" return refrences def generate_file_id(file_bytes): """Generate a Unique file ID for given file.""" hash_obj = hashlib.sha256() hash_obj.update(file_bytes[:4096]) file_id = hash_obj.hexdigest()[:63] return str(file_id) def extract_content_from_docx(docx_content): """Extract content (text) from DOCX file""" doc = Document(docx_content) full_text = [] for para in doc.paragraphs: full_text.append(para.text) content = "\n".join(full_text) return content def extract_content_from_pdf(pdf_content): """Extereact content (Image + text) from PDF files.""" doc = pymupdf.open(stream=pdf_content, filetype="pdf") DOCUMENT = "" pil_images = [] for page in doc: blocks = page.get_text_blocks() # type: ignore images = page.get_images() # type: ignore # Create a list of all elements (text blocks and images) with their positions elements = [(block[:4], block[4], "text") for block in blocks] img_list = [] for img in images: try: img_bbox = page.get_image_rects(img[0])[0] # type: ignore if len(img_bbox) > 0: img_data = (img_bbox, img[0], "image") img_list.append(img_data) else: continue except Exception as e: print("Exception :", e) pass elements.extend(img_list) # Sort elements by their vertical position (top coordinate) elements.sort(key=lambda x: x[0][1]) for element in elements: if element[2] == "text": DOCUMENT += element[1] else: xref = element[1] base_image = doc.extract_image(xref) image_bytes = base_image["image"] # Save the image image = image_bytes pil_images.append(image) DOCUMENT += f"\n\n\n" return DOCUMENT, pil_images def chunk_document(document, chunk_size=200, overlap=10, encoding_name="cl100k_base"): """Split/Chunk Document with Recursive splitting strategy""" splitter = RecursiveCharacterTextSplitter( separators=["\n\n", "\n", " ", ""], keep_separator=True ).from_tiktoken_encoder( encoding_name=encoding_name, chunk_size=chunk_size, chunk_overlap=overlap ) chunks = splitter.split_text(document) return chunks def generate_embedding_ollama( texts: List[str], embedding_model: str ) -> List[List[float]]: """Generate Embeddings for the givien pieces of texts.""" embeddings = [] for text in texts: embedding = ollama.embeddings(model=embedding_model, prompt=text)["embedding"] embeddings.append(list(embedding)) return embeddings def generate_embedding(texts: List[str], embedding_model: str) -> List[List[float]]: """Generate Embeddings for the givien pieces of texts.""" client = OpenAI(api_key=TOGETHER_API, base_url="https://api.together.xyz/v1") embeddings_response = client.embeddings.create( input=texts, model="BAAI/bge-large-en-v1.5" ).data embeddings = [i.embedding for i in embeddings_response] return embeddings