Spaces:
Runtime error
Runtime error
| import tiktoken | |
| from langchain_text_splitters import CharacterTextSplitter | |
| from langchain_chroma import Chroma | |
| from langchain_community.embeddings import HuggingFaceBgeEmbeddings | |
| from langchain.document_loaders import PyMuPDFLoader,Docx2txtLoader | |
| from transformers import pipeline | |
| from app_config import VECTOR_MAX_TOKENS, VECTORS_TOKEN_OVERLAP_SIZE | |
| from langchain.docstore.document import Document | |
| from dotenv import load_dotenv | |
| from pathlib import Path | |
| import os | |
# Load environment variables from the ``.env`` file addressed relative to the
# current directory.
env_path = Path('.', '.env')
load_dotenv(dotenv_path=env_path)

# Module-level tiktoken encoder shared by the token-counting helper below.
tokenizer = tiktoken.get_encoding('cl100k_base')
| # create the length function | |
def tiktoken_len(text):
    """Return the number of tokens in *text* under the module-level tokenizer.

    ``disallowed_special=()`` is passed so that text which happens to contain
    special-token markers is encoded as ordinary text instead of raising.

    Args:
        text: The string to tokenize.

    Returns:
        The length of the token sequence produced by ``tokenizer.encode``.
    """
    return len(tokenizer.encode(text, disallowed_special=()))
def get_vectorstore_with_doc_from_pdf(pdf_path):
    """Build a Chroma vector store from the contents of a PDF file.

    The file is loaded with ``PyMuPDFLoader``; the ``page_content`` of every
    loaded document is embedded with the ``BAAI/bge-small-en`` model (on CPU,
    with normalized embeddings) and stored through ``Chroma.from_texts``.
    The number of loaded documents is printed to stdout.

    Args:
        pdf_path: Location of the PDF, passed straight to ``PyMuPDFLoader``.

    Returns:
        The ``Chroma`` vector store created from the loaded texts.
    """
    embedder = HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-small-en",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )

    loaded_docs = PyMuPDFLoader(pdf_path).load()
    print(len(loaded_docs))

    # One text entry per loaded document; no further splitting is applied.
    page_texts = [loaded.page_content for loaded in loaded_docs]
    return Chroma.from_texts(texts=page_texts, embedding=embedder)
def get_vectorstore_with_doc_from_word(word_path):
    """Build a Chroma vector store from the contents of a Word document.

    The file is loaded with ``Docx2txtLoader``. The text of the first loaded
    document is split with a ``CharacterTextSplitter`` whose separator is the
    literal ``"Page :"``, and the resulting chunks are embedded with the
    ``BAAI/bge-small-en`` model (on CPU, with normalized embeddings) and
    stored through ``Chroma.from_texts``. Progress counts are printed to
    stdout.

    Args:
        word_path: Location of the Word file, passed straight to
            ``Docx2txtLoader``.

    Returns:
        The ``Chroma`` vector store created from the split chunks.
    """
    embedder = HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-small-en",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )

    loaded_docs = Docx2txtLoader(word_path).load()
    page_splitter = CharacterTextSplitter(separator="Page :")

    print(len(loaded_docs))
    print("all splits ........................")

    # Only the first loaded document is split and indexed.
    chunks = page_splitter.split_text(loaded_docs[0].page_content)
    print(len(chunks))

    return Chroma.from_texts(texts=chunks, embedding=embedder)