Spaces:
Sleeping
Sleeping
| from langchain_community.document_loaders import WebBaseLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_chroma import Chroma | |
| from langchain_openai import OpenAIEmbeddings | |
| from PyPDF2 import PdfReader | |
def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every PDF.

    Args:
        pdf_docs: Iterable of PDF sources. Each item is handed unchanged
            to ``PdfReader``.

    Returns:
        One string holding the page texts in document order, then page
        order, with no separator inserted between pages. An empty
        ``pdf_docs`` yields ``""``.
    """
    # Join the fragments once instead of growing a string with ``+=`` for
    # every page. ``or ""`` is defensive: a page whose extraction yields
    # nothing is treated as empty text rather than breaking the join.
    return "".join(
        page.extract_text() or ""
        for pdf in pdf_docs
        for page in PdfReader(pdf).pages
    )
def loadUrlData(url, *, verify=False):
    """Load a web page through ``WebBaseLoader`` and return the result.

    Args:
        url: Address passed to ``WebBaseLoader``.
        verify: Value stored under the ``'verify'`` key of the loader's
            ``requests_kwargs``. Defaults to ``False`` so existing callers
            keep the behaviour they had before this parameter existed.

    Returns:
        Whatever ``loader.load()`` returns.
    """
    loader = WebBaseLoader(url)
    # SECURITY NOTE(review): with verify=False the HTTP request presumably
    # skips TLS certificate validation. Pass verify=True (or a CA bundle
    # path) for any host that is not explicitly trusted, and consider
    # flipping the default once callers have been audited.
    loader.requests_kwargs = {'verify': verify}
    return loader.load()
def splitDoc(data):
    """Split documents into overlapping chunks.

    Args:
        data: Documents forwarded to
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        The value returned by ``split_documents`` for a splitter
        configured with ``chunk_size=1000``, ``chunk_overlap=200`` and
        ``add_start_index=True``.
    """
    splitter_options = {
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "add_start_index": True,
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    return chunker.split_documents(data)
def splitText(data):
    """Split a raw string into overlapping chunks.

    Args:
        data: Text forwarded to
            ``RecursiveCharacterTextSplitter.split_text``.

    Returns:
        The value returned by ``split_text`` for a splitter configured
        with ``chunk_size=400`` and ``chunk_overlap=50``, measuring length
        with ``len`` and treating separators as literal strings
        (``is_separator_regex=False``).
    """
    splitter_options = {
        "chunk_size": 400,
        "chunk_overlap": 50,
        "length_function": len,
        "is_separator_regex": False,
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    return chunker.split_text(data)
def vectorize(data, type):
    """Chunk ``data`` and index the chunks in a Chroma vector store.

    Args:
        data: Input to index. Passed to ``splitDoc`` when ``type`` is
            ``"document"`` and to ``splitText`` when ``type`` is
            ``"text"``.
        type: Either ``"document"`` or ``"text"``. The name shadows the
            builtin ``type``; it is kept because callers may pass it by
            keyword.

    Returns:
        The result of ``Chroma.from_documents`` (for ``"document"``) or
        ``Chroma.from_texts`` (for ``"text"``), built with a fresh
        ``OpenAIEmbeddings()``.

    Raises:
        ValueError: If ``type`` is neither ``"document"`` nor ``"text"``.
            Previously this case fell through and returned ``None``,
            which only surfaced later as an unrelated error in the caller.
    """
    if type == "document":
        return Chroma.from_documents(
            documents=splitDoc(data),
            embedding=OpenAIEmbeddings(),
        )
    if type == "text":
        return Chroma.from_texts(
            texts=splitText(data),
            embedding=OpenAIEmbeddings(),
        )
    raise ValueError(
        f"unsupported type {type!r}; expected 'document' or 'text'"
    )