# knowbot / extract_text.py
# Decim@97
# Knowbot first commit
# 04e75ed
from pypdf import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
import re
import os
def extract_text_from_pdf(file_path: str) -> str:
    """Return the text of every page of the PDF at *file_path*, concatenated.

    Args:
        file_path: Path handed to ``PdfReader``.

    Returns:
        The page texts joined in page order with no separator. A page whose
        ``extract_text()`` result is falsy (``None`` or an empty string)
        contributes nothing.
    """
    reader = PdfReader(file_path)
    # Join once rather than growing a string with ``+=`` for every page.
    return "".join(page.extract_text() or "" for page in reader.pages)
def pdf_to_documents(
    file_path: str,
    database_name: str,
    collection_name: str,
    embeddings: OpenAIEmbeddings,
    chunk_size=1000,
    chunk_overlap=200,
    metadata: dict = None,
):
    """Split a PDF's text into chunks and store them in a Chroma collection.

    The extracted text is cleaned first: every character that is not an
    ASCII letter, a digit, whitespace, or one of the punctuation marks
    period, comma, exclamation mark, question mark, semicolon, colon,
    single quote, double quote or parenthesis is removed.

    Args:
        file_path: Path of the PDF to read.
        database_name: Directory used as the Chroma ``persist_directory``.
        collection_name: Name of the Chroma collection to (re)create.
        embeddings: Embedding object passed to Chroma.
        chunk_size: ``chunk_size`` given to the text splitter.
        chunk_overlap: ``chunk_overlap`` given to the text splitter.
        metadata: Optional base metadata; each chunk gets its own copy with
            a ``"chunk"`` key holding the chunk's index.

    Returns:
        ``(docs, vectorstore)`` when the cleaned text is non-blank, or a bare
        empty list when it is blank.

        NOTE(review): the two return shapes differ, so a caller that unpacks
        the result must handle the empty-list case separately.
    """
    raw_text = extract_text_from_pdf(file_path)
    cleaned = re.sub(r"[^a-zA-Z0-9.,!?;:'\"()\s]", "", raw_text)
    if not cleaned.strip():
        return []

    pieces = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_text(cleaned)

    docs = []
    for index, piece in enumerate(pieces):
        # Copy so the caller's metadata dict is never mutated or shared
        # between chunks.
        chunk_meta = {} if not metadata else metadata.copy()
        chunk_meta.update({"chunk": index})
        docs.append(Document(page_content=piece, metadata=chunk_meta))

    # When the persist directory already exists, drop the collection so the
    # store ends up holding only this PDF's chunks.
    if os.path.exists(database_name):
        previous = Chroma(
            persist_directory=database_name,
            embedding_function=embeddings,
            collection_name=collection_name,
        )
        previous.delete_collection()

    vectorstore = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        persist_directory=database_name,
        collection_name=collection_name,
    )
    return docs, vectorstore
def store_data(
    text: str,
    database_name: str,
    collection_name: str,
    embeddings: OpenAIEmbeddings,
):
    """Split *text* into chunks and store them in a fresh Chroma collection.

    The text is split with a chunk size of 1000, no overlap, and the
    separators space, comma and newline (tried in that order). Each chunk
    becomes a ``Document`` with no metadata.

    Args:
        text: Raw text to split and index.
        database_name: Directory used as the Chroma ``persist_directory``.
        collection_name: Name of the Chroma collection to (re)create.
        embeddings: Embedding object passed to Chroma.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
        separators=[" ", ",", "\n"],
    )
    docs = [Document(page_content=piece) for piece in splitter.split_text(text)]

    # Keyword arguments shared by the reset step and the final build.
    location = {
        "persist_directory": database_name,
        "collection_name": collection_name,
    }

    # When the persist directory already exists, drop the collection so the
    # store ends up holding only the chunks of this text.
    if os.path.exists(database_name):
        Chroma(embedding_function=embeddings, **location).delete_collection()

    return Chroma.from_documents(documents=docs, embedding=embeddings, **location)