| from pypdf import PdfReader |
| from langchain_text_splitters import RecursiveCharacterTextSplitter |
| from langchain_core.documents import Document |
| from langchain_openai import OpenAIEmbeddings |
| from langchain_chroma import Chroma |
| import re |
| import os |
|
|
def extract_text_from_pdf(file_path: str) -> str:
    """Return the text of every page of the PDF at *file_path* as one string.

    Pages are concatenated in document order with no separator between
    them. A page whose ``extract_text()`` result is falsy (``None`` or an
    empty string) contributes nothing to the output.
    """
    pdf = PdfReader(file_path)
    # `or ""` guards against pages that yield no text at all.
    return "".join(page.extract_text() or "" for page in pdf.pages)
|
|
def pdf_to_documents(
    file_path: str,
    database_name: str,
    collection_name: str,
    embeddings: OpenAIEmbeddings,
    chunk_size=1000,
    chunk_overlap=200,
    metadata: dict = None,
):
    """Chunk a PDF's text and rebuild a Chroma collection from the chunks.

    The PDF text is extracted, every character other than ASCII letters,
    digits, whitespace and the punctuation ``. , ! ? ; : ' " ( )`` is
    removed, and the result is split with ``RecursiveCharacterTextSplitter``.
    Each chunk becomes a ``Document`` whose metadata is a copy of *metadata*
    plus a ``"chunk"`` key holding the chunk's zero-based index.

    If the path *database_name* already exists, the collection named
    *collection_name* is deleted before a new one is created from the
    documents.

    Args:
        file_path: Path of the PDF to read.
        database_name: Chroma persist directory.
        collection_name: Name of the collection to (re)create.
        embeddings: Embedding function handed to Chroma.
        chunk_size: Forwarded to the text splitter.
        chunk_overlap: Forwarded to the text splitter.
        metadata: Optional base metadata shared by every chunk.

    Returns:
        ``(docs, vectorstore)`` when the PDF yields text.

        NOTE: when the cleaned text is empty or whitespace-only, a bare
        empty list is returned instead of a 2-tuple, and the existing
        collection is left untouched. Callers that unpack the result must
        handle that case.
    """
    raw_text = extract_text_from_pdf(file_path)
    cleaned = re.sub(r"[^a-zA-Z0-9.,!?;:'\"()\s]", "", raw_text)
    if not cleaned.strip():
        return []

    pieces = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_text(cleaned)

    # A fresh dict per chunk so documents never share a metadata object;
    # "chunk" is written last so it wins over a same-named key in *metadata*.
    docs = [
        Document(page_content=piece, metadata={**(metadata or {}), "chunk": index})
        for index, piece in enumerate(pieces)
    ]

    # Drop the previous collection so the store reflects only this PDF.
    if os.path.exists(database_name):
        stale_store = Chroma(
            persist_directory=database_name,
            embedding_function=embeddings,
            collection_name=collection_name,
        )
        stale_store.delete_collection()

    vectorstore = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        persist_directory=database_name,
        collection_name=collection_name,
    )
    return docs, vectorstore
|
|
|
|
|
|
def store_data(
    text: str,
    database_name: str,
    collection_name: str,
    embeddings: OpenAIEmbeddings,
):
    """Split *text* into chunks and rebuild a Chroma collection from them.

    The text is split with a ``RecursiveCharacterTextSplitter`` configured
    for 1000-character chunks with no overlap, and each chunk is wrapped in
    a ``Document`` without metadata. If the path *database_name* already
    exists, the collection named *collection_name* is deleted first; a new
    collection is then created from the documents.

    Args:
        text: Raw text to index.
        database_name: Chroma persist directory.
        collection_name: Name of the collection to (re)create.
        embeddings: Embedding function handed to Chroma.

    Returns:
        The newly created Chroma vector store.
    """
    # NOTE(review): separators are listed as space, comma, newline —
    # confirm this priority order is intended.
    pieces = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
        separators=[" ", ",", "\n"],
    ).split_text(text)

    docs = [Document(page_content=piece) for piece in pieces]

    # Drop the previous collection so the store reflects only this text.
    if os.path.exists(database_name):
        stale_store = Chroma(
            persist_directory=database_name,
            embedding_function=embeddings,
            collection_name=collection_name,
        )
        stale_store.delete_collection()

    return Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        persist_directory=database_name,
        collection_name=collection_name,
    )
|
|
|
|
|
|