RagChatbot / main.py
sami606713's picture
Upload 17 files
27a8994 verified
import os
import shutil
from utils.helper import LoadAndExtractData  # Returns (tables, texts, images) extracted from a file path
from summerizer.imageSummerizer import Image_Summerizer
from summerizer.textSummerizer import TextSummerizer
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from vectorStore.vectorStore import add_to_vector_store
def _load_processed_files(log_path):
    """Return the set of file names recorded in the processed-files log.

    The log holds one file name per line. A missing log yields an empty set.
    """
    if not os.path.exists(log_path):
        return set()
    with open(log_path, 'r') as f:
        return set(f.read().splitlines())


def _build_documents(kind, contents, summaries, file_path, file_name):
    """Wrap each extracted element in a ``Document`` tagged with its summary.

    Args:
        kind: Value stored under the ``"type"`` metadata key
            (``"text"``, ``"table"`` or ``"image"``).
        contents: Iterable of page contents, one per extracted element.
        summaries: Position-indexable collection; ``summaries[i]`` is stored
            as the summary of the i-th element.
        file_path: Stored under the ``"source"`` metadata key.
        file_name: Stored under the ``"name"`` metadata key.

    Returns:
        A list with one ``Document`` per element, in input order.

    Raises:
        ValueError: If ``summaries`` has no entry for some element.
    """
    docs = []
    for index, content in enumerate(contents):
        try:
            summary = summaries[index]
        except IndexError as err:
            # A bare "list index out of range" gives no hint about which
            # file or element type lost its summary, so re-raise with context.
            raise ValueError(
                f"{file_name}: no {kind} summary available for element {index}"
            ) from err
        docs.append(
            Document(
                page_content=content,
                metadata={
                    "type": kind,
                    "summary": summary,
                    "source": file_path,
                    "name": file_name,
                },
            )
        )
    return docs


def _ingest_pdf(file_path, file_name):
    """Extract, summarise, chunk and store the contents of one PDF.

    Any exception raised by the extraction, summarisation, splitting or
    vector-store steps propagates to the caller.
    """
    print(f">> Processing: {file_name}")
    tables, texts, images = LoadAndExtractData(file_path)

    print(">> Generating Summaries ")
    text_summary = TextSummerizer(data=texts)
    tables_summary = TextSummerizer(data=tables)
    images_summary = Image_Summerizer(data=images)
    print("Text Sumary: ", text_summary)
    print("Table Summary: ", tables_summary)
    print("Image Susmmary: ", images_summary)
    print(">> Summary Generated")

    print(">> Combine Each and every thing into one document")
    # Text elements are stringified; tables and images are passed through
    # unchanged, exactly as extracted.
    text_docs = _build_documents(
        "text", (str(text) for text in texts), text_summary, file_path, file_name
    )
    table_docs = _build_documents(
        "table", tables, tables_summary, file_path, file_name
    )
    image_docs = _build_documents(
        "image", images, images_summary, file_path, file_name
    )
    docs = text_docs + table_docs + image_docs

    print(">> Splitting Documents")
    # NOTE(review): image and table payloads go through the same character
    # splitter as plain text. If images are long encoded strings this will
    # fragment them into 1000-character pieces -- confirm this is intended.
    document_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,  # Example size, adjust based on your needs
        chunk_overlap=200,  # Example overlap, adjust based on your needs
        length_function=len,
        is_separator_regex=False,
    )
    docs_chunks = document_splitter.split_documents(docs)
    print(">> Splitting Done")

    add_to_vector_store(docs_chunks=docs_chunks)


def _process_file(file_path, file_name, processed_log_path):
    """Ingest one PDF and record it in the processed-files log.

    Returns:
        ``None`` on success, otherwise the error message as a string. Errors
        are reported rather than raised so that one bad file cannot stop the
        remaining files from being processed.
    """
    try:
        _ingest_pdf(file_path, file_name)
        # Only mark the file once everything above has succeeded, so a failed
        # file is retried on the next run.
        with open(processed_log_path, 'a') as f:
            f.write(file_name + '\n')
        print(f">> Marked {file_name} as processed")
    except Exception as e:
        print(f"!! Failed to process {file_name}; continuing with remaining files")
        print("Error is:", str(e))
        return str(e)
    return None


def main():
    """Ingest every not-yet-processed PDF in ``data`` into the vector store.

    File names already listed in ``processed_files.txt`` and files without a
    ``.pdf`` extension (case-insensitive) are skipped. Each successfully
    ingested file is appended to the log.

    A failure while handling one file is printed and does not prevent the
    remaining files from being processed.

    Returns:
        ``None`` when everything succeeded; otherwise the message of the
        first error encountered, as a string.
    """
    try:
        root_dir = "data"
        processed_log_path = "processed_files.txt"

        # Load already processed file names
        processed_files = _load_processed_files(processed_log_path)

        files = os.listdir(root_dir)
        print(">> Files: ", files)
        print(">> Process Files: ", processed_files)
        print(">> Processing Files ")

        first_error = None
        for file in files:
            file_path = os.path.join(root_dir, file)

            if file in processed_files or not file.lower().endswith('.pdf'):
                print(f"!! Skipping already processed or unsupported file: {file}")
                continue

            error = _process_file(file_path, file, processed_log_path)
            if error is not None and first_error is None:
                first_error = error

        return first_error
    except Exception as e:
        # Setup failures (e.g. the data directory is missing) end up here.
        print("Error is:", str(e))
        return str(e)
# Script entry point: run the ingestion pipeline only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    main()