File size: 7,257 Bytes
6792445 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 |
# import os
# from pathlib import Path
# import cv2
# import pytesseract
# from PIL import Image
# from docx import Document
# from pptx import Presentation
# from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_huggingface import HuggingFaceEmbeddings
# from langchain_community.vectorstores import FAISS
# from langchain.schema import Document as LangchainDocument  # Ensure correct Document format
# from dotenv import load_dotenv, find_dotenv
# # Load environment variables
# load_dotenv(find_dotenv())
# # Paths
# DATA_PATH = "data/"
# DB_FAISS_PATH = "vectorstore/db_faiss"
# # Set Tesseract OCR Path (update this based on your installation)
# pytesseract.pytesseract.tesseract_cmd = r"C:\\Users\\Rupesh Shinde\\Tesseract\\tesseract.exe"
# # Step 1: Load Documents from Multiple Sources
# def load_documents(data_path):
# documents = []
# # Load PDFs
# pdf_loader = DirectoryLoader(data_path, glob="*.pdf", loader_cls=PyPDFLoader)
# documents.extend(pdf_loader.load()) # PDFs are already in Document format
# # Load Word files
# for file in Path(data_path).glob("*.docx"):
# doc = Document(file)
# text = "\n".join([para.text for para in doc.paragraphs])
# documents.append(LangchainDocument(page_content=text, metadata={"source": file.name}))
# # Load PowerPoint files
# for file in Path(data_path).glob("*.pptx"):
# prs = Presentation(file)
# text = ""
# for slide in prs.slides:
# for shape in slide.shapes:
# if hasattr(shape, "text"):
# text += shape.text + "\n"
# documents.append(LangchainDocument(page_content=text, metadata={"source": file.name}))
# # Load Images (OCR)
# for image_file in Path(data_path).glob("*.jpg"):
# img = cv2.imread(str(image_file))
# text = pytesseract.image_to_string(img)
# documents.append(LangchainDocument(page_content=text, metadata={"source": image_file.name}))
# for image_file in Path(data_path).glob("*.png"):
# img = cv2.imread(str(image_file))
# text = pytesseract.image_to_string(img)
# documents.append(LangchainDocument(page_content=text, metadata={"source": image_file.name}))
# print(f"β
Loaded {len(documents)} documents from {data_path}")
# return documents
# # Step 2: Create Chunks
# def create_chunks(documents):
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
# text_chunks = text_splitter.split_documents(documents)
# print(f"β
Created {len(text_chunks)} text chunks")
# return text_chunks
# # Step 3: Create Vector Embeddings
# def get_embedding_model():
# return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# # Step 4: Store embeddings in FAISS
# def create_vector_store(text_chunks):
# embedding_model = get_embedding_model()
# print("π Creating vector store...")
# db = FAISS.from_documents(text_chunks, embedding_model)
# db.save_local(DB_FAISS_PATH)
# print("β
Vector store created/updated successfully.")
# # Step 5: Main Execution
# if __name__ == "__main__":
# print("π Starting process...")
# documents = load_documents(DATA_PATH)
# text_chunks = create_chunks(documents)
# create_vector_store(text_chunks)
# print("π Process completed successfully!")
import os
from pathlib import Path
import cv2
import pytesseract
from PIL import Image
from docx import Document
from pptx import Presentation
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document as LangchainDocument
from dotenv import load_dotenv, find_dotenv
# Load environment variables from a .env file, if one exists
load_dotenv(find_dotenv())

# Paths
DATA_PATH = "data/"                     # directory scanned for source documents
DB_FAISS_PATH = "vectorstore/db_faiss"  # where the FAISS index is persisted

# Set Tesseract OCR path (update this based on your installation).
# Overridable via the TESSERACT_CMD environment variable (picked up from .env
# by load_dotenv above). NOTE: the original used r"C:\\Users\\..." — a raw
# string with doubled backslashes, which embeds literal "\\" separators in
# the path; a raw string with single backslashes is the correct form.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD",
    r"C:\Users\Rupesh Shinde\Tesseract\tesseract.exe",
)
# Function to extract text from images
def extract_text_from_image(image_path):
    """Run Tesseract OCR on an image file and return the extracted text.

    Args:
        image_path: Path or str pointing at the image on disk.

    Returns:
        The OCR'd text with surrounding whitespace stripped, or an empty
        string when the file cannot be read as an image.
    """
    img = cv2.imread(str(image_path))
    if img is None:
        # cv2.imread returns None (it does not raise) on unreadable files
        print(f"Warning: Unable to read image {image_path}")
        return ""
    text = pytesseract.image_to_string(img)
    return text.strip()
# Step 1: Load Documents from Multiple Sources
def load_documents(data_path):
    """Collect LangChain Documents from PDFs, Word, PowerPoint and images.

    Args:
        data_path: Directory to scan. PDF/DOCX/PPTX are matched at the top
            level; images (*.jpg, *.png) are matched recursively, mirroring
            the original glob/rglob behavior.

    Returns:
        list[LangchainDocument]: one per PDF page, one per .docx file,
        one per non-empty .pptx slide, and one per image with OCR text.
    """
    documents = []

    # PDFs: PyPDFLoader already yields LangChain Documents (one per page)
    pdf_loader = DirectoryLoader(data_path, glob="*.pdf", loader_cls=PyPDFLoader)
    documents.extend(pdf_loader.load())

    # Word files: concatenate paragraph text per document
    for file in Path(data_path).glob("*.docx"):
        doc = Document(file)
        text = "\n".join(para.text for para in doc.paragraphs)
        documents.append(LangchainDocument(page_content=text, metadata={"source": file.name}))

    # PowerPoint files: one Document per non-empty slide, slide number in metadata
    for file in Path(data_path).glob("*.pptx"):
        prs = Presentation(file)
        for i, slide in enumerate(prs.slides):
            text = "\n".join(shape.text for shape in slide.shapes if hasattr(shape, "text"))
            if text.strip():
                documents.append(LangchainDocument(page_content=text, metadata={"source": file.name, "slide": i + 1}))

    # Images (OCR): one loop over both extensions instead of two copy-pasted
    # loops; jpg files are still processed before png files, as before
    for pattern in ("*.jpg", "*.png"):
        for image_file in Path(data_path).rglob(pattern):
            text = extract_text_from_image(image_file)
            if text:
                documents.append(LangchainDocument(page_content=text, metadata={"source": image_file.name}))

    print(f"Loaded {len(documents)} documents from {data_path}")
    return documents
# Step 2: Create Chunks
def create_chunks(documents):
    """Split documents into overlapping text chunks for embedding.

    Args:
        documents: LangChain Documents, e.g. from load_documents().

    Returns:
        list of chunk Documents (500 chars, 50-char overlap), metadata
        carried over from the source documents by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    text_chunks = text_splitter.split_documents(documents)
    print(f"Created {len(text_chunks)} text chunks")
    return text_chunks
# Step 3: Create Vector Embeddings
def get_embedding_model():
    """Build the HuggingFace sentence-transformer embedding model used for indexing."""
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_name)
# Step 4: Store embeddings in FAISS
def create_vector_store(text_chunks):
    """Embed the text chunks and persist them as a local FAISS index.

    Args:
        text_chunks: chunk Documents, e.g. from create_chunks().

    Side effects:
        Writes (or overwrites) the FAISS index files under DB_FAISS_PATH.
    """
    embedding_model = get_embedding_model()
    print("Creating vector store...")
    db = FAISS.from_documents(text_chunks, embedding_model)
    db.save_local(DB_FAISS_PATH)
    print("Vector store created/updated successfully.")
# Step 5: Main Execution
# Pipeline: load documents -> split into chunks -> embed and persist in FAISS.
if __name__ == "__main__":
    print("Starting process...")
    documents = load_documents(DATA_PATH)
    text_chunks = create_chunks(documents)
    create_vector_store(text_chunks)
    print("Process completed successfully!")
|