File size: 7,257 Bytes
6792445
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# import os
# from pathlib import Path
# import cv2
# import pytesseract
# from PIL import Image
# from docx import Document
# from pptx import Presentation
# from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_huggingface import HuggingFaceEmbeddings
# from langchain_community.vectorstores import FAISS
# from langchain.schema import Document as LangchainDocument  # βœ… Ensure correct Document format
# from dotenv import load_dotenv, find_dotenv

# # Load environment variables
# load_dotenv(find_dotenv())

# # Paths
# DATA_PATH = "data/"
# DB_FAISS_PATH = "vectorstore/db_faiss"

# # Set Tesseract OCR Path (update this based on your installation)
# pytesseract.pytesseract.tesseract_cmd = r"C:\\Users\\Rupesh Shinde\\Tesseract\\tesseract.exe"

# # Step 1: Load Documents from Multiple Sources
# def load_documents(data_path):
#     documents = []

#     # Load PDFs
#     pdf_loader = DirectoryLoader(data_path, glob="*.pdf", loader_cls=PyPDFLoader)
#     documents.extend(pdf_loader.load())  # PDFs are already in Document format

#     # Load Word files
#     for file in Path(data_path).glob("*.docx"):
#         doc = Document(file)
#         text = "\n".join([para.text for para in doc.paragraphs])
#         documents.append(LangchainDocument(page_content=text, metadata={"source": file.name}))

#     # Load PowerPoint files
#     for file in Path(data_path).glob("*.pptx"):
#         prs = Presentation(file)
#         text = ""
#         for slide in prs.slides:
#             for shape in slide.shapes:
#                 if hasattr(shape, "text"):
#                     text += shape.text + "\n"
#         documents.append(LangchainDocument(page_content=text, metadata={"source": file.name}))

#     # Load Images (OCR)
#     for image_file in Path(data_path).glob("*.jpg"):
#         img = cv2.imread(str(image_file))
#         text = pytesseract.image_to_string(img)
#         documents.append(LangchainDocument(page_content=text, metadata={"source": image_file.name}))

#     for image_file in Path(data_path).glob("*.png"):
#         img = cv2.imread(str(image_file))
#         text = pytesseract.image_to_string(img)
#         documents.append(LangchainDocument(page_content=text, metadata={"source": image_file.name}))

#     print(f"βœ… Loaded {len(documents)} documents from {data_path}")
#     return documents

# # Step 2: Create Chunks
# def create_chunks(documents):
#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
#     text_chunks = text_splitter.split_documents(documents)
#     print(f"βœ… Created {len(text_chunks)} text chunks")
#     return text_chunks

# # Step 3: Create Vector Embeddings
# def get_embedding_model():
#     return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# # Step 4: Store embeddings in FAISS
# def create_vector_store(text_chunks):
#     embedding_model = get_embedding_model()
#     print("πŸ”„ Creating vector store...")
#     db = FAISS.from_documents(text_chunks, embedding_model)
#     db.save_local(DB_FAISS_PATH)
#     print("βœ… Vector store created/updated successfully.")

# # Step 5: Main Execution
# if __name__ == "__main__":
#     print("πŸš€ Starting process...")
#     documents = load_documents(DATA_PATH)
#     text_chunks = create_chunks(documents)
#     create_vector_store(text_chunks)
#     print("πŸŽ‰ Process completed successfully!")


import os
from pathlib import Path
import cv2
import pytesseract
from PIL import Image
from docx import Document
from pptx import Presentation
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document as LangchainDocument
from dotenv import load_dotenv, find_dotenv

# Load environment variables (e.g. API keys) from a .env file, if present.
load_dotenv(find_dotenv())

# Paths
DATA_PATH = "data/"                     # directory scanned for input documents
DB_FAISS_PATH = "vectorstore/db_faiss"  # where the FAISS index is persisted

# Set Tesseract OCR Path (update this based on your installation).
# Fix: the original combined a raw string with doubled backslashes
# (r"C:\\Users\\...") which yields *literal* double separators in the path;
# a raw string needs single backslashes.
pytesseract.pytesseract.tesseract_cmd = r"C:\Users\Rupesh Shinde\Tesseract\tesseract.exe"

# Function to extract text from images
def extract_text_from_image(image_path):
    """Run Tesseract OCR on the image at *image_path*.

    Returns the stripped recognized text, or "" (after printing a warning)
    when the file cannot be decoded as an image.
    """
    image = cv2.imread(str(image_path))
    if image is None:
        print(f"⚠️ Warning: Unable to read image {image_path}")
        return ""
    return pytesseract.image_to_string(image).strip()

# Step 1: Load Documents from Multiple Sources
def load_documents(data_path):
    """Collect text from PDF, Word, PowerPoint and image files under *data_path*.

    Returns a list of LangChain ``Document`` objects: one per PDF page,
    one per .docx file, one per non-empty .pptx slide, and one per image
    (jpg/png, searched recursively) that yields OCR text.
    """
    documents = []

    # Load PDFs — PyPDFLoader already emits LangChain Documents (one per page).
    pdf_loader = DirectoryLoader(data_path, glob="*.pdf", loader_cls=PyPDFLoader)
    documents.extend(pdf_loader.load())

    # Load Word files: join all paragraph text per file.
    for file in Path(data_path).glob("*.docx"):
        doc = Document(file)
        text = "\n".join(para.text for para in doc.paragraphs)
        documents.append(LangchainDocument(page_content=text, metadata={"source": file.name}))

    # Load PowerPoint files: one Document per non-empty slide (1-based index).
    for file in Path(data_path).glob("*.pptx"):
        prs = Presentation(file)
        for i, slide in enumerate(prs.slides):
            text = "\n".join(shape.text for shape in slide.shapes if hasattr(shape, "text"))
            if text.strip():
                documents.append(LangchainDocument(page_content=text, metadata={"source": file.name, "slide": i + 1}))

    # Load Images (OCR). The previously copy-pasted jpg/png loops are merged;
    # iteration order is preserved (all *.jpg first, then all *.png, recursive).
    for pattern in ("*.jpg", "*.png"):
        for image_file in Path(data_path).rglob(pattern):
            text = extract_text_from_image(image_file)
            if text:  # skip images with no recognizable text
                documents.append(LangchainDocument(page_content=text, metadata={"source": image_file.name}))

    print(f"βœ… Loaded {len(documents)} documents from {data_path}")
    return documents

# Step 2: Create Chunks
def create_chunks(documents):
    """Split *documents* into ~500-character chunks with 50-character overlap."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_documents(documents)
    print(f"βœ… Created {len(chunks)} text chunks")
    return chunks

# Step 3: Create Vector Embeddings
def get_embedding_model(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Return a HuggingFace sentence-embedding model.

    *model_name* was previously hard-coded; it is now a parameter whose
    default preserves the original behavior, so existing callers are
    unaffected while other models can be plugged in.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

# Step 4: Store embeddings in FAISS
def create_vector_store(text_chunks):
    """Embed *text_chunks* and persist them as a local FAISS index at DB_FAISS_PATH."""
    embedder = get_embedding_model()
    print("πŸ”„ Creating vector store...")
    FAISS.from_documents(text_chunks, embedder).save_local(DB_FAISS_PATH)
    print("βœ… Vector store created/updated successfully.")

# Step 5: Main Execution
def main():
    """Run the full ingestion pipeline: load documents, chunk, embed, store."""
    print("πŸš€ Starting process...")
    documents = load_documents(DATA_PATH)
    text_chunks = create_chunks(documents)
    create_vector_store(text_chunks)
    print("πŸŽ‰ Process completed successfully!")


if __name__ == "__main__":
    # Guarded entry point so importing this module has no side effects.
    main()