Dewasheesh committed on
Commit
0879db7
·
verified ·
1 Parent(s): 3cf1026

Update app/vector_store.py

Browse files
Files changed (1) hide show
  1. app/vector_store.py +29 -0
app/vector_store.py CHANGED
@@ -1,10 +1,18 @@
 
 
 
1
  import os
2
  from langchain.vectorstores import Chroma
3
  from langchain.embeddings import SentenceTransformerEmbeddings
4
  from langchain.document_loaders import PyMuPDFLoader
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 
 
 
6
 
7
  CHROMA_DIR = "data/chroma_db"
 
8
 
9
 
10
  def store_pdf(pdf_path):
@@ -15,3 +23,24 @@ def store_pdf(pdf_path):
15
  chunks = splitter.split_documents(docs)
16
  embeddings = SentenceTransformerEmbeddings(model_name='thenlper/gte-large')
17
  Chroma.from_documents(chunks, embeddings, persist_directory=CHROMA_DIR)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image
2
+ import pytesseract
3
+ from pdf2image import convert_from_path
4
  import os
5
  from langchain.vectorstores import Chroma
6
  from langchain.embeddings import SentenceTransformerEmbeddings
7
  from langchain.document_loaders import PyMuPDFLoader
8
  from langchain.text_splitter import RecursiveCharacterTextSplitter
9
+ from langchain.docstore.document import Document
10
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
11
+ from langchain.vectorstores import Chroma
12
+ from langchain.embeddings import HuggingFaceEmbeddings
13
 
14
  CHROMA_DIR = "data/chroma_db"
15
+ CHROMA_IMG_DIR = "data/image_db"
16
 
17
 
18
  def store_pdf(pdf_path):
 
23
  chunks = splitter.split_documents(docs)
24
  embeddings = SentenceTransformerEmbeddings(model_name='thenlper/gte-large')
25
  Chroma.from_documents(chunks, embeddings, persist_directory=CHROMA_DIR)
26
+
27
+
28
def store_pdf_image(pdf_path):
    """OCR a scanned PDF and index its text in the image Chroma store.

    The OCR text from ``extract_text_from_scanned_pdf`` is wrapped in a single
    ``Document``, split into chunks of 500 characters with an overlap of 50,
    embedded with the ``thenlper/gte-large`` model, and handed to
    ``Chroma.from_documents`` with ``CHROMA_IMG_DIR`` as the persist directory.

    Args:
        pdf_path: Path of the scanned PDF; forwarded unchanged to
            ``extract_text_from_scanned_pdf``.

    Returns:
        None. The function is called for its effect on the vector store.
    """
    ocr_document = Document(
        page_content=extract_text_from_scanned_pdf(pdf_path),
    )
    pieces = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
    ).split_documents([ocr_document])
    embedder = SentenceTransformerEmbeddings(model_name='thenlper/gte-large')
    Chroma.from_documents(
        pieces,
        embedder,
        persist_directory=CHROMA_IMG_DIR,
    )
35
+
36
+
37
+ # Example: when Poppler is not on PATH, pass its location explicitly, e.g. images = convert_from_path("your_file.pdf", poppler_path="/opt/homebrew/bin")
38
+
39
+
40
def extract_text_from_scanned_pdf(pdf_path):
    """Return the OCR text of every page of a scanned PDF as one string.

    The PDF is converted to page images with ``convert_from_path`` at 300 DPI,
    and each image is passed to ``pytesseract.image_to_string`` with the
    English language setting. Every page's text is preceded by a
    ``--- Page N ---`` marker line (numbered from 1) so page boundaries stay
    visible in the combined result.

    Args:
        pdf_path: Path of the PDF to read; handed to ``convert_from_path``.

    Returns:
        The concatenated per-page text, or an empty string when
        ``convert_from_path`` yields no pages.
    """
    pages = convert_from_path(pdf_path, dpi=300)
    # Gather the per-page sections and join once, rather than rebuilding the
    # whole string with ``+=`` on every page.
    sections = []
    for number, page in enumerate(pages, start=1):
        page_text = pytesseract.image_to_string(page, lang="eng")
        sections.append(f"\n--- Page {number} ---\n{page_text}")
    return "".join(sections)