Spaces:
Sleeping
Sleeping
| from PIL import Image | |
| import pytesseract | |
| from pdf2image import convert_from_path | |
| import os | |
| from langchain.vectorstores import Chroma | |
| from langchain.embeddings import SentenceTransformerEmbeddings | |
| from langchain.document_loaders import PyMuPDFLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.docstore.document import Document | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.vectorstores import Chroma | |
| from langchain.embeddings import HuggingFaceEmbeddings | |
| CHROMA_DIR = "data/chroma_db" | |
| CHROMA_IMG_DIR = "data/image_db" | |
def store_pdf(pdf_path):
    """Index a PDF's text into the Chroma store under ``CHROMA_DIR``.

    The file is read with ``PyMuPDFLoader``, split by a
    ``RecursiveCharacterTextSplitter`` (``chunk_size=500``,
    ``chunk_overlap=100``), embedded with the ``thenlper/gte-large``
    model and handed to ``Chroma.from_documents``.

    Args:
        pdf_path: Path of the PDF file to load.

    Returns:
        None. The vector store returned by Chroma is discarded.
    """
    pages = PyMuPDFLoader(pdf_path).load()

    chunker = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    pieces = chunker.split_documents(pages)

    embedder = SentenceTransformerEmbeddings(model_name='thenlper/gte-large')
    Chroma.from_documents(pieces, embedder, persist_directory=CHROMA_DIR)
def store_pdf_image(pdf_path):
    """OCR a scanned PDF and index the recognised text under ``CHROMA_IMG_DIR``.

    The text is obtained from ``extract_text_from_scanned_pdf`` and then
    passed to ``store_pdf_image_text``, which performs the same chunking,
    embedding and persistence steps this function used to repeat inline.
    Delegating keeps the two entry points from drifting apart.

    Args:
        pdf_path: Path of the scanned (image-based) PDF file.

    Returns:
        None.
    """
    ocr_text = extract_text_from_scanned_pdf(pdf_path)
    store_pdf_image_text(ocr_text)
def store_pdf_image_text(text):
    """Index already-extracted text into the Chroma store under ``CHROMA_IMG_DIR``.

    The text is wrapped in a single ``Document``, split by a
    ``RecursiveCharacterTextSplitter`` (``chunk_size=500``,
    ``chunk_overlap=50``), embedded with the ``thenlper/gte-large`` model
    and handed to ``Chroma.from_documents``.

    Args:
        text: The text to index.

    Returns:
        None. The vector store returned by Chroma is discarded.
    """
    chunker = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    pieces = chunker.split_documents([Document(page_content=text)])

    embedder = SentenceTransformerEmbeddings(model_name='thenlper/gte-large')
    Chroma.from_documents(pieces, embedder, persist_directory=CHROMA_IMG_DIR)
| # images = convert_from_path("your_file.pdf", poppler_path="/opt/homebrew/bin") | |
def extract_text_from_scanned_pdf(pdf_path):
    """Run OCR over every page of a PDF and return the combined text.

    The PDF is rendered with ``convert_from_path`` at 300 DPI and each
    rendered page is passed to ``pytesseract.image_to_string`` with
    ``lang="eng"``. Each page's text is preceded by a
    ``--- Page N ---`` marker line, with N starting at 1.

    Args:
        pdf_path: Path of the PDF file to rasterise and OCR.

    Returns:
        A single string containing every page's marker and text in page
        order; an empty string if no pages were produced.
    """
    page_images = convert_from_path(pdf_path, dpi=300)

    # Collect the per-page sections and join once instead of growing a
    # string with ``+=`` on every iteration.
    sections = []
    for page_number, image in enumerate(page_images, start=1):
        page_text = pytesseract.image_to_string(image, lang="eng")
        sections.append(f"\n--- Page {page_number} ---\n{page_text}")
    return "".join(sections)