|
|
import logging
import os

import gradio as gr
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
|
|
|
|
|
|
# Make sure the directory that holds the PDFs to index exists before the UI
# is built. exist_ok=True replaces the separate os.path.exists() check, which
# could race with another process creating the directory in between.
os.makedirs("pdfs", exist_ok=True)
|
|
|
|
|
def get_pdf_files():
    """Return the names of the PDF files in the 'pdfs' directory.

    The ``.pdf`` extension is matched case-insensitively, sub-directories are
    skipped, and the names are sorted so the result is deterministic
    (``os.listdir`` returns entries in arbitrary order).

    Returns:
        list[str]: Bare file names (not paths). Empty when the directory is
        missing or contains no PDF files.
    """
    try:
        entries = os.listdir("pdfs")
    except FileNotFoundError:
        # The directory may have been removed while the app is running;
        # report "no files" instead of crashing the caller.
        return []

    return sorted(
        name
        for name in entries
        if name.lower().endswith(".pdf")
        and os.path.isfile(os.path.join("pdfs", name))
    )
|
|
|
|
|
def index_pdfs():
    """Index every PDF in the 'pdfs' directory into the Chroma store at './db'.

    Each file is loaded with ``PyPDFLoader``, split into chunks of at most
    1000 characters with no overlap, embedded with ``OpenAIEmbeddings`` and
    written to the persistent Chroma collection. A failure in one file is
    recorded (and logged with its traceback) without stopping the others.

    Returns:
        str: A human-readable status listing the files that were indexed and
        the files that failed together with the reason, or a message saying
        that no PDF files were found.
    """
    pdf_files = get_pdf_files()
    if not pdf_files:
        return "No PDF files found in the 'pdfs' directory."

    logger = logging.getLogger(__name__)

    # Neither the splitter nor the embedding client depends on the file being
    # processed, so build them once instead of once per PDF.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    try:
        embeddings = OpenAIEmbeddings()
    except Exception as e:
        # Without an embedding client no file can be indexed; report every
        # file as failed in the same format the per-file handler uses.
        logger.exception("Could not create the embedding client")
        failed = [f"{pdf_file} (Error: {e})" for pdf_file in pdf_files]
        return f"Failed to index: {', '.join(failed)}"

    success_files = []
    failed_files = []

    for pdf_file in pdf_files:
        file_path = os.path.join("pdfs", pdf_file)
        try:
            if os.path.getsize(file_path) == 0:
                failed_files.append(f"{pdf_file} (file is empty)")
                continue

            documents = PyPDFLoader(file_path).load()
            texts = text_splitter.split_documents(documents)
            if not texts:
                # E.g. a scanned PDF without a text layer: give a clear reason
                # instead of passing an empty batch to the vector store.
                failed_files.append(f"{pdf_file} (no extractable text)")
                continue

            # Deterministic ids let a repeated "Index PDFs" click overwrite the
            # chunks of this file instead of appending duplicates.
            # NOTE(review): relies on the Chroma wrapper upserting on matching
            # ids; chunks written by earlier runs with random ids, and surplus
            # chunks of a file that has since shrunk, are not removed.
            ids = [f"{pdf_file}:{i}" for i in range(len(texts))]
            db = Chroma.from_documents(
                texts, embeddings, ids=ids, persist_directory="./db"
            )
            db.persist()
            success_files.append(pdf_file)
        except Exception as e:
            # Keep going with the remaining files, but keep the traceback in
            # the log; the status string only carries str(e).
            logger.exception("Failed to index %s", pdf_file)
            failed_files.append(f"{pdf_file} (Error: {e})")

    status_lines = []
    if success_files:
        status_lines.append(f"Successfully indexed: {', '.join(success_files)}")
    if failed_files:
        status_lines.append(f"Failed to index: {', '.join(failed_files)}")

    return "\n".join(status_lines) if status_lines else "No files were processed."
|
|
|
|
|
def search(query):
    """Search the indexed PDFs for chunks semantically similar to *query*.

    Args:
        query: Free-text search string entered in the UI.

    Returns:
        str: One "Source: ... / Content: ..." entry per hit, a prompt when the
        query is blank, or a notice when nothing matched.
    """
    # Do not send a blank query to the embedding API.
    if not query or not query.strip():
        return "Please enter a search query."

    embeddings = OpenAIEmbeddings()
    db = Chroma(persist_directory="./db", embedding_function=embeddings)
    docs = db.similarity_search(query)
    if not docs:
        return "No results found."

    # .get() tolerates documents stored without a 'source' metadata entry.
    return "".join(
        f"Source: {doc.metadata.get('source', 'unknown')}\n"
        f"Content: {doc.page_content}\n\n"
        for doc in docs
    )
|
|
|
|
|
# Two-tab Gradio UI: one tab to index the PDFs on disk, one to query the index.
with gr.Blocks() as demo:
    gr.Markdown("# Simple Semantic Search App")

    with gr.Tab("Index PDFs"):
        pdf_files_display = gr.Textbox(
            label="Available PDF Files",
            interactive=False,
            # Pass a callable rather than a precomputed string so the list is
            # re-read on each page load instead of being frozen at startup.
            value=lambda: "\n".join(get_pdf_files()),
        )
        index_button = gr.Button("Index PDFs")
        index_status = gr.Textbox(label="Indexing Status", interactive=False)
        index_button.click(index_pdfs, inputs=None, outputs=index_status)

    with gr.Tab("Search"):
        search_query = gr.Textbox(label="Search Query")
        search_button = gr.Button("Search")
        search_results = gr.Textbox(label="Search Results", interactive=False)
        search_button.click(search, inputs=search_query, outputs=search_results)
|
|
|
|
|
# Start the Gradio server only when this file is run as a script, so that
# importing the module does not launch the app.
if __name__ == "__main__":
    demo.launch()
|
|
|