File size: 4,097 Bytes
2853482
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import os
import pytesseract
import requests
from bs4 import BeautifulSoup
from PIL import Image
from pdf2image import convert_from_path

from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import Ollama
from langchain.chains import RetrievalQA
from langchain.schema import Document

import gradio as gr

# ========== 1. Load Local Documents with OCR Support ==========

def load_local_documents(paths):
    """Read local .txt, .pdf, and image files into LangChain Documents.

    PDFs are first loaded with PyPDFLoader; if that fails (e.g. a scanned
    PDF with no extractable text layer), each page is rasterized and run
    through Tesseract OCR instead. Images are always OCR'd. Files with
    unsupported extensions are silently skipped.

    Args:
        paths: iterable of file-system paths (str).

    Returns:
        list[Document], each tagged with a "source" metadata key (plus
        "page" for OCR'd PDF pages).
    """
    all_docs = []

    for path in paths:
        lower = path.lower()  # hoist: used by every extension check

        if lower.endswith(".txt"):
            with open(path, 'r', encoding='utf-8') as f:
                text = f.read()
            all_docs.append(Document(page_content=text, metadata={"source": path}))

        elif lower.endswith(".pdf"):
            try:
                from langchain.document_loaders import PyPDFLoader
                all_docs.extend(PyPDFLoader(path).load())
            except Exception:
                # BUG FIX: was a bare `except:`, which also swallows
                # KeyboardInterrupt/SystemExit. Fall back to OCR: rasterize
                # each page and extract its text.
                pages = convert_from_path(path)
                for i, page in enumerate(pages):
                    text = pytesseract.image_to_string(page)
                    all_docs.append(Document(page_content=text, metadata={"page": i, "source": path}))

        elif lower.endswith((".png", ".jpg", ".jpeg")):
            img = Image.open(path)
            text = pytesseract.image_to_string(img)
            all_docs.append(Document(page_content=text, metadata={"source": path}))

    return all_docs

# ========== 2. Crawl a Website and Extract Text ==========

def scrape_website(url):
    """Fetch *url* and return its visible text as a single-element Document list.

    Best-effort: any failure (network error, timeout, non-2xx status) is
    logged and an empty list is returned so one bad URL does not abort a
    batch ingest.

    Args:
        url: the page URL to fetch.

    Returns:
        [Document] on success, [] on any failure.
    """
    try:
        response = requests.get(url, timeout=10)
        # BUG FIX: without this, a 404/500 error page would be indexed as
        # if it were real content. raise_for_status() routes HTTP errors
        # into the existing best-effort handler below.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        text = soup.get_text(separator="\n")
        return [Document(page_content=text, metadata={"source": url})]
    except Exception as e:
        print(f"Failed to scrape {url}: {e}")
        return []

# ========== 3. Chunk, Embed, and Store in FAISS ==========

def build_vector_db(documents, db_path="faiss_index", chunk_size=500,
                    chunk_overlap=50, model_name="all-MiniLM-L6-v2"):
    """Chunk *documents*, embed the chunks, and persist a FAISS index.

    Args:
        documents: list[Document] to index.
        db_path: directory where the FAISS index is saved.
        chunk_size: max characters per chunk (was hard-coded; now tunable).
        chunk_overlap: characters shared between adjacent chunks.
        model_name: sentence-transformers embedding model. Must match the
            model used later by get_rag_chain(), or retrieval quality breaks.

    Returns:
        The in-memory FAISS vector store (also saved to *db_path*).
    """
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_documents(documents)

    embed_model = HuggingFaceEmbeddings(model_name=model_name)
    vectordb = FAISS.from_documents(chunks, embed_model)
    vectordb.save_local(db_path)
    return vectordb

# ========== 4. Set Up RAG Chain with Local LLM (Ollama) ==========

def get_rag_chain(db_path="faiss_index", embed_model_name="all-MiniLM-L6-v2",
                  llm_model="mistral"):
    """Load the persisted FAISS index and wire it into a RetrievalQA chain.

    Args:
        db_path: directory holding the FAISS index saved by build_vector_db().
        embed_model_name: embedding model — must match the one used at
            index-build time (was hard-coded; now tunable).
        llm_model: Ollama model name; make sure it is pulled/running locally.

    Returns:
        A RetrievalQA chain ready to answer {"query": ...} calls.
    """
    embed_model = HuggingFaceEmbeddings(model_name=embed_model_name)
    vectordb = FAISS.load_local(db_path, embed_model)
    # BUG FIX: `k` is not a valid VectorStoreRetriever field — passing
    # as_retriever(k=3) does not limit results to 3; the top-k count must
    # be supplied via search_kwargs.
    retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 3})
    llm = Ollama(model=llm_model)  # make sure this model is running in Ollama
    return RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

# ========== 5. Gradio UI ==========

# Module-level QA chain; populated by load_all_docs() once documents are indexed.
qa_chain = None

def run_query(question):
    """Answer *question* via the RAG chain, or prompt the user to load docs first.

    Args:
        question: free-text question from the UI.

    Returns:
        The chain's answer string, or an instruction message if no index exists.
    """
    # BUG FIX: use an explicit `is None` check rather than truthiness —
    # chain objects may define __bool__/__len__, so a valid chain could
    # otherwise be mistaken for "not loaded".
    if qa_chain is None:
        return "Please load documents first."
    result = qa_chain({"query": question})
    return result["result"]

def load_all_docs(local_files, website_urls):
    """Ingest uploaded files and scraped websites into the FAISS index.

    Rebuilds the vector store from scratch and refreshes the global
    qa_chain so subsequent run_query() calls use the new index.

    Args:
        local_files: list of uploaded file paths from gr.File, or None.
        website_urls: comma-separated URL string, or empty/None.

    Returns:
        A status message string for the UI.
    """
    global qa_chain  # declare up front, per convention

    docs = []

    if local_files:
        docs.extend(load_local_documents(local_files))

    if website_urls:
        for url in website_urls.split(","):
            docs.extend(scrape_website(url.strip()))

    # BUG FIX: FAISS.from_documents raises an opaque error on an empty
    # corpus — surface a clear message in the UI instead of a traceback.
    if not docs:
        return "No documents could be loaded. Check your files/URLs and try again."

    build_vector_db(docs)
    qa_chain = get_rag_chain()
    return f"Indexed {len(docs)} documents. Ready to answer queries!"

# Question-answering tab: a single free-text box wired straight to run_query.
demo = gr.Interface(
    fn=run_query,
    inputs=gr.Textbox(placeholder="Ask your question..."),
    outputs="text",
    title="📚 Local RAG App",
    description="Load local files & websites, then ask questions below.",
)

# Ingestion tab: multi-file upload plus a comma-separated URL field,
# both feeding load_all_docs.
_file_input = gr.File(
    file_types=[".txt", ".pdf", ".jpg", ".png"],
    file_count="multiple",
    label="Upload Files",
)
_url_input = gr.Textbox(
    placeholder="https://example.com, https://another.com",
    label="Website URLs (comma-separated)",
)
load_interface = gr.Interface(
    fn=load_all_docs,
    inputs=[_file_input, _url_input],
    outputs="text",
    title="🗂️ Load Your Documents",
)

# Two-tab app: ingest documents first, then ask questions.
app = gr.TabbedInterface([load_interface, demo], ["Load Docs", "Ask Questions"])

# BUG FIX: guard the launch so importing this module (e.g. from tests or
# another script) does not immediately start the web server.
if __name__ == "__main__":
    app.launch()