File size: 4,097 Bytes
2853482
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import os
import pytesseract
import requests
from bs4 import BeautifulSoup
from PIL import Image
from pdf2image import convert_from_path

from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import Ollama
from langchain.chains import RetrievalQA
from langchain.schema import Document

import gradio as gr

# ========== 1. Load Local Documents with OCR Support ==========

def load_local_documents(paths):
    """Read local .txt, .pdf, and image files into LangChain Documents.

    PDFs are first loaded with PyPDFLoader; if that fails (e.g. a scanned
    PDF with no extractable text layer), each page is rasterized and run
    through Tesseract OCR instead. Images are always OCR'd. Files with
    unsupported extensions are silently skipped.

    Args:
        paths: iterable of file-system paths (str).

    Returns:
        list[Document], each tagged with a "source" metadata key (plus
        "page" for OCR'd PDF pages).
    """
    all_docs = []

    for path in paths:
        lower = path.lower()  # hoist: used by every extension check

        if lower.endswith(".txt"):
            with open(path, 'r', encoding='utf-8') as f:
                text = f.read()
            all_docs.append(Document(page_content=text, metadata={"source": path}))

        elif lower.endswith(".pdf"):
            try:
                from langchain.document_loaders import PyPDFLoader
                all_docs.extend(PyPDFLoader(path).load())
            except Exception:
                # BUG FIX: was a bare `except:`, which also swallows
                # KeyboardInterrupt/SystemExit. Fall back to OCR: rasterize
                # each page and extract its text.
                pages = convert_from_path(path)
                for i, page in enumerate(pages):
                    text = pytesseract.image_to_string(page)
                    all_docs.append(Document(page_content=text, metadata={"page": i, "source": path}))

        elif lower.endswith((".png", ".jpg", ".jpeg")):
            img = Image.open(path)
            text = pytesseract.image_to_string(img)
            all_docs.append(Document(page_content=text, metadata={"source": path}))

    return all_docs

# ========== 2. Crawl a Website and Extract Text ==========

def scrape_website(url):
    """Fetch *url* and return its visible text as a single-element Document list.

    Best-effort: any failure (network error, timeout, non-2xx status) is
    logged and an empty list is returned so one bad URL does not abort a
    batch ingest.

    Args:
        url: the page URL to fetch.

    Returns:
        [Document] on success, [] on any failure.
    """
    try:
        response = requests.get(url, timeout=10)
        # BUG FIX: without this, a 404/500 error page would be indexed as
        # if it were real content. raise_for_status() routes HTTP errors
        # into the existing best-effort handler below.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        text = soup.get_text(separator="\n")
        return [Document(page_content=text, metadata={"source": url})]
    except Exception as e:
        print(f"Failed to scrape {url}: {e}")
        return []

# ========== 3. Chunk, Embed, and Store in FAISS ==========

def build_vector_db(documents, db_path="faiss_index", chunk_size=500,
                    chunk_overlap=50, model_name="all-MiniLM-L6-v2"):
    """Chunk *documents*, embed the chunks, and persist a FAISS index.

    Args:
        documents: list[Document] to index.
        db_path: directory where the FAISS index is saved.
        chunk_size: max characters per chunk (was hard-coded; now tunable).
        chunk_overlap: characters shared between adjacent chunks.
        model_name: sentence-transformers embedding model. Must match the
            model used later by get_rag_chain(), or retrieval quality breaks.

    Returns:
        The in-memory FAISS vector store (also saved to *db_path*).
    """
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_documents(documents)

    embed_model = HuggingFaceEmbeddings(model_name=model_name)
    vectordb = FAISS.from_documents(chunks, embed_model)
    vectordb.save_local(db_path)
    return vectordb

# ========== 4. Set Up RAG Chain with Local LLM (Ollama) ==========

def get_rag_chain(db_path="faiss_index", embed_model_name="all-MiniLM-L6-v2",
                  llm_model="mistral"):
    """Load the persisted FAISS index and wire it into a RetrievalQA chain.

    Args:
        db_path: directory holding the FAISS index saved by build_vector_db().
        embed_model_name: embedding model — must match the one used at
            index-build time (was hard-coded; now tunable).
        llm_model: Ollama model name; make sure it is pulled/running locally.

    Returns:
        A RetrievalQA chain ready to answer {"query": ...} calls.
    """
    embed_model = HuggingFaceEmbeddings(model_name=embed_model_name)
    vectordb = FAISS.load_local(db_path, embed_model)
    # BUG FIX: `k` is not a valid VectorStoreRetriever field — passing
    # as_retriever(k=3) does not limit results to 3; the top-k count must
    # be supplied via search_kwargs.
    retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 3})
    llm = Ollama(model=llm_model)  # make sure this model is running in Ollama
    return RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

# ========== 5. Gradio UI ==========

# Module-level QA chain; populated by load_all_docs() once documents are indexed.
qa_chain = None

def run_query(question):
    """Answer *question* via the RAG chain, or prompt the user to load docs first.

    Args:
        question: free-text question from the UI.

    Returns:
        The chain's answer string, or an instruction message if no index exists.
    """
    # BUG FIX: use an explicit `is None` check rather than truthiness —
    # chain objects may define __bool__/__len__, so a valid chain could
    # otherwise be mistaken for "not loaded".
    if qa_chain is None:
        return "Please load documents first."
    result = qa_chain({"query": question})
    return result["result"]

def load_all_docs(local_files, website_urls):
    """Ingest uploaded files and scraped websites into the FAISS index.

    Rebuilds the vector store from scratch and refreshes the global
    qa_chain so subsequent run_query() calls use the new index.

    Args:
        local_files: list of uploaded file paths from gr.File, or None.
        website_urls: comma-separated URL string, or empty/None.

    Returns:
        A status message string for the UI.
    """
    global qa_chain  # declare up front, per convention

    docs = []

    if local_files:
        docs.extend(load_local_documents(local_files))

    if website_urls:
        for url in website_urls.split(","):
            docs.extend(scrape_website(url.strip()))

    # BUG FIX: FAISS.from_documents raises an opaque error on an empty
    # corpus — surface a clear message in the UI instead of a traceback.
    if not docs:
        return "No documents could be loaded. Check your files/URLs and try again."

    build_vector_db(docs)
    qa_chain = get_rag_chain()
    return f"Indexed {len(docs)} documents. Ready to answer queries!"

# Question-answering tab: a single free-text box wired straight to run_query.
demo = gr.Interface(
    fn=run_query,
    inputs=gr.Textbox(placeholder="Ask your question..."),
    outputs="text",
    title="📚 Local RAG App",
    description="Load local files & websites, then ask questions below.",
)

# Ingestion tab: multi-file upload plus a comma-separated URL field,
# both feeding load_all_docs.
_file_input = gr.File(
    file_types=[".txt", ".pdf", ".jpg", ".png"],
    file_count="multiple",
    label="Upload Files",
)
_url_input = gr.Textbox(
    placeholder="https://example.com, https://another.com",
    label="Website URLs (comma-separated)",
)
load_interface = gr.Interface(
    fn=load_all_docs,
    inputs=[_file_input, _url_input],
    outputs="text",
    title="🗂️ Load Your Documents",
)

# Two-tab app: ingest documents first, then ask questions.
app = gr.TabbedInterface([load_interface, demo], ["Load Docs", "Ask Questions"])

# BUG FIX: guard the launch so importing this module (e.g. from tests or
# another script) does not immediately start the web server.
if __name__ == "__main__":
    app.launch()