File size: 2,982 Bytes
8e456d3
 
 
 
 
 
70d0eba
 
 
8e456d3
 
 
70d0eba
8e456d3
 
70d0eba
8e456d3
 
70d0eba
8e456d3
 
 
 
 
70d0eba
8e456d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70d0eba
8e456d3
70d0eba
8e456d3
 
 
 
70d0eba
 
 
 
8e456d3
70d0eba
8e456d3
 
 
70d0eba
 
8e456d3
 
70d0eba
8e456d3
70d0eba
8e456d3
 
70d0eba
8e456d3
70d0eba
8e456d3
70d0eba
8e456d3
 
70d0eba
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import streamlit as st
import os
import tempfile
import fitz  # PyMuPDF for PDFs
import docx
import openpyxl
import faiss

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.llms import Groq
from langchain.chains import RetrievalQA

# Load LLM (API key from Hugging Face secrets)
# NOTE(review): `Groq` is imported from `langchain.llms` above — confirm the
# installed langchain version actually exposes it (Groq support commonly lives
# in the separate `langchain_groq` package as `ChatGroq`).
llm = Groq(
    model="llama3-8b-8192",  # Groq-hosted LLaMA3 8B (8192-token context)
    api_key=os.getenv("GROQ_API_KEY")  # None if the env var is unset — Groq() presumably fails later; verify
)

# Embeddings model
# MiniLM sentence-transformer: small/fast embedding model used to index chunks for retrieval.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# File readers
def read_pdf(file_path):
    """Extract plain text from every page of a PDF.

    Args:
        file_path: Path to a PDF file readable by PyMuPDF.

    Returns:
        The concatenated text of all pages, in page order.
    """
    # Fixes vs. original: the Document handle is now closed via the context
    # manager (it was left open), and ''.join replaces quadratic `text +=`.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)

def read_docx(file_path):
    """Return the text of a .docx file, one paragraph per line.

    Args:
        file_path: Path to a Word document readable by python-docx.

    Returns:
        All paragraph texts joined with newlines.
    """
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

def read_excel(file_path):
    """Extract cell contents from every sheet of an .xlsx workbook.

    Args:
        file_path: Path to an Excel workbook readable by openpyxl.

    Returns:
        One line per row: non-empty cells stringified and space-joined,
        each row terminated by a newline (empty workbook yields "").
    """
    # data_only=True reads cached formula results instead of formula strings.
    wb = openpyxl.load_workbook(file_path, data_only=True)
    lines = []
    for sheet_name in wb.sheetnames:
        ws = wb[sheet_name]
        for row in ws.iter_rows(values_only=True):
            lines.append(" ".join(str(cell) for cell in row if cell is not None))
    # ''.join of per-row strings replaces the original quadratic `text +=`
    # while preserving the exact output shape (trailing newline per row).
    return "".join(line + "\n" for line in lines)

def process_file(uploaded_file):
    """Persist an uploaded file to disk and extract its text.

    Args:
        uploaded_file: File-like upload handle exposing ``.name`` and
            ``.read()`` (e.g. a Streamlit UploadedFile).

    Returns:
        Extracted text for pdf/docx/xlsx uploads, or the string
        "Unsupported file type." for any other extension.
    """
    suffix = uploaded_file.name.rsplit(".", 1)[-1].lower()

    # Reject unsupported types up front — the original wrote a temp file
    # even for uploads it could not parse.
    if suffix not in ("pdf", "docx", "xlsx"):
        return "Unsupported file type."

    with tempfile.NamedTemporaryFile(delete=False, suffix="." + suffix) as tmp_file:
        tmp_file.write(uploaded_file.read())
        tmp_path = tmp_file.name

    try:
        if suffix == "pdf":
            return read_pdf(tmp_path)
        if suffix == "docx":
            return read_docx(tmp_path)
        return read_excel(tmp_path)
    finally:
        # Fix: delete=False means the OS never reclaims this file; the
        # original leaked one temp file per upload.
        os.remove(tmp_path)

# Streamlit App
st.set_page_config(page_title="DocuQuery AI", layout="centered")
st.title("πŸ“„ DocuQuery AI")
st.markdown("Upload a document (PDF, Word, or Excel) and ask questions about its content using LLaMA3.")

uploaded_file = st.file_uploader("Upload your document", type=["pdf", "docx", "xlsx"])

if uploaded_file:
    st.success("βœ… File uploaded successfully.")
    with st.spinner("Reading and processing file..."):
        raw_text = process_file(uploaded_file)

    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    docs = [Document(page_content=chunk) for chunk in splitter.split_text(raw_text)]

    with st.spinner("Indexing document with FAISS..."):
        db = FAISS.from_documents(docs, embedding_model)
        retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})
        qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

    st.success("πŸ“š Document indexed. Ask your question below!")

    user_query = st.text_input("❓ Ask something about the document:")
    if user_query:
        with st.spinner("Generating answer..."):
            response = qa_chain.run(user_query)
            st.markdown(f"**πŸ’¬ Answer:** {response}")