File size: 3,352 Bytes
8e456d3
 
 
c1a9c71
8e456d3
 
70d0eba
 
c1a9c71
276cacb
8e456d3
 
 
 
c1a9c71
 
8e456d3
c1a9c71
8e456d3
 
70d0eba
8e456d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70d0eba
8e456d3
70d0eba
8e456d3
 
 
 
c1a9c71
 
 
 
 
 
 
 
 
 
 
 
70d0eba
 
 
c1a9c71
8e456d3
70d0eba
8e456d3
 
 
c1a9c71
70d0eba
8e456d3
 
70d0eba
8e456d3
c1a9c71
8e456d3
 
 
c1a9c71
8e456d3
70d0eba
8e456d3
c1a9c71
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import streamlit as st
import os
import tempfile
import fitz
import docx
import openpyxl
import faiss

from groq import Groq
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

# Initialize Groq client; the API key is read from the GROQ_API_KEY
# environment variable (None is passed through if it is unset).
groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))


# Embedding model
@st.cache_resource(show_spinner=False)
def _load_embedding_model():
    """Create the sentence-embedding model once and reuse it across reruns.

    Streamlit re-executes this script from the top on every widget
    interaction; caching the model as a resource avoids re-instantiating it
    each time. ``show_spinner=False`` keeps this call from emitting any UI
    element ahead of ``st.set_page_config`` further down the script.

    Returns:
        The ``HuggingFaceEmbeddings`` instance used to embed document chunks.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


embedding_model = _load_embedding_model()

# File readers
def read_pdf(file_path):
    """Extract the plain text of every page of a PDF.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        The text of all pages concatenated in page order (empty string when
        no page yields any text).
    """
    # The context manager closes the document, releasing its file handle,
    # even if text extraction fails part-way through.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)

def read_docx(file_path):
    """Return the text of a Word document, one paragraph per line.

    Args:
        file_path: Path of the ``.docx`` file to open with python-docx.

    Returns:
        The ``text`` of every entry in the document's ``paragraphs``,
        joined with newline characters.
    """
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

def read_excel(file_path):
    """Flatten every worksheet of an Excel workbook into plain text.

    Each row becomes one line: its non-empty cell values are converted with
    ``str`` and joined by single spaces. Rows from all worksheets are emitted
    in workbook order; a row with no values contributes an empty line.

    Args:
        file_path: Path of the ``.xlsx`` file to open with openpyxl.

    Returns:
        The concatenated row lines, each terminated by a newline.
    """
    # data_only=True asks openpyxl for stored cell values rather than
    # formula strings.
    wb = openpyxl.load_workbook(file_path, data_only=True)

    lines = []
    # Iterate wb.worksheets rather than wb.sheetnames: the latter also lists
    # chart sheets, which have no cell grid and therefore no iter_rows().
    for ws in wb.worksheets:
        for row in ws.iter_rows(values_only=True):
            values = [str(cell) for cell in row if cell is not None]
            lines.append(" ".join(values) + "\n")
    return "".join(lines)

def process_file(uploaded_file):
    """Extract text from an uploaded PDF, DOCX or XLSX file.

    The upload is copied to a temporary file because the format readers take
    a filesystem path; the temporary file is removed once reading finishes.

    Args:
        uploaded_file: File-like object exposing a ``name`` attribute and a
            ``read()`` method (``seek`` is used when available).

    Returns:
        The extracted text, or the string ``"Unsupported file type."`` when
        the file extension is not pdf, docx or xlsx (case-insensitive).
    """
    suffix = uploaded_file.name.split(".")[-1]
    extension = suffix.lower()

    # Reject unknown types before touching the disk.
    if extension not in ("pdf", "docx", "xlsx"):
        return "Unsupported file type."

    # The stream may already have been consumed by an earlier read; rewind
    # so the temporary copy is never silently empty.
    if hasattr(uploaded_file, "seek"):
        uploaded_file.seek(0)

    with tempfile.NamedTemporaryFile(delete=False, suffix="." + suffix) as tmp_file:
        tmp_file.write(uploaded_file.read())
        tmp_path = tmp_file.name

    try:
        if extension == "pdf":
            return read_pdf(tmp_path)
        if extension == "docx":
            return read_docx(tmp_path)
        return read_excel(tmp_path)
    finally:
        # delete=False leaves cleanup to us. Removal is best-effort: a reader
        # that still holds the file open can make unlink fail on some
        # platforms, and that must not mask the extraction result.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass

# Prompt builder
def build_prompt(context, question):
    """Assemble the prompt sent to the language model.

    Args:
        context: Text placed under the ``Context:`` heading.
        question: Text placed under the ``Question:`` heading.

    Returns:
        A single string holding the instruction line followed by the
        context, question and a trailing ``Answer:`` cue, with the sections
        separated by blank lines.
    """
    instruction = (
        "You are a helpful assistant. Answer the question based only on "
        "the context provided below."
    )
    sections = [
        instruction,
        "",
        "Context:",
        f"{context}",
        "",
        "Question:",
        f"{question}",
        "",
        "Answer:",
    ]
    return "\n".join(sections)

# Streamlit App
# Flow: upload a document -> extract its text -> split into chunks -> embed
# and index the chunks in FAISS -> retrieve the chunks most similar to the
# user's question -> ask the Groq-hosted model to answer from those chunks.
st.set_page_config(page_title="DocuQuery AI", layout="centered")
st.title("📄 DocuQuery AI")
st.markdown("Upload a document and ask questions about it using LLaMA-3 from Groq.")

uploaded_file = st.file_uploader("Upload your document", type=["pdf", "docx", "xlsx"])

if uploaded_file:
    st.success("✅ File uploaded successfully.")

    # Streamlit re-runs this script on every interaction, including each
    # question typed below. Keep the retriever in session state and rebuild
    # it only when a different file is uploaded, instead of re-parsing and
    # re-embedding the same document for every question.
    file_key = (
        getattr(uploaded_file, "file_id", None),
        uploaded_file.name,
        uploaded_file.size,
    )
    if st.session_state.get("indexed_file") != file_key:
        with st.spinner("Processing file..."):
            raw_text = process_file(uploaded_file)

        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        docs = [Document(page_content=chunk) for chunk in splitter.split_text(raw_text)]

        # A document with no extractable text (e.g. a scanned PDF) produces
        # no chunks, and an index cannot be built from an empty list.
        if not docs:
            st.error("No extractable text was found in this document.")
            st.stop()

        with st.spinner("Embedding & indexing..."):
            db = FAISS.from_documents(docs, embedding_model)
            st.session_state["retriever"] = db.as_retriever(
                search_type="similarity", search_kwargs={"k": 4}
            )
        st.session_state["indexed_file"] = file_key

    retriever = st.session_state["retriever"]

    st.success("📚 Document indexed. Ask a question!")

    user_query = st.text_input("❓ Ask something about the document:")
    if user_query and user_query.strip():
        with st.spinner("Generating response..."):
            retrieved_docs = retriever.get_relevant_documents(user_query)
            context = "\n".join([doc.page_content for doc in retrieved_docs])

            prompt = build_prompt(context, user_query)

            try:
                response = groq_client.chat.completions.create(
                    model="llama3-8b-8192",
                    messages=[
                        {"role": "user", "content": prompt}
                    ]
                )
            except Exception as exc:
                # Top-level UI boundary: report API/network failures to the
                # user instead of showing a raw traceback.
                st.error(f"The language model request failed: {exc}")
            else:
                st.markdown(f"**💬 Answer:** {response.choices[0].message.content}")