# NOTE(review): removed non-Python page-scrape residue that preceded the code
# (Hugging Face Spaces page chrome: "Sleeping" status, commit hashes, and a
# line-number gutter). It was not part of the program and made the file
# unparseable.
import streamlit as st
import os
import tempfile
import fitz
import docx
import openpyxl
import faiss
from groq import Groq
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
# Initialize Groq client
# NOTE(review): os.environ.get returns None when GROQ_API_KEY is unset;
# presumably the client then fails at the first request rather than here —
# confirm desired failure mode.
groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
# Embedding model
# Small, CPU-friendly sentence-transformers model used to embed every chunk.
# Loaded once at module import so reruns of the Streamlit script reuse it.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# File readers
def read_pdf(file_path):
    """Extract the plain text of every page in a PDF.

    Args:
        file_path: Path to a PDF file on disk.

    Returns:
        The concatenated text of all pages, in page order.
    """
    # Context manager closes the document handle even if a page fails to
    # parse — the original leaked the open document. join() also avoids
    # the quadratic repeated-concatenation of += in a loop.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)
def read_docx(file_path):
    """Return the text of a .docx file, one paragraph per line."""
    document = docx.Document(file_path)
    paragraph_texts = []
    for paragraph in document.paragraphs:
        paragraph_texts.append(paragraph.text)
    return "\n".join(paragraph_texts)
def read_excel(file_path):
    """Flatten every sheet of an .xlsx workbook into plain text.

    Each row becomes one line of space-joined cell values; None cells are
    skipped. data_only=True returns computed values instead of formulas.

    Args:
        file_path: Path to an .xlsx file on disk.

    Returns:
        One newline-terminated line per row across all sheets ("" if the
        workbook has no rows).
    """
    wb = openpyxl.load_workbook(file_path, data_only=True)
    try:
        lines = []
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            for row in ws.iter_rows(values_only=True):
                lines.append(" ".join(str(cell) for cell in row if cell is not None))
        # join() replaces the original's quadratic += accumulation while
        # producing the identical "line\n" per row output.
        return "".join(line + "\n" for line in lines)
    finally:
        # The original never released the workbook's file handle.
        wb.close()
def process_file(uploaded_file):
    """Persist an uploaded file to a temp path and extract its text.

    Args:
        uploaded_file: File-like upload (needs ``.name`` and ``.read()``),
            e.g. a Streamlit UploadedFile.

    Returns:
        The extracted document text, or ``"Unsupported file type."`` for
        extensions other than pdf/docx/xlsx (case-insensitive).
    """
    suffix = uploaded_file.name.split(".")[-1]
    ext = suffix.lower()
    # Reject unsupported types up front — the original still wrote (and
    # leaked) a temp file before discovering the extension was unknown.
    if ext not in ("pdf", "docx", "xlsx"):
        return "Unsupported file type."
    with tempfile.NamedTemporaryFile(delete=False, suffix="." + suffix) as tmp_file:
        tmp_file.write(uploaded_file.read())
        tmp_path = tmp_file.name
    try:
        if ext == "pdf":
            return read_pdf(tmp_path)
        if ext == "docx":
            return read_docx(tmp_path)
        return read_excel(tmp_path)
    finally:
        # The original leaked one temp file per upload; remove it once read.
        os.remove(tmp_path)
# Prompt builder
def build_prompt(context, question):
return f"""You are a helpful assistant. Answer the question based only on the context provided below.
Context:
{context}
Question:
{question}
Answer:"""
# Streamlit App
# (fixed: several UI strings contained mojibake'd emoji — one of them split
# the st.success literal across two lines, which is a SyntaxError in a plain
# string. The garbled glyphs were removed; a trailing "|" scrape artifact was
# dropped as well.)
st.set_page_config(page_title="DocuQuery AI", layout="centered")
st.title("DocuQuery AI")
st.markdown("Upload a document and ask questions about it using LLaMA-3 from Groq.")
uploaded_file = st.file_uploader("Upload your document", type=["pdf", "docx", "xlsx"])
if uploaded_file:
    st.success("File uploaded successfully.")
    with st.spinner("Processing file..."):
        raw_text = process_file(uploaded_file)
        # 500-char chunks with 50-char overlap keep retrieval granular while
        # preserving continuity across chunk boundaries.
        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        docs = [Document(page_content=chunk) for chunk in splitter.split_text(raw_text)]
    with st.spinner("Embedding & indexing..."):
        db = FAISS.from_documents(docs, embedding_model)
        # k=4: ground each answer in the 4 most similar chunks.
        retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})
    st.success("Document indexed. Ask a question!")
    user_query = st.text_input("Ask something about the document:")
    if user_query:
        with st.spinner("Generating response..."):
            retrieved_docs = retriever.get_relevant_documents(user_query)
            context = "\n".join(doc.page_content for doc in retrieved_docs)
            prompt = build_prompt(context, user_query)
            response = groq_client.chat.completions.create(
                model="llama3-8b-8192",
                messages=[{"role": "user", "content": prompt}],
            )
            st.markdown(f"**Answer:** {response.choices[0].message.content}")