# DocuQuery AI — Streamlit document Q&A app (Hugging Face Space).
# NOTE(review): the original page header (Space status, file size, commit
# hashes, line-number gutter) was web-scrape residue and has been removed.
import streamlit as st
import os
import tempfile
import fitz # PyMuPDF for PDFs
import docx
import openpyxl
import faiss
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.llms import Groq
from langchain.chains import RetrievalQA
# Load LLM (API key from Hugging Face secrets)
# NOTE(review): `Groq` is imported from `langchain.llms` above; the current
# LangChain Groq integration is `langchain_groq.ChatGroq` — confirm this
# import path actually exists in the pinned langchain version.
llm = Groq(
model="llama3-8b-8192",
api_key=os.getenv("GROQ_API_KEY")
)
# Embeddings model
# sentence-transformers MiniLM: small, CPU-friendly embedding model; downloaded
# from the Hugging Face hub on first use.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# File readers
def read_pdf(file_path):
    """Extract the plain text of every page of a PDF.

    Args:
        file_path: Path to a PDF file on disk.

    Returns:
        The concatenated text of all pages, in page order.
    """
    doc = fitz.open(file_path)
    try:
        # join() instead of quadratic ``+=`` concatenation in a loop
        return "".join(page.get_text() for page in doc)
    finally:
        # fitz documents hold an OS file handle; the original leaked it
        doc.close()
def read_docx(file_path):
    """Return the text of a .docx file, one paragraph per line."""
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
def read_excel(file_path):
    """Flatten every sheet of an .xlsx workbook into plain text.

    Each row becomes one newline-terminated line of space-joined cell
    values; ``None`` (empty) cells are skipped.

    Args:
        file_path: Path to an .xlsx file on disk.

    Returns:
        The extracted text ("" for a workbook with no rows).
    """
    wb = openpyxl.load_workbook(file_path, data_only=True)
    try:
        lines = []
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            for row in ws.iter_rows(values_only=True):
                lines.append(" ".join(str(cell) for cell in row if cell is not None))
        # join() instead of quadratic ``+=``; trailing "\n" matches the
        # original per-row "+ '\\n'" behavior
        return "\n".join(lines) + "\n" if lines else ""
    finally:
        # release the workbook's file handle (the original never closed it)
        wb.close()
def process_file(uploaded_file):
    """Persist an uploaded file to a temp path and extract its text.

    Args:
        uploaded_file: A Streamlit ``UploadedFile`` (needs ``.name`` and
            ``.read()``).

    Returns:
        The extracted document text, or the string
        ``"Unsupported file type."`` for extensions other than
        pdf / docx / xlsx.
    """
    # rsplit handles names with multiple dots; a name with no dot yields the
    # whole name, which falls through to the unsupported branch as before
    suffix = uploaded_file.name.rsplit(".", 1)[-1].lower()
    if suffix not in ("pdf", "docx", "xlsx"):
        # reject early — the original wrote a temp file even for types it
        # could not read
        return "Unsupported file type."

    with tempfile.NamedTemporaryFile(delete=False, suffix="." + suffix) as tmp_file:
        tmp_file.write(uploaded_file.read())
        tmp_path = tmp_file.name
    try:
        if suffix == "pdf":
            return read_pdf(tmp_path)
        if suffix == "docx":
            return read_docx(tmp_path)
        return read_excel(tmp_path)
    finally:
        # the original used delete=False and never removed the temp file,
        # leaking one file per upload
        os.unlink(tmp_path)
# --- Streamlit App ---
# NOTE(review): the emoji in the UI strings below were mojibake in the
# scraped source ("π", "β", "π¬") and one st.success() literal was broken
# across two lines (a syntax error as scraped); they have been reconstructed
# with plausible emoji — confirm against the original Space.
st.set_page_config(page_title="DocuQuery AI", layout="centered")
st.title("📄 DocuQuery AI")
st.markdown(
    "Upload a document (PDF, Word, or Excel) and ask questions about "
    "its content using LLaMA3."
)

uploaded_file = st.file_uploader("Upload your document", type=["pdf", "docx", "xlsx"])

if uploaded_file:
    st.success("✅ File uploaded successfully.")

    with st.spinner("Reading and processing file..."):
        raw_text = process_file(uploaded_file)
        # chunk the raw text so each piece fits the embedding model
        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        docs = [Document(page_content=chunk) for chunk in splitter.split_text(raw_text)]

    with st.spinner("Indexing document with FAISS..."):
        db = FAISS.from_documents(docs, embedding_model)
        # top-4 similar chunks are stuffed directly into the prompt
        retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})
        qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

    st.success("🔍 Document indexed. Ask your question below!")

    user_query = st.text_input("❓ Ask something about the document:")
    if user_query:
        with st.spinner("Generating answer..."):
            response = qa_chain.run(user_query)
        st.markdown(f"**💬 Answer:** {response}")
|