# DocuQuery AI — Hugging Face Spaces app.
# (The lines "Spaces: / Sleeping / Sleeping" were scraped Space status UI, not code.)
import os
import tempfile

import docx
import faiss
import fitz  # PyMuPDF for PDFs
import openpyxl
import streamlit as st
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import Groq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
# LLM backend: Groq-hosted LLaMA3. The API key is read from the environment
# (supplied via Hugging Face Space secrets); os.getenv returns None if unset.
# NOTE(review): `langchain.llms` does not ship a `Groq` class in mainline
# LangChain — the usual integration is `langchain_groq.ChatGroq`; confirm
# this import actually resolves in the deployed environment.
llm = Groq(
    model="llama3-8b-8192",
    api_key=os.getenv("GROQ_API_KEY")
)

# Sentence-transformer embedding model used to vectorize document chunks
# for FAISS similarity search.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
| # File readers | |
def read_pdf(file_path):
    """Extract plain text from every page of a PDF.

    Args:
        file_path: Path to a PDF file on disk.

    Returns:
        The concatenated text of all pages (no separator between pages,
        matching PyMuPDF's per-page extraction).
    """
    doc = fitz.open(file_path)
    try:
        # Join once at the end instead of `+=` per page (avoids quadratic copies).
        return "".join(page.get_text() for page in doc)
    finally:
        # The original leaked the document handle; fitz documents hold OS
        # resources and should be closed explicitly.
        doc.close()
def read_docx(file_path):
    """Return the full text of a Word document, one paragraph per line."""
    paragraphs = docx.Document(file_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)
def read_excel(file_path):
    """Flatten every sheet of an .xlsx workbook into plain text.

    Each row becomes one line of space-joined cell values; None cells are
    skipped. `data_only=True` reads cached formula results instead of the
    formula strings.

    Args:
        file_path: Path to an .xlsx file on disk.

    Returns:
        One newline-terminated line per row across all sheets; "" for an
        empty workbook.
    """
    wb = openpyxl.load_workbook(file_path, data_only=True)
    lines = []
    for ws in wb.worksheets:  # iterate worksheet objects directly
        for row in ws.iter_rows(values_only=True):
            lines.append(" ".join(str(cell) for cell in row if cell is not None))
    # Build once with join instead of `+=` per row (the original was quadratic).
    return "\n".join(lines) + "\n" if lines else ""
def process_file(uploaded_file):
    """Persist an uploaded file to a temp path and extract its text.

    Args:
        uploaded_file: A Streamlit UploadedFile-like object exposing
            `.name` and `.read()`.

    Returns:
        The extracted text for .pdf/.docx/.xlsx uploads, or the string
        "Unsupported file type." for any other extension.
    """
    # Normalize the extension once (the original recomputed .lower() per branch).
    suffix = uploaded_file.name.split(".")[-1].lower()
    with tempfile.NamedTemporaryFile(delete=False, suffix="." + suffix) as tmp_file:
        tmp_file.write(uploaded_file.read())
        tmp_path = tmp_file.name
    try:
        if suffix == "pdf":
            return read_pdf(tmp_path)
        if suffix == "docx":
            return read_docx(tmp_path)
        if suffix == "xlsx":
            return read_excel(tmp_path)
        return "Unsupported file type."
    finally:
        # delete=False keeps the file alive past the with-block so the readers
        # can open it; clean it up here — the original leaked one temp file
        # per upload.
        os.remove(tmp_path)
# --- Streamlit UI ----------------------------------------------------------
# Upload a document, chunk + embed it into a FAISS index, then answer
# questions against the retrieved chunks with the Groq-hosted LLM.
st.set_page_config(page_title="DocuQuery AI", layout="centered")
# NOTE(review): the leading "π"/"β" glyphs in the strings below look like
# mojibake from a mis-decoded emoji — confirm the intended characters.
st.title("π DocuQuery AI")
st.markdown("Upload a document (PDF, Word, or Excel) and ask questions about its content using LLaMA3.")

document = st.file_uploader("Upload your document", type=["pdf", "docx", "xlsx"])
if document:
    st.success("β File uploaded successfully.")

    with st.spinner("Reading and processing file..."):
        extracted_text = process_file(document)
        chunker = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        chunk_docs = [Document(page_content=piece) for piece in chunker.split_text(extracted_text)]

    with st.spinner("Indexing document with FAISS..."):
        vector_store = FAISS.from_documents(chunk_docs, embedding_model)
        doc_retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 4})
        qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=doc_retriever)

    st.success("π Document indexed. Ask your question below!")
    question = st.text_input("β Ask something about the document:")
    if question:
        with st.spinner("Generating answer..."):
            answer = qa.run(question)
            st.markdown(f"**π¬ Answer:** {answer}")