Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import os | |
| import tempfile | |
| import fitz | |
| import docx | |
| import openpyxl | |
| import faiss | |
| from groq import Groq | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| from langchain.vectorstores import FAISS | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.docstore.document import Document | |
# Initialize Groq client (API key is taken from the GROQ_API_KEY environment variable)
groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))


# Embedding model
@st.cache_resource(show_spinner=False)
def _load_embedding_model():
    """Build the sentence-transformers embedding model once and reuse it.

    Streamlit re-executes this script on every widget interaction; without
    caching, the model would be constructed again on each rerun.
    ``show_spinner=False`` keeps this call from rendering any UI element, so
    it is safe to run before ``st.set_page_config``.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


embedding_model = _load_embedding_model()
| # File readers | |
def read_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        A single string made of each page's ``get_text()`` output, with no
        separator added between pages.
    """
    # The context manager closes the document, releasing its file handle,
    # even when text extraction raises part-way through.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)
def read_docx(file_path):
    """Return the paragraph text of a Word document, one paragraph per line.

    Args:
        file_path: Path of the .docx file to open with python-docx.

    Returns:
        The text of each entry in ``paragraphs`` joined with newlines.
    """
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
def read_excel(file_path):
    """Return the cell contents of every sheet in a workbook as plain text.

    Each row becomes one newline-terminated line holding its non-empty cells,
    converted with ``str`` and separated by single spaces. Sheets are visited
    in workbook order; a row with no values still contributes an empty line.

    Args:
        file_path: Path of the .xlsx file to load with openpyxl
            (loaded with ``data_only=True``).

    Returns:
        The concatenated lines for all rows of all sheets.
    """
    workbook = openpyxl.load_workbook(file_path, data_only=True)
    lines = []
    for sheet_name in workbook.sheetnames:
        worksheet = workbook[sheet_name]
        for row_values in worksheet.iter_rows(values_only=True):
            cells = (str(value) for value in row_values if value is not None)
            lines.append(" ".join(cells) + "\n")
    return "".join(lines)
def process_file(uploaded_file):
    """Extract the text of an uploaded PDF, DOCX or XLSX file.

    The readers work on file paths, so the upload is first written to a
    temporary file; that file is removed again before this function returns.

    Args:
        uploaded_file: File-like object exposing ``name`` and ``read()``,
            such as the value returned by ``st.file_uploader``.

    Returns:
        The extracted text, or the string ``"Unsupported file type."`` when
        the extension (compared case-insensitively) is not pdf, docx or xlsx.
    """
    # Text after the last dot; a name without any dot yields the whole name,
    # which then simply falls through to the unsupported branch.
    suffix = uploaded_file.name.rpartition(".")[2]
    extension = suffix.lower()
    if extension not in ("pdf", "docx", "xlsx"):
        # Reject before touching the disk so no temporary file is created.
        return "Unsupported file type."

    with tempfile.NamedTemporaryFile(delete=False, suffix="." + suffix) as tmp_file:
        tmp_file.write(uploaded_file.read())
        tmp_path = tmp_file.name

    try:
        if extension == "pdf":
            return read_pdf(tmp_path)
        if extension == "docx":
            return read_docx(tmp_path)
        return read_excel(tmp_path)
    finally:
        # delete=False means nothing else removes the file. Cleanup is
        # best-effort: a reader that still holds the file open can make the
        # removal fail on some platforms, and that must not mask the result.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
| # Prompt builder | |
def build_prompt(context, question):
    """Assemble the LLM prompt from retrieved context and the user question.

    Args:
        context: Text the model should restrict its answer to.
        question: The question to answer.

    Returns:
        A newline-separated prompt: the instruction line, a ``Context:``
        section, a ``Question:`` section and a trailing ``Answer:`` cue.
    """
    instruction = (
        "You are a helpful assistant. Answer the question based only on "
        "the context provided below."
    )
    sections = (
        instruction,
        "Context:",
        f"{context}",
        "Question:",
        f"{question}",
        "Answer:",
    )
    return "\n".join(sections)
# Streamlit App
# NOTE(review): the emoji in the UI strings below were restored from
# mis-encoded characters; confirm the glyphs against the intended design.
st.set_page_config(page_title="DocuQuery AI", layout="centered")
st.title("📄 DocuQuery AI")
st.markdown("Upload a document and ask questions about it using LLaMA-3 from Groq.")

uploaded_file = st.file_uploader("Upload your document", type=["pdf", "docx", "xlsx"])

if uploaded_file:
    st.success("✅ File uploaded successfully.")

    # Streamlit re-runs this whole script on every widget interaction
    # (including each new question), so the retriever is kept in session
    # state and rebuilt only when a different file is uploaded.
    file_key = (
        getattr(uploaded_file, "file_id", None),
        uploaded_file.name,
        uploaded_file.size,
    )
    if st.session_state.get("docuquery_file_key") != file_key:
        with st.spinner("Processing file..."):
            raw_text = process_file(uploaded_file)
            splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
            docs = [Document(page_content=chunk) for chunk in splitter.split_text(raw_text)]

        retriever = None
        # An index cannot be built from zero chunks (e.g. a scanned PDF with
        # no text layer), so only index when there is something to embed.
        if docs:
            with st.spinner("Embedding & indexing..."):
                db = FAISS.from_documents(docs, embedding_model)
                retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})

        # Store the retriever before the key so a failure above leaves the
        # key unset and the next rerun retries the indexing.
        st.session_state["docuquery_retriever"] = retriever
        st.session_state["docuquery_file_key"] = file_key

    retriever = st.session_state["docuquery_retriever"]
    if retriever is None:
        st.error("No extractable text was found in the document.")
    else:
        st.success("📚 Document indexed. Ask a question!")
        user_query = st.text_input("❓ Ask something about the document:")

        if user_query:
            with st.spinner("Generating response..."):
                # Retrieve the most similar chunks and pass them as the only
                # context the model is allowed to answer from.
                retrieved_docs = retriever.get_relevant_documents(user_query)
                context = "\n".join([doc.page_content for doc in retrieved_docs])
                prompt = build_prompt(context, user_query)
                response = groq_client.chat.completions.create(
                    model="llama3-8b-8192",
                    messages=[
                        {"role": "user", "content": prompt}
                    ]
                )
            st.markdown(f"**💬 Answer:** {response.choices[0].message.content}")