Spaces:

shamilcoded
/

DocuQuery_AI

Sleeping

App Files Files Community

DocuQuery_AI / app.py

shamilcoded

Update app.py

c1a9c71 verified 10 months ago

raw

history blame contribute delete

3.35 kB

	import streamlit as st
	import os
	import tempfile
	import fitz
	import docx
	import openpyxl
	import faiss

	from groq import Groq
	from langchain_community.embeddings import HuggingFaceEmbeddings
	from langchain.vectorstores import FAISS
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.docstore.document import Document

	# Initialize Groq client
	# The API key is taken from the GROQ_API_KEY environment variable;
	# os.environ.get yields None when that variable is not set.
	groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

	# Embedding model
	# Single module-level embedding instance, handed to FAISS.from_documents
	# when the uploaded document is indexed.
	# NOTE(review): this is constructed every time the module body executes;
	# if the host re-runs the script per interaction, consider caching — confirm.
	embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

	# File readers
	def read_pdf(file_path):
	    """Return the concatenated text of every page in a PDF.

	    Args:
	        file_path: Path of the PDF file to open with PyMuPDF (``fitz``).

	    Returns:
	        The text of all pages joined together in page order, with no
	        separator added between pages.
	    """
	    # Open the document as a context manager so its file handle is released
	    # even if text extraction fails part-way through.
	    with fitz.open(file_path) as doc:
	        # Join once instead of growing a string page by page.
	        return "".join(page.get_text() for page in doc)

	def read_docx(file_path):
	    """Return the text of a Word document, one paragraph per line.

	    Args:
	        file_path: Path of the .docx file to open with python-docx.

	    Returns:
	        The ``text`` of each entry in the document's ``paragraphs``,
	        joined with newline characters.
	    """
	    document = docx.Document(file_path)
	    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
	    return "\n".join(paragraph_texts)

	def read_excel(file_path):
	    """Flatten every worksheet of an Excel workbook into plain text.

	    Each row becomes one line: its non-None cell values are converted with
	    ``str`` and separated by single spaces, and the line ends with a
	    newline. Sheets are visited in ``sheetnames`` order.

	    Args:
	        file_path: Path of the .xlsx file to load with openpyxl.

	    Returns:
	        All row lines concatenated into a single string.
	    """
	    # data_only=True is forwarded to openpyxl unchanged.
	    # NOTE(review): presumably this yields stored values rather than
	    # formula text — confirm against the openpyxl documentation.
	    workbook = openpyxl.load_workbook(file_path, data_only=True)
	    lines = []
	    for sheet_name in workbook.sheetnames:
	        worksheet = workbook[sheet_name]
	        for row in worksheet.iter_rows(values_only=True):
	            cells = (str(value) for value in row if value is not None)
	            lines.append(" ".join(cells) + "\n")
	    return "".join(lines)

	def process_file(uploaded_file):
	    """Extract text from an uploaded PDF, DOCX or XLSX file.

	    The upload is copied to a temporary file on disk because the reader
	    functions open documents by path. That temporary file is removed again
	    before this function returns, whether or not the reader succeeded.

	    Args:
	        uploaded_file: File-like object exposing a ``name`` attribute and a
	            ``read()`` method that returns the file's bytes.

	    Returns:
	        The extracted text, or the string "Unsupported file type." when the
	        extension (compared case-insensitively) is not pdf, docx or xlsx.
	    """
	    suffix = uploaded_file.name.split(".")[-1]
	    # delete=False keeps the file on disk after the handle is closed so the
	    # readers can reopen it by path; it is removed explicitly below.
	    with tempfile.NamedTemporaryFile(delete=False, suffix="." + suffix) as tmp_file:
	        tmp_file.write(uploaded_file.read())
	        tmp_path = tmp_file.name

	    extension = suffix.lower()
	    try:
	        if extension == "pdf":
	            return read_pdf(tmp_path)
	        if extension == "docx":
	            return read_docx(tmp_path)
	        if extension == "xlsx":
	            return read_excel(tmp_path)
	        return "Unsupported file type."
	    finally:
	        # Best-effort cleanup: a failure to delete the temporary file must
	        # not hide the extracted text or the reader's own exception.
	        try:
	            os.remove(tmp_path)
	        except OSError:
	            pass

	# Prompt builder
	def build_prompt(context, question):
	    """Assemble the single-message prompt sent to the chat model.

	    Args:
	        context: Text the model is told to base its answer on.
	        question: The user's question about that text.

	    Returns:
	        A string with an instruction line followed by "Context:",
	        "Question:" and a trailing "Answer:" section, separated by
	        blank lines.
	    """
	    sections = (
	        "You are a helpful assistant. Answer the question based only on the context provided below.",
	        "",
	        "Context:",
	        f"{context}",
	        "",
	        "Question:",
	        f"{question}",
	        "",
	        "Answer:",
	    )
	    return "\n".join(sections)

	# Streamlit App
	# Flow: upload a document -> extract its text -> split it into chunks ->
	# embed and index the chunks in FAISS -> answer questions using the four
	# most similar chunks as context for the Groq chat model.
	st.set_page_config(page_title="DocuQuery AI", layout="centered")
	st.title("📄 DocuQuery AI")
	st.markdown("Upload a document and ask questions about it using LLaMA-3 from Groq.")

	uploaded_file = st.file_uploader("Upload your document", type=["pdf", "docx", "xlsx"])

	if uploaded_file:
	    st.success("✅ File uploaded successfully.")
	    with st.spinner("Processing file..."):
	        raw_text = process_file(uploaded_file)

	    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
	    docs = [Document(page_content=chunk) for chunk in splitter.split_text(raw_text)]

	    if not docs:
	        # A document with no extractable text (e.g. a scanned PDF or an
	        # empty workbook) produces zero chunks, and building a FAISS index
	        # from an empty list fails; tell the user instead of crashing.
	        st.warning("No extractable text was found in this document.")
	    else:
	        with st.spinner("Embedding & indexing..."):
	            db = FAISS.from_documents(docs, embedding_model)
	            retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})

	        st.success("📚 Document indexed. Ask a question!")

	        user_query = st.text_input("❓ Ask something about the document:")
	        if user_query:
	            with st.spinner("Generating response..."):
	                retrieved_docs = retriever.get_relevant_documents(user_query)
	                context = "\n".join([doc.page_content for doc in retrieved_docs])

	                prompt = build_prompt(context, user_query)

	                response = groq_client.chat.completions.create(
	                    model="llama3-8b-8192",
	                    messages=[
	                        {"role": "user", "content": prompt}
	                    ]
	                )

	            st.markdown(f"💬 Answer: {response.choices[0].message.content}")