shamilcoded committed on
Commit
8e456d3
·
verified ·
1 Parent(s): e8294f8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -0
app.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import tempfile
4
+ import faiss
5
+ import fitz # PyMuPDF for PDFs
6
+ import docx
7
+ import openpyxl
8
+ from langchain_community.embeddings import HuggingFaceEmbeddings
9
+ from langchain.vectorstores import FAISS
10
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
11
+ from langchain.docstore.document import Document
12
+ from langchain_community.llms import Groq
13
+ from langchain.chains import RetrievalQA
14
+ from langchain.schema import Document as LCDocument
15
+
16
# ---- Model setup ----------------------------------------------------------

# LLM backend: Groq-hosted LLaMA3.
# NOTE(review): `Groq` is imported from langchain_community.llms above —
# verify that class exists in the pinned langchain version; the Groq chat
# model normally ships in the separate `langchain_groq` package.
llm = Groq(
    model="llama3-8b-8192",
    api_key=os.getenv("GROQ_API_KEY"),  # set via Hugging Face Space secrets
)

# Sentence-level embedding model used to vectorize document chunks.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
24
+
25
# File processors

def read_pdf(file_path):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        file_path: Path to a PDF file on disk.

    Returns:
        The full document text as a single string.
    """
    doc = fitz.open(file_path)
    try:
        # join is linear; the original `+=` loop was quadratic on large docs
        return "".join(page.get_text() for page in doc)
    finally:
        # Bug fix: the document was never closed, leaking the file handle.
        doc.close()
32
+
33
def read_docx(file_path):
    """Return the text of a .docx file, one paragraph per line."""
    paragraphs = docx.Document(file_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)
36
+
37
def read_excel(file_path):
    """Flatten every sheet of an .xlsx workbook into plain text.

    Each row becomes one space-joined line (empty cells skipped),
    terminated by a newline — matching the original output exactly.

    Args:
        file_path: Path to an .xlsx file on disk.

    Returns:
        The workbook contents as newline-terminated row lines.
    """
    wb = openpyxl.load_workbook(file_path, data_only=True)
    try:
        lines = []
        for sheet_name in wb.sheetnames:
            for row in wb[sheet_name].iter_rows(values_only=True):
                lines.append(" ".join(str(cell) for cell in row if cell is not None))
        # list + join instead of string `+=` — avoids quadratic concatenation
        return "".join(line + "\n" for line in lines)
    finally:
        # Release the workbook's underlying file handle.
        wb.close()
45
+
46
def process_file(uploaded_file):
    """Persist an uploaded file to a temp path and extract its text.

    Args:
        uploaded_file: A Streamlit UploadedFile-like object exposing
            ``.name`` and ``.read()``.

    Returns:
        The extracted document text, or the string
        "Unsupported file type." for extensions other than pdf/docx/xlsx.
    """
    # Lower-case once so the dispatch below is case-insensitive.
    suffix = uploaded_file.name.split(".")[-1].lower()
    with tempfile.NamedTemporaryFile(delete=False, suffix="." + suffix) as tmp_file:
        tmp_file.write(uploaded_file.read())
        tmp_path = tmp_file.name

    try:
        if suffix == "pdf":
            return read_pdf(tmp_path)
        elif suffix == "docx":
            return read_docx(tmp_path)
        elif suffix == "xlsx":
            return read_excel(tmp_path)
        else:
            return "Unsupported file type."
    finally:
        # Bug fix: the temp file was created with delete=False and never
        # removed, leaking one file on disk per upload.
        os.remove(tmp_path)
60
+
61
# ---- Streamlit UI ---------------------------------------------------------

st.title("📄 RAG Document QA with Faiss + LLaMA3")

uploaded_file = st.file_uploader("Upload a PDF, Word or Excel file", type=["pdf", "docx", "xlsx"])

if uploaded_file:
    st.success("✅ File uploaded successfully.")
    raw_text = process_file(uploaded_file)

    # Chunk the raw text so each piece fits the embedding model comfortably.
    chunker = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunk_docs = [Document(page_content=chunk) for chunk in chunker.split_text(raw_text)]

    # Build the FAISS index and wire it into a retrieval-QA chain.
    with st.spinner("Indexing document..."):
        vector_store = FAISS.from_documents(chunk_docs, embedding_model)
        doc_retriever = vector_store.as_retriever(
            search_type="similarity", search_kwargs={"k": 4}
        )
        qa = RetrievalQA.from_chain_type(
            llm=llm, chain_type="stuff", retriever=doc_retriever
        )

    st.success("✅ Document indexed! Ask your questions below:")

    user_query = st.text_input("❓ Ask a question about your document")
    if user_query:
        with st.spinner("Generating answer..."):
            answer = qa.run(user_query)
            st.markdown(f"**💬 Answer:** {answer}")