engrrifatullah committed on
Commit
a0fff8d
·
verified ·
1 Parent(s): 8eb5d55

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -75
app.py CHANGED
@@ -1,90 +1,48 @@
 
 
1
  import os
2
  import streamlit as st
3
  import pandas as pd
4
  import pdfplumber
5
  from sentence_transformers import SentenceTransformer
6
- import faiss
7
  from groq import Groq
8
 
# Initialize the embedding model and Groq API client.
# The Groq API key is a credential, so it is read from the GROQ_API_KEY
# environment variable (configure it as a deployment secret) instead of being
# written into the source, where anyone who can read the repository sees it.
# NOTE(review): a key that has already been committed must be revoked.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
client = Groq(api_key=os.environ["GROQ_API_KEY"])
15
 
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts (the app passes the
            Streamlit upload object).

    Returns:
        The page texts joined with no separator. A page that yields no text
        contributes an empty string.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # extract_text() may yield None for a page with no extractable text
        # (e.g. a scanned image); "or ''" keeps the join from raising TypeError.
        return "".join(page.extract_text() or "" for page in pdf.pages)
23
-
# Function to extract text from CSV or Excel
def extract_text_from_table(file):
    """Flatten a CSV or Excel file into one space-separated string.

    Args:
        file: File-like object with a ``name`` attribute; the extension of
            ``name`` (compared case-insensitively) selects the parser.

    Returns:
        Every cell value converted to ``str`` and joined with single spaces,
        row by row. Column headers are not included.

    Raises:
        ValueError: If the name ends in neither ``.csv`` nor ``.xlsx``.
    """
    # Lower-case once so "DATA.CSV" is handled the same as "data.csv".
    filename = file.name.lower()
    if filename.endswith('.csv'):
        df = pd.read_csv(file)
    elif filename.endswith('.xlsx'):
        df = pd.read_excel(file)
    else:
        raise ValueError("Unsupported file format")
    return " ".join(df.astype(str).values.flatten())
33
 
# Chunk the text
def chunk_text(text, chunk_size=512):
    """Split text into chunks of at most ``chunk_size`` whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunks, each the words re-joined with single spaces. Text
        with no words gives an empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently drop all the text.
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
39
-
# Store embeddings in FAISS
def create_faiss_index(chunks):
    """Embed the chunks and store the vectors in a flat L2 FAISS index.

    Args:
        chunks: Non-empty list of text chunks.

    Returns:
        A tuple ``(index, embeddings)``: the populated ``faiss.IndexFlatL2``
        and the array returned by ``embed_model.encode``.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    # Fail with a clear message here; otherwise an empty input only surfaces
    # later as an IndexError when the embedding dimension is read.
    if len(chunks) == 0:
        raise ValueError("Cannot build a FAISS index from an empty list of chunks")
    embeddings = embed_model.encode(chunks)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index, embeddings
47
-
# Query FAISS for relevant chunks
def query_faiss(index, query, chunks, embeddings, k=3):
    """Return the chunks whose embeddings are nearest to the query.

    Args:
        index: FAISS index to search.
        query: The user's question.
        chunks: The text chunks; result labels from the index are used as
            positions in this list.
        embeddings: Unused; kept so existing callers keep working.
        k: Maximum number of chunks to return (default 3).

    Returns:
        Up to ``k`` chunks in the order the index returns them, or an empty
        list when there are no chunks.
    """
    if not chunks:
        return []
    query_embedding = embed_model.encode([query])
    # Do not request more neighbours than there are chunks.
    # assumes the index holds one vector per chunk — TODO confirm
    _, indices = index.search(query_embedding, k=min(k, len(chunks)))
    # FAISS fills missing results with -1, which would otherwise select
    # chunks[-1]; drop those labels.
    return [chunks[i] for i in indices[0] if i >= 0]
53
-
# Query Groq API
def query_groq(context, question, model="llama3-8b-8192"):
    """Ask the Groq chat-completions API a question about the given context.

    Args:
        context: Text inserted into the prompt after ``Context:``.
        question: Text inserted into the prompt after ``Question:``.
        model: Groq model name; defaults to the model the app has always
            used, so callers can switch models without editing this function.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": f"Context: {context}\nQuestion: {question}"}],
        model=model,
    )
    return response.choices[0].message.content
61
-
# Streamlit App
# Flow: upload a document, extract its text, chunk and index it, then answer
# the user's query from the most relevant chunks.
st.title("RAG-based Application")

uploaded_file = st.file_uploader("Upload your document (PDF, CSV, or Excel)", type=["pdf", "csv", "xlsx"])

if uploaded_file:
    try:
        # Extract text from the uploaded document. The extension is compared
        # case-insensitively so "REPORT.PDF" is still treated as a PDF.
        if uploaded_file.name.lower().endswith('.pdf'):
            document_text = extract_text_from_pdf(uploaded_file)
        else:
            document_text = extract_text_from_table(uploaded_file)

        # Chunk the document and create FAISS index
        st.write("Processing document...")
        chunks = chunk_text(document_text)
        if not chunks:
            # Nothing to index (e.g. an image-only PDF or an empty sheet).
            st.warning("No text could be extracted from the uploaded document.")
        else:
            faiss_index, embeddings = create_faiss_index(chunks)
            st.success("Document processed successfully!")

            # Query input
            user_query = st.text_input("Enter your query:")
            if user_query:
                relevant_chunks = query_faiss(faiss_index, user_query, chunks, embeddings)
                context = " ".join(relevant_chunks)
                answer = query_groq(context, user_query)
                st.subheader("Answer:")
                st.write(answer)
    except Exception as e:
        # Top-level boundary: show any failure in the page.
        st.error(f"Error: {str(e)}")
 
1
+ import numpy # Ensure NumPy is loaded first to avoid FAISS issues
2
+ import faiss # Load FAISS after NumPy
3
  import os
4
  import streamlit as st
5
  import pandas as pd
6
  import pdfplumber
7
  from sentence_transformers import SentenceTransformer
 
8
  from groq import Groq
9
 
# Initialize the embedding model and Groq API
# The key is looked up under the environment variable NAME "GROQ_API_KEY".
# Passing the secret itself to os.environ.get() looks up a variable that does
# not exist (always None) and publishes the key in the source.
# NOTE(review): a key that has already been committed must be revoked.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
13
 
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, joined with single spaces.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts (the app passes the
            Streamlit upload object).

    Returns:
        The page texts in page order, separated by one space. A page that
        yields no text contributes an empty string.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # extract_text() may yield None for a page with no extractable text
        # (e.g. a scanned image); "or ''" keeps the join from raising TypeError.
        return ' '.join(page.extract_text() or '' for page in pdf.pages)
 
 
 
 
 
 
 
 
 
 
 
 
18
 
# Function to create embeddings and store them in FAISS
def create_embeddings(text, chunk_size=500):
    """Split text into fixed-size character chunks and index their embeddings.

    Args:
        text: Non-empty document text.
        chunk_size: Number of characters per chunk (default 500); must be at
            least 1.

    Returns:
        A ``faiss.IndexFlatL2`` holding one embedding per chunk. The chunk
        texts themselves are not returned.

    Raises:
        ValueError: If ``text`` is empty or ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    # Empty text produces no chunks, which would otherwise only fail later
    # when the embedding dimension is read from the encoder output.
    if not text:
        raise ValueError("Cannot create embeddings from empty text")
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    embeddings = embed_model.encode(chunks)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index
26
+
# Streamlit app
def main():
    """Render the page: upload a document, show its text, build the index.

    PDF text comes from ``extract_text_from_pdf``; CSV and Excel files are
    read with pandas and rendered with ``DataFrame.to_string``. The extracted
    text is displayed and then passed to ``create_embeddings``.
    """
    st.title("Document Processing with RAG")
    uploaded_file = st.file_uploader("Upload a document (PDF, CSV, or Excel)", type=["pdf", "csv", "xlsx"])

    if not uploaded_file:
        return

    # Dispatch on the file extension instead of the reported MIME type: a MIME
    # type matching neither branch would leave `text` unassigned and crash
    # with a NameError further down.
    filename = uploaded_file.name.lower()
    if filename.endswith(".pdf"):
        text = extract_text_from_pdf(uploaded_file)
    elif filename.endswith(".csv"):
        text = pd.read_csv(uploaded_file).to_string()
    elif filename.endswith(".xlsx"):
        text = pd.read_excel(uploaded_file).to_string()
    else:
        st.error(f"Unsupported file type: {uploaded_file.type}")
        return

    # Nothing to embed (e.g. an image-only PDF).
    if not text.strip():
        st.warning("No text could be extracted from the uploaded document.")
        return

    st.write("Document Content:")
    st.text_area("Extracted Text", text, height=300)

    st.write("Creating embeddings...")
    # NOTE(review): the returned index is not kept anywhere, so it is rebuilt
    # whenever this function runs again.
    create_embeddings(text)
    st.success("Embeddings created and stored in FAISS index.")


if __name__ == "__main__":
    main()