import os
import pandas as pd
import PyPDF2
import docx
from sentence_transformers import SentenceTransformer
import faiss
import streamlit as st
import time
from groq import Groq
import re
# Initialize embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# FAISS setup
dimension = 384 # Dimension of 'all-MiniLM-L6-v2' embeddings
index = faiss.IndexFlatL2(dimension)
document_texts = [] # Store text corresponding to embeddings
# Constants for file handling
MAX_FILE_SIZE_MB = 100 # 100 MB
MAX_NUM_FILES = 5
MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
# Set up the Groq API client; read the key from the environment instead of
# hard-coding a secret in the source
api_key = os.environ.get("GROQ_API_KEY")  # export GROQ_API_KEY before running the app
client = Groq(api_key=api_key)
# Function to get human-readable file size
def get_human_readable_size(size_in_bytes):
    if size_in_bytes < 1024:
        return f"{size_in_bytes} Bytes"
    elif size_in_bytes < 1024 ** 2:
        return f"{size_in_bytes / 1024:.2f} KB"
    elif size_in_bytes < 1024 ** 3:
        return f"{size_in_bytes / (1024 ** 2):.2f} MB"
    else:
        return f"{size_in_bytes / (1024 ** 3):.2f} GB"
# Function to extract text from uploaded files
def extract_text_from_file(file):
    text = ""
    if file.name.endswith(".pdf"):
        pdf_reader = PyPDF2.PdfReader(file)
        for page in pdf_reader.pages:
            # extract_text() can return None for image-only pages
            text += page.extract_text() or ""
    elif file.name.endswith(".csv"):
        df = pd.read_csv(file)
        text = "\n".join([" ".join(map(str, row)) for row in df.values])
    elif file.name.endswith(".xlsx") or file.name.endswith(".xls"):
        df = pd.read_excel(file)
        text = "\n".join([" ".join(map(str, row)) for row in df.values])
    elif file.name.endswith(".txt"):
        text = file.read().decode("utf-8")
    elif file.name.endswith(".docx"):
        doc = docx.Document(file)
        text = "\n".join([p.text for p in doc.paragraphs])
    else:
        text = None
    return text
# Function to split large text into smaller chunks
def split_text_into_chunks(text, max_chunk_size=500):
    sentences = text.split(". ")
    chunks = []
    chunk = []
    current_size = 0
    for sentence in sentences:
        sentence_size = len(sentence)
        if current_size + sentence_size <= max_chunk_size:
            chunk.append(sentence)
            current_size += sentence_size
        else:
            # Guard against appending an empty chunk when a single sentence
            # already exceeds max_chunk_size
            if chunk:
                chunks.append(". ".join(chunk))
            chunk = [sentence]
            current_size = sentence_size
    if chunk:
        chunks.append(". ".join(chunk))
    return chunks
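# Illustrative behaviour of the chunker above (assumed input, not measured output):
# three sentences of roughly 200 characters each with the default max_chunk_size=500
# produce two chunks, because the first two sentences fit within 500 characters
# together while the third starts a new chunk.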
# Function to add document text to FAISS index
def add_to_index(text, index, document_texts):
    chunks = split_text_into_chunks(text)
    embeddings = embedding_model.encode(chunks, convert_to_numpy=True)
    index.add(embeddings)
    document_texts.extend(chunks)
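# The FAISS index is only useful at answer time if it is queried. The helper below is
# a minimal sketch of such a retrieval step (the name retrieve_relevant_chunks and the
# default k=3 are illustrative assumptions, not part of the original flow, which passes
# the full document text to the model).
def retrieve_relevant_chunks(query, k=3):
    # Embed the query with the same model used for the document chunks
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    # FAISS returns distances and the positions of the k nearest chunk embeddings
    distances, positions = index.search(query_embedding, k)
    # Map positions back to stored chunk texts; FAISS pads with -1 when the index
    # holds fewer than k vectors
    return [document_texts[i] for i in positions[0] if 0 <= i < len(document_texts)]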
# Function to generate pre-questions based on the document
def suggest_questions(text):
    # Example simple questions based on content length
    if len(text.split()) < 200:
        return [
            "Can you summarize the main points?",
            "What is the main argument or conclusion?",
            "What is the purpose of this document?"
        ]
    else:
        return [
            "What are the key takeaways from this document?",
            "Can you provide a summary of the main sections?",
            "What are the major findings or conclusions?"
        ]
# Function to generate answer using Groq
def generate_answer_with_groq(question, context):
    # Send the context and the user's question to Groq for a response
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": f"Context: {context}\nQuestion: {question}"}],
        model="gemma2-9b-it",
    )
    return chat_completion.choices[0].message.content
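# Illustrative wiring (an assumption, not the flow the UI below uses): combining the
# retrieval sketch above with the Groq call keeps the prompt small even for large
# uploads, e.g.
#   relevant = retrieve_relevant_chunks(query, k=3)
#   answer = generate_answer_with_groq(query, "\n".join(relevant))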
# Function to validate user input (basic check for valid text)
def is_valid_input(query):
    # Check that the input contains only letters, digits, spaces, or common
    # punctuation; this heuristic helps catch typing errors or nonsensical queries
    query = query.strip()
    if not query:
        return False  # Empty input is invalid
    # Regex allowing letters, digits, spaces, and common punctuation
    pattern = r"^[A-Za-z0-9\s.,!?'-]*$"
    return bool(re.match(pattern, query))
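# Illustrative checks for the heuristic above (assumed inputs, not test output):
# is_valid_input("What are the key findings?") returns True, while a stray paste of
# symbols such as "@@@###" returns False; non-Latin questions are also rejected,
# since the pattern only admits ASCII letters, digits, and the listed punctuation.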
# Handling user feedback
def handle_feedback(feedback):
    if feedback:
        st.write("Thank you for your feedback!")
# Streamlit UI
st.title("Enhanced Document Q&A with RAG")
st.sidebar.title("Tips for Better Experience")
st.sidebar.write("""
1. Maximum file size: 100 MB per file.
2. You can upload up to 5 files at a time.
3. Larger files may take longer to process.
4. Please break large files into smaller chunks if necessary.
5. Use the pre-generated questions to guide your inquiry.
""")
feedback = st.sidebar.text_area("Provide feedback to improve your experience:")
# File uploader
uploaded_files = st.file_uploader(
    "Upload documents (PDF, CSV, Excel, TXT, DOCX). Max size: 100 MB each.",
    type=["pdf", "csv", "xlsx", "xls", "txt", "docx"],
    accept_multiple_files=True,
)
if uploaded_files:
    if len(uploaded_files) > MAX_NUM_FILES:
        st.error(f"Maximum {MAX_NUM_FILES} files can be uploaded at a time.")
    else:
        for file in uploaded_files:
            file_size = file.size
            human_readable_size = get_human_readable_size(file_size)
            st.write(f"File: {file.name} | Size: {human_readable_size}")
            if file_size > MAX_FILE_SIZE_BYTES:
                st.warning(
                    f"File '{file.name}' exceeds the {MAX_FILE_SIZE_MB} MB limit. "
                    "We will automatically break this file into smaller chunks."
                )
            with st.spinner(f"Processing {file.name}..."):
                text = extract_text_from_file(file)
                if text:
                    # add_to_index chunks the text itself before embedding it
                    add_to_index(text, index, document_texts)
                    st.success(f"Processed {file.name}")
                else:
                    st.error(f"Could not process {file.name}. Unsupported format.")
else:
    st.warning("No documents uploaded yet. Please upload documents before asking questions.")
# Handle user feedback if provided
if feedback:
    handle_feedback(feedback)
# Input for question
query = st.text_input("Enter your question:")
# If query is entered and documents are uploaded
if query:
    if not document_texts:
        st.warning("Please upload and process documents before asking questions.")
    elif not is_valid_input(query):
        st.error("Please ask a relevant question.")
    else:
        # Use Groq to generate a response based on uploaded documents
        with st.spinner("Generating response..."):
            response = generate_answer_with_groq(query, " ".join(document_texts))
        st.write("### Answer:")
        st.write(response)
        st.write("### Suggested Questions:")
        questions = suggest_questions(" ".join(document_texts))  # Generate based on full document content
        for question in questions:
            st.write(f"- {question}")
# Instructions and reminders
if not uploaded_files:
    st.info("You haven't uploaded any documents yet. Please upload documents to start.")
else:
    st.info("Enter a question to ask about the uploaded documents.")