purajith commited on
Commit
f5f1a85
·
verified ·
1 Parent(s): c40d08d

Upload 5 file

Browse files
Files changed (5) hide show
  1. .env +7 -0
  2. data_extraction.py +171 -0
  3. hybrid_search.py +184 -0
  4. requirements.txt +17 -0
  5. stm.py +70 -0
.env ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
FILESYSTEM_CLOUD=s3
FILESYSTEM_DERIVER=s3
AWS_BUCKET=esg-portal-dev
AWS_DEFAULT_REGION=us-east-1
# SECURITY: live AWS and OpenAI credentials were committed in this file and are
# now exposed in git history — they MUST be rotated immediately. Keep .env out
# of version control (.gitignore) and inject real secrets via the environment.
AWS_ACCESS_KEY_ID=<rotated-key-set-locally>
AWS_SECRET_ACCESS_KEY=<rotated-secret-set-locally>
openai_key=<rotated-openai-key-set-locally>
data_extraction.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from docx import Document as DocxDocument # Avoids conflict with langchain's Document
3
+ import csv
4
+ import fitz # PyMuPDF for text extraction
5
+ import camelot # Table extraction
6
+ from langchain.schema import Document # Structured document format
7
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+ import os
9
+ from dotenv import load_dotenv
10
+ load_dotenv()
11
+ import warnings
12
+ warnings.filterwarnings("ignore")
13
+ # Ensure the API key is properly set
14
+ openai_key = os.getenv("openai_key")
15
+ os.environ["OPENAI_API_KEY"] = openai_key # Ensure 'openai_key' is defined
16
+ # Function to read and process .docx files
17
def extract_text_and_tables(docx_path):
    """Extract plain text and tables from a .docx file.

    Returns a tuple ``(text, tables)``: ``text`` is all non-empty paragraph
    text joined by newlines; ``tables`` is a list of langchain ``Document``
    objects, one per table, whose page_content is the stringified row matrix.
    """
    doc = DocxDocument(docx_path)  # renamed import avoids clashing with langchain Document

    # Keep only paragraphs that contain visible text.
    paragraph_text = "\n".join(
        para.text for para in doc.paragraphs if para.text.strip()
    )

    table_docs = []
    for table in doc.tables:
        # Row-major matrix of stripped cell strings, serialized via str().
        matrix = [[cell.text.strip() for cell in row.cells] for row in table.rows]
        table_docs.append(
            Document(page_content=str(matrix), metadata={"source": docx_path})
        )

    return paragraph_text, table_docs
33
+
34
+ # Function to read and process .xlsx (Excel) files
35
def read_excel(file_path):
    """Read every sheet of an Excel workbook into flat text lines.

    Emits one "Sheet: <name>" header per sheet followed by one
    " | "-separated line per data row (column headers are not included).
    Returns the list of lines.
    """
    print(f"Reading Excel file: {file_path}")
    sheets = pd.read_excel(file_path, sheet_name=None)  # dict: sheet name -> DataFrame

    lines = []
    for sheet_name, frame in sheets.items():
        lines.append(f"Sheet: {sheet_name}")
        # str() each cell so mixed dtypes (and NaN) join cleanly.
        lines.extend(" | ".join(str(cell) for cell in row) for row in frame.values)

    return lines
47
+
48
+ # Function to read and process .csv files
49
def read_csv(file_path):
    """Read a CSV file and return one " | "-joined string per row.

    Fixes: the file is now opened with ``newline=""`` as the csv module
    requires (so quoted fields containing newlines parse correctly) and with
    an explicit UTF-8 encoding instead of the platform default.
    """
    print(f"Reading CSV file: {file_path}")

    text = []
    # newline="" lets csv.reader do its own universal-newline handling.
    with open(file_path, mode='r', newline="", encoding="utf-8") as file:
        for row in csv.reader(file):
            text.append(" | ".join(row))

    return text
60
+
61
+ # Function to extract text from PDFs
62
def extract_text(pdf_path):
    """Extracts text from a PDF file and returns it as a list of Document objects.

    One Document per non-blank page, with 1-based page number in metadata.
    Extraction failures are printed and yield an empty (or partial) list.
    """
    documents = []
    try:
        pdf = fitz.open(pdf_path)
        for page_num, page in enumerate(pdf, start=1):
            content = page.get_text("text").strip()
            if not content:
                continue  # skip pages with no extractable text
            documents.append(Document(
                page_content=content,
                metadata={"source": pdf_path, "page": page_num},
            ))
    except Exception as e:
        print(f"❌ Error extracting text: {e}")
    return documents
77
+
78
+ # Function to extract tables from PDFs
79
def extract_tables(pdf_path):
    """Extracts tables from a PDF using Camelot and returns them as Document objects.

    Returns one Document per detected table (1-based table_index). When no
    table is found, or extraction raises, a single placeholder Document is
    returned so downstream chunking always has input.
    """
    table_documents = []
    try:
        tables = camelot.read_pdf(pdf_path, pages="all", flavor="stream")

        if tables.n == 0:
            print(f"⚠️ No tables found in {pdf_path}. Adding dummy data for testing.")
            return [Document(
                page_content="Dummy Table: No real data found",
                metadata={"source": pdf_path, "table_index": 0},
            )]

        for index in range(tables.n):
            table_documents.append(Document(
                page_content=tables[index].df.to_string(),
                metadata={"source": pdf_path, "table_index": index + 1},
            ))
    except Exception as e:
        print(f"❌ Error extracting tables from {pdf_path}: {e}")
        # table_index -1 marks the error placeholder.
        return [Document(
            page_content="Dummy Table: Extraction error",
            metadata={"source": pdf_path, "table_index": -1},
        )]

    return table_documents
101
+
102
+ # Function to chunk tables (for docx and pdf)
103
def chunk_table(documents, chunk_size=2):
    """Chunks table data row-wise from Document objects.

    Splits each Document's page_content on newlines and regroups the lines
    into Documents of at most ``chunk_size`` rows, preserving metadata.
    Non-Document entries are skipped.
    """
    chunks = []
    for doc in documents:
        if not isinstance(doc, Document):
            continue  # ignore anything that is not a langchain Document
        rows = doc.page_content.split("\n")
        chunks.extend(
            Document(
                page_content="\n".join(rows[start:start + chunk_size]),
                metadata=doc.metadata,  # keep the source/table_index of the parent
            )
            for start in range(0, len(rows), chunk_size)
        )
    return chunks
116
+
117
+ # Function to process .docx, .xlsx, .csv, and PDF files
118
def process_files(file, text_chunk_size=1000, chunk_overlap=40, table_chunk_size=2):
    """Extract and chunk one file's content by extension.

    Dispatches on the file suffix (.docx / .xlsx / .xls / .csv / .pdf),
    collects free text and table Documents, then returns the concatenation of
    recursively-split text chunks and row-wise table chunks.
    """
    text_parts = []
    table_docs = []

    # .docx → paragraphs plus table Documents
    if file.endswith(".docx"):
        docx_text, docx_tables = extract_text_and_tables(file)
        text_parts.append(docx_text)
        table_docs.extend(docx_tables)

    # Excel → flat text lines per sheet/row
    if file.endswith((".xlsx", ".xls")):
        text_parts.extend(read_excel(file))

    # CSV → one line per row
    if file.endswith(".csv"):
        text_parts.extend(read_csv(file))

    # PDF → per-page text plus camelot tables
    if file.endswith(".pdf"):
        text_parts.extend(doc.page_content for doc in extract_text(file))
        pdf_tables = extract_tables(file)
        if pdf_tables:
            table_docs.extend(pdf_tables)
        else:
            print(f"⚠️ No tables found in {file}, skipping table embeddings.")

    # Chunk tables only when any were extracted.
    table_chunks = chunk_table(table_docs, chunk_size=table_chunk_size) if table_docs else []

    # Chunk the accumulated text.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=text_chunk_size, chunk_overlap=chunk_overlap
    )
    text_chunks = (
        splitter.split_documents([Document(page_content=t) for t in text_parts])
        if text_parts
        else []
    )

    # Both lists may be empty; the concatenation is then simply [].
    return text_chunks + table_chunks
159
+
160
+ # Function to process multiple files
161
+ # def data_processing(file_paths):
162
+ # all_combined_chunks = {}
163
+ # for file in file_paths:
164
+ # print(f"Processing file: {file.split('/')[-1]}")
165
+ # combined_chunks = process_files(file)
166
+ # all_combined_chunks[file] = combined_chunks
167
+ # return all_combined_chunks
168
+
169
+ # # Example usage
170
+ # file_paths = ["/content/Acceptable Use Policy.docx","/content/RiskAnalysisGuide.pdf"]
171
+ # all_combined_chunks = data_processing(file_paths)
hybrid_search.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from langchain.vectorstores import FAISS
3
+ from langchain.embeddings.openai import OpenAIEmbeddings
4
+ from langchain.embeddings.huggingface import HuggingFaceEmbeddings
5
+ from langchain.document_loaders import TextLoader
6
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ from langchain.retrievers import BM25Retriever, EnsembleRetriever
8
+ from langchain.schema import Document
9
+ from langchain.chains import ConversationChain
10
+ from langchain.chains.conversation.memory import ConversationBufferWindowMemory
11
+ from langchain.callbacks import get_openai_callback
12
+ from sentence_transformers import CrossEncoder
13
+ from langchain.chat_models import ChatOpenAI
14
+ from sentence_transformers import SentenceTransformer
15
+ from data_extraction import process_files
16
+ from dotenv import load_dotenv
17
+ import warnings
18
+ warnings.filterwarnings("ignore")
19
+ load_dotenv()
20
+ # 🔹 Set OpenAI API Key
21
+ all_hybrid_retriever = {}
22
+ file_names = []
23
+ llm_conversations = {} # {filename: ConversationChain}
24
+ all_result = {}
25
+ al_conversation_sum = {}
26
+ openai_key = os.getenv("openai_key")
27
+ os.environ["OPENAI_API_KEY"] = openai_key # Ensure 'openai_key' is defined
28
+
29
+ reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
30
def large_model(llm_model):
    """Return a ChatOpenAI client for the requested model name.

    Bug fix: the model was previously the literal string "llm_model" instead
    of the ``llm_model`` argument, so the caller's model choice was ignored
    (and the literal is not a valid OpenAI model id).
    """
    llm = ChatOpenAI(openai_api_key=openai_key, model=llm_model)
    return llm
33
+
34
# 🔹 Choose Embedding Model
# Module-level default; multimodelrag() compares its `embeding` argument
# against this value to pick the embedding backend.
embedding_option = "open_source"

if embedding_option == "open_source":
    # NOTE(review): the message says "BGE-M3" but the model actually loaded is
    # sentence-transformers/all-MiniLM-L6-v2 — confirm which one is intended.
    print("Using BGE-M3 Embeddings")
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
else:
    print("Using OpenAI Embeddings")
    embeddings = OpenAIEmbeddings(openai_api_key=openai_key, model="text-embedding-ada-002")
43
+
44
class ManualMemory:
    """A fixed-size sliding window of (user query, LLM response) pairs."""

    def __init__(self, history_length=3):
        # Oldest interactions are evicted once the window exceeds history_length.
        self.history = []
        self.history_length = history_length

    def add_interaction(self, user_query, llm_response):
        """Record one exchange, dropping the oldest entry when over capacity."""
        self.history.append((user_query, llm_response))
        while len(self.history) > self.history_length:
            del self.history[0]

    def get_history(self):
        """Render the window as alternating 'User:' / 'LLM:' lines."""
        rendered = [f"User: {q}\nLLM: {r}" for q, r in self.history]
        return "\n".join(rendered)
60
+
61
+
62
+ # 🔹 Function to Create Separate LLM + Memory for Each File
63
def create_conversation_chain():
    """Create an isolated LLM + windowed-memory ConversationChain for one file.

    Bug fix: the window was ``k=0`` (no turns retained) while the comment
    claimed the last 3 interactions were stored; ``k=3`` makes the memory
    match the stated intent and the ManualMemory default used elsewhere.
    """
    llm = ChatOpenAI(openai_api_key=openai_key, model="gpt-4o-mini")
    memory = ConversationBufferWindowMemory(k=3)  # keep the last 3 interactions per file
    return ConversationChain(llm=llm, memory=memory)
67
+
68
def hybrid_retrievers(split_docs):
    """Build a 50/50 dense (FAISS) + sparse (BM25) ensemble retriever.

    Uses the module-level ``embeddings`` for the dense index; the dense side
    returns up to 5 hits, the BM25 side up to 4.
    """
    faiss_store = FAISS.from_documents(split_docs, embeddings)
    dense = faiss_store.as_retriever(search_kwargs={"k": 5})

    sparse = BM25Retriever.from_documents(split_docs)
    sparse.k = 4

    return EnsembleRetriever(
        retrievers=[dense, sparse],
        weights=[0.5, 0.5],
    )
80
+
81
def rerank_with_cross_encoder(query, documents):
    """Re-rank retrieved documents using a cross-encoder model.

    Scores each (query, page_content) pair with the module-level ``reranker``
    and returns (document, score) tuples sorted best-first.
    """
    pairs = [(query, doc.page_content) for doc in documents]
    scores = reranker.predict(pairs)
    ranked_results = sorted(zip(documents, scores), key=lambda pair: pair[1], reverse=True)
    print("ranked_results", ranked_results)  # debug output retained from original
    return ranked_results
88
+
89
+
90
def count_tokens(chain, query, retriever, memory):
    """Retrieve documents, run LLM, and count tokens.

    Pipeline: hybrid retrieval → cross-encoder re-rank → prompt assembled
    from the ranked documents plus ``memory``'s chat history → LLM call
    inside an OpenAI callback (token usage printed) → interaction recorded.

    Returns:
        (result, reranked_docs): the LLM answer string and the re-ranked
        (document, score) tuples used to build the prompt.
    """
    # Retrieve documents but don't store them in memory
    retrieved_docs = retriever.get_relevant_documents(query)
    reranked_docs = rerank_with_cross_encoder(query, retrieved_docs)
    retrieved_text = "\n\n".join([doc.page_content for doc, _ in reranked_docs])  # Extract text

    # Construct the prompt using the chat history and retrieved text.
    # NOTE(review): all ranked documents are injected, not a top-k slice —
    # long files may overflow the model's context window; confirm intent.
    prompt = f"""You are a cybersecurity expert RAG bot, answering queries using retrieved documents and Chat history.
Retrieved documents: \n{retrieved_text}\n\nQuestion: {query}

Chat history:
{memory.get_history()}

If the documents are relevant, use them to answer.
If they don’t have enough useful information, say:
"No info."
Keep your responses clear and accurate."""

    # Generate response using the LLM and the prompt
    with get_openai_callback() as cb:
        result = chain.run(prompt)  # Pass query + retrieved context + chat history as prompt
        print(f"Spent a total of {cb.total_tokens} tokens")

    # Store the interaction in memory (the chain's own window memory also sees it)
    memory.add_interaction(query, result)

    return result, reranked_docs
118
+
119
+
120
+
121
# Per-file pipeline state, all keyed by file basename.
manual_memory = ManualMemory(history_length=3)  # NOTE(review): appears unused — per-file memories are created in multimodelrag
all_manual_memory = {}      # {filename: ManualMemory}
all_retrieved_docs = {}     # {filename: re-ranked (Document, score) tuples}
all_combined_chunks = {}    # {filename: chunked Documents from process_files}
all_hybrid_retriever = {}   # {filename: EnsembleRetriever}
al_conversation_sum = {}    # {filename: ConversationChain}

# Global variables to track previous file paths and embeddings, so state is
# only rebuilt when the uploaded file set or embedding choice changes.
old_file_paths = []
old_embeding = None  # Initialize properly
131
def multimodelrag(query, file_paths, embeding, llm_model, conversation=3):
    """Answer ``query`` against each uploaded file via the hybrid RAG pipeline.

    Args:
        query: the user question.
        file_paths: local paths of the uploaded files.
        embeding: embedding choice compared against module-level ``embedding_option``.
        llm_model: model name. NOTE(review): the ``llm`` built below is never
            used — answers come from create_conversation_chain(), which
            hardcodes "gpt-4o-mini"; confirm which model should respond.
        conversation: ManualMemory window size per file.

    Returns:
        The module-level ``all_result`` dict mapping file basename → answer.
    """
    global old_file_paths, old_embeding
    global all_manual_memory, all_retrieved_docs, all_combined_chunks, all_hybrid_retriever, al_conversation_sum

    print("query, file_paths, embeding, conversation, llm_model", query, file_paths, embeding, conversation, llm_model)

    # NOTE(review): this local ``embeddings`` shadows the module-level variable
    # and is never passed to hybrid_retrievers(), which reads the module-level
    # one — so this selection currently has no effect on the index built below.
    if embedding_option == embeding:
        print("Using BGE-M3 Embeddings")
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    else:
        print("Using OpenAI Embeddings")
        embeddings = OpenAIEmbeddings(openai_api_key=openai_key, model="text-embedding-ada-002")

    # NOTE(review): unused (see docstring).
    llm = ChatOpenAI(openai_api_key=openai_key, model=llm_model)

    # Rebuild all per-file state when the file set or embedding choice changed.
    if (old_file_paths != file_paths) or (old_embeding != embeding):
        # Reset memory only when new files are loaded
        all_manual_memory = {}
        all_retrieved_docs = {}
        all_combined_chunks = {}
        all_hybrid_retriever = {}
        al_conversation_sum = {}

        for file__name in file_paths:
            file = file__name.split("/")[-1]  # basename used as the per-file key

            print("Processing file:", file)
            old_embeding = embeding
            old_file_paths = file_paths

            combined_chunks = process_files(file__name)

            all_combined_chunks[file] = combined_chunks
            all_hybrid_retriever[file] = hybrid_retrievers(all_combined_chunks[file])
            al_conversation_sum[file] = create_conversation_chain()

            # ✅ Create a separate memory instance for each file
            all_manual_memory[file] = ManualMemory(history_length=conversation)

            # Using query
            all_result[file], all_retrieved_docs[file] = count_tokens(
                al_conversation_sum[file], query, all_hybrid_retriever[file], all_manual_memory[file]
            )
    else:
        # Reuse existing memory for the same file
        for file__name in file_paths:
            file = file__name.split("/")[-1]
            print("Reusing memory for:", file)

            all_result[file], all_retrieved_docs[file] = count_tokens(
                al_conversation_sum[file], query, all_hybrid_retriever[file], all_manual_memory[file]
            )

    return all_result
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
python-docx
PyMuPDF          # provides the `fitz` module — do NOT also install the unrelated PyPI `fitz` package (it breaks PyMuPDF imports)
pandas           # used by data_extraction.read_excel (was missing)
python-dotenv    # used by load_dotenv() (was missing)
langchain
langchain_openai
langchain-community
openai==0.28     # single pinned entry; was previously listed twice (pinned AND unpinned)
faiss-cpu
tiktoken
tools
rank_bm25
sentence-transformers
camelot-py
streamlit
frontend         # listed once; was duplicated
stm.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import os
import shutil
from hybrid_search import multimodelrag
import warnings

warnings.filterwarnings("ignore")

# --- Streamlit UI -----------------------------------------------------------
st.set_page_config(layout="wide")
st.title("AI Document Processor with Conversational RAG")

# Initialize conversation history in session state (survives reruns).
if "conversation_history" not in st.session_state:
    st.session_state.conversation_history = []

# Sidebar: file upload and settings.
with st.sidebar:
    uploaded_files = st.file_uploader(
        "Upload multiple files (PDF, DOCX, Excel, CSV)",
        type=["pdf", "docx", "xlsx", "csv"],
        accept_multiple_files=True
    )

    embeding = st.selectbox("Select Memory Mode", ["open_source", "openai"], index=0)
    conversation = st.selectbox("Number of conversation", [2, 4, 6], index=0)
    # Bug fix: the options were "GPT-4o"/"GPT-4o-mini", which are not valid
    # OpenAI model ids (they are lowercase) and are passed straight through to
    # ChatOpenAI(model=...), so every request would fail.
    llm_option = st.selectbox("Select LLM Model", ["gpt-4o", "gpt-4o-mini"], index=1)

    temp_dir = "temp_uploaded_files"

    # Bug fix: file_paths was only bound inside `if uploaded_files:`, so
    # pressing the button without an upload raised NameError below.
    file_paths = []

    # Clear the previous uploads when new files are uploaded.
    if uploaded_files:
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)  # drop the old directory and its contents
        os.makedirs(temp_dir)        # fresh directory for this upload batch

        for file in uploaded_files:
            file_path = os.path.join(temp_dir, file.name)
            with open(file_path, "wb") as f:
                f.write(file.read())  # persist the upload locally
            file_paths.append(file_path)
            st.write(f"✅ Saved: {file.name}")

# Chat interface.
st.write("### Chat Interface")

user_input = st.text_input("Ask a question:")
llm_model = llm_option

if st.button("Retrieve and Answer"):
    # Bug fix: the original `if user_input or uploaded_files:` allowed the
    # no-upload path through to multimodelrag with an undefined file list.
    if user_input and file_paths:
        answer = multimodelrag(user_input, file_paths, embeding, llm_model, conversation)

        # Update conversation history.
        st.session_state.conversation_history.append(f"User: {user_input}")
        st.session_state.conversation_history.append(f"AI: {answer}")

        # Refresh chat display.
        chat_display = "\n".join(st.session_state.conversation_history)
        st.text_area("Conversation History", chat_display, height=400, disabled=True)

        st.write("### Answer:")
        st.write(answer)
    else:
        st.warning("Please upload at least one file and enter a question.")