Commit 7381c1f
1 Parent(s): 44be36b
reworked and updated RAG ED

Files changed:
- pages/Project_2.2_-_Langchain_VectorDB.py +0 -23
- pages/Project_3_-_Scrapper.py +0 -24
- pages/Project_5_-_API.py +0 -9
- pages/Project_6_-_RAG.py +0 -20
- pages/Project_6_-_RAG_ED.py +268 -0
- pages/archive/Project_2.2_-_Langchain_VectorDB.py +23 -0
- pages/archive/Project_3_-_Scrapper.py +24 -0
- pages/archive/Project_5_-_API.py +9 -0
- src/__pycache__/functions_langchain.cpython-311.pyc +0 -0
- src/__pycache__/functions_llm.cpython-311.pyc +0 -0
- src/__pycache__/functions_nadia_llm.cpython-311.pyc +0 -0
- src/__pycache__/functions_pdf.cpython-311.pyc +0 -0
- src/functions_langchain.py +23 -1
- src/functions_pdf.py +20 -12
pages/Project_2.2_-_Langchain_VectorDB.py
DELETED
@@ -1,23 +0,0 @@
-import os
-from dotenv import load_dotenv
-import streamlit as st
-from src.functions_langchain import graph_init, initialize_inmemory_vector_store, load_and_split_documents_from_web
-
-load_dotenv()
-
-st.title("Langchain VectorDB")
-st.write("This is a simple demonstration of the Langchain VectorDB.")
-
-vector_store = initialize_inmemory_vector_store()
-all_splits = load_and_split_documents_from_web("https://www.gutenberg.org/files/1342/1342-h/1342-h.htm")
-
-# Index chunks
-_ = vector_store.add_documents(documents=all_splits)
-
-graph = graph_init(vector_store)
-
-question = st.text_input("Enter a question:")
-if st.button("Ask"):
-    st.write("Searching for an answer...")
-    response = graph.invoke({"question": question})
-    st.write(response["answer"])
pages/Project_3_-_Scrapper.py
DELETED
@@ -1,24 +0,0 @@
-import streamlit as st
-import requests
-from bs4 import BeautifulSoup
-from src.functions_scrapper import scrape_website
-
-################################################################################
-tab1, tab2 = st.tabs(["Scrapper", "DB_Extraction"])
-
-st.sidebar.title("App parameters")
-
-link = st.sidebar.text_input("Enter the link to the website you want to scrape")
-selector = st.sidebar.selectbox("Select the tag you want to scrape", ["div", "p", "h1", "span", "a", "img"])
-button = st.sidebar.button("Scrape")
-
-####
-tab1.title("Project 3 - Scrapper")
-
-if link and button and selector:
-    result = scrape_website(link, selector=selector)
-
-    tab1.write(result)
-
-
-
pages/Project_5_-_API.py
DELETED
@@ -1,9 +0,0 @@
-import streamlit as st
-
-
-################################################################################
-
-st.sidebar.title("App parameters")
-
-st.write("This is the API page. It is still under construction.")
-st.write(" Please come back later.")
pages/Project_6_-_RAG.py
DELETED
@@ -1,20 +0,0 @@
-import streamlit as st
-
-
-################################################################################
-
-st.sidebar.title("App parameters")
-
-st.write("This is the RAG page. It is still under construction.")
-st.write("Please come back later.")
-
-
-# https://aws.amazon.com/what-is/retrieval-augmented-generation/
-# https://medium.com/@dminhk/retrieval-augmented-generation-rag-explained-b1dd89979681
-# https://huggingface.co/transformers/model_doc/rag.html
-# https://huggingface.co/transformers/model_doc/rag-tokenizer.html
-
-# (BM25, Dense Passage Retrieval or Sentence Transformers). - need to find a tools for this
-# PostgreSQL or MongoDB - need to find a tools for this ( should be vectorial database) for the future use in semantic search
-# Testing API of indeed, linkedin, pole emploi
-# Testing API of huggingface
pages/Project_6_-_RAG_ED.py
ADDED
@@ -0,0 +1,268 @@
+import streamlit as st
+import os
+from src.functions_pdf import pdfminer_pdf_to_text
+from src.functions_langchain import chunk_and_embed_pdf_text
+from src.functions_langchain import InMemoryVectorStore, graph_init, embeddings
+from src.functions_langchain import State, generate
+
+# https://aws.amazon.com/what-is/retrieval-augmented-generation/
+# https://medium.com/@dminhk/retrieval-augmented-generation-rag-explained-b1dd89979681
+# https://huggingface.co/transformers/model_doc/rag.html
+# https://huggingface.co/transformers/model_doc/rag-tokenizer.html
+
+# (BM25, Dense Passage Retrieval or Sentence Transformers). - need to find a tools for this
+# PostgreSQL or MongoDB - need to find a tools for this ( should be vectorial database) for the future use in semantic search
+# Testing API of indeed, linkedin, pole emploi
+# Testing API of huggingface
+
+################################################################################
+
+# Sidebar
+st.sidebar.title("App Parameters")
+chunk_size = st.sidebar.slider("Chunk Size", 100, 2000, 1000)
+chunk_overlap = st.sidebar.slider("Chunk Overlap", 0, 500, 100)
+
+# Main title
+st.title("RAG chat with PDF")
+st.divider()
+
+
+file = st.file_uploader("Upload a PDF file", type=["pdf"])
+tab1, tab2 = st.tabs(["RAG", "Debugging"])
+
+
+def save_uploaded_file(uploaded_file):
+    path = "temp_uploaded_file.pdf"
+    with open(path, "wb") as f:
+        f.write(uploaded_file.read())
+    return path
+
+def load_and_extract_text(pdf_path):
+    text = pdfminer_pdf_to_text(pdf_path)
+    if os.path.exists(pdf_path):
+        os.remove(pdf_path)
+    return text
+
+def init_vector_store_and_graph(pdf_text, chunk_size, chunk_overlap):
+    chunks, _ = chunk_and_embed_pdf_text(pdf_text, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    vector_store = InMemoryVectorStore(embeddings)
+    vector_store.add_texts(chunks)
+    graph = graph_init(vector_store)
+    return vector_store, graph, chunks
+
+# main tab
+with tab1:
+    if file is not None:
+        if "pdf_path" not in st.session_state or st.session_state["pdf_path"] != file.name:
+            st.session_state["pdf_path"] = file.name
+            st.session_state["temp_pdf_path"] = save_uploaded_file(file)
+            st.session_state["pdf_text"] = None
+            st.session_state["vector_store"] = None
+            st.session_state["graph"] = None
+            st.session_state["chunks"] = None
+            st.session_state["state"] = None
+
+        if st.button("Launch app"):
+            with st.spinner("Extracting and processing PDF..."):
+                text = load_and_extract_text(st.session_state["temp_pdf_path"])
+                if not text:
+                    st.warning("No text extracted from PDF.")
+                else:
+                    st.session_state["pdf_text"] = text
+                    vector_store, graph, chunks = init_vector_store_and_graph(text, chunk_size, chunk_overlap)
+                    st.session_state["vector_store"] = vector_store
+                    st.session_state["graph"] = graph
+                    st.session_state["chunks"] = chunks
+                    st.success(f"Processed PDF with {len(chunks)} chunks.")
+
+        if "graph" in st.session_state and st.session_state["graph"] is not None:
+            query = st.text_input("Ask a question about the PDF:", key="query_tab1")
+            if query:
+                state = State(question=query, context=[], answer="")
+                st.session_state["state"] = state
+                with st.spinner("Retrieving context and generating answer..."):
+                    result_state = st.session_state["graph"].invoke(state)
+                    st.session_state["state"] = result_state
+
+                if result_state.get("context"):
+                    st.success(f"Retrieved {len(result_state['context'])} relevant documents.")
+                    st.markdown("### Answer:")
+                    st.write(result_state.get("answer", "No answer generated."))
+                else:
+                    st.warning("No relevant context found for the question.")
+
+# Debugging tab
+with tab2:
+    if file is not None:
+        st.info(f"Uploaded file: **{file.name}** ({file.size / 1024:.2f} KB)")
+        if st.button("Extract Text"):
+            temp_pdf_path = save_uploaded_file(file)
+            text = load_and_extract_text(temp_pdf_path)
+            if text:
+                st.success("Text extracted successfully!")
+                st.session_state["pdf_text"] = text
+                st.text_area("Extracted Text", text, height=300)
+                st.download_button("Download Extracted Text", text, "extracted_text.txt", "text/plain")
+            else:
+                st.warning("No text extracted. Please check the PDF.")
+
+    if "pdf_text" in st.session_state and st.session_state["pdf_text"]:
+        if st.button("Process and Embed Text"):
+            with st.spinner("Chunking and embedding text..."):
+                vector_store, graph, chunks = init_vector_store_and_graph(st.session_state["pdf_text"], chunk_size, chunk_overlap)
+                st.session_state["vector_store"] = vector_store
+                st.session_state["graph"] = graph
+                st.session_state["chunks"] = chunks
+                st.success(f"Processed {len(chunks)} chunks and created embeddings.")
+                for i, chunk in enumerate(chunks[:3]):
+                    st.markdown(f"**Chunk {i+1}:**")
+                    st.write(chunk)
+
+    if "graph" in st.session_state and st.session_state["graph"] is not None:
+        query_debug = st.text_input("Ask a question about the PDF:", key="query_tab2")
+        if query_debug:
+            state = State(question=query_debug, context=[], answer="")
+            st.session_state["state"] = state
+            with st.spinner("Retrieving context and generating answer..."):
+                result_state = st.session_state["graph"].invoke(state)
+                st.session_state["state"] = result_state
+            if result_state.get("context"):
+                st.success(f"Retrieved {len(result_state['context'])} documents.")
+                st.markdown("### Answer:")
+                st.write(result_state.get("answer", "No answer generated."))
+            else:
+                st.warning("No relevant context found for the question.")
+
+
+# with tab1:
+#     # Upload PDF
+
+#     if file is not None:
+#         temp_file_path = "temp_uploaded_file.pdf"
+#         with open(temp_file_path, "wb") as temp_file:
+#             temp_file.write(file.read())
+
+#         if st.button("Launch app"):
+#             with st.spinner("Preloading information..."):
+#                 text = pdfminer_pdf_to_text(temp_file_path)
+#                 st.session_state["pdf_text"] = text
+
+#                 vector_store = InMemoryVectorStore(embeddings)
+#                 chunks, vectors = chunk_and_embed_pdf_text(st.session_state["pdf_text"], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+
+#                 vector_store = InMemoryVectorStore(embeddings)
+#                 vector_store.add_texts(chunks)
+
+#                 st.session_state["vector_store"] = vector_store
+#                 st.session_state["graph"] = graph_init(vector_store)
+
+#                 st.success("App is ready to use!")
+
+#     if "graph" in st.session_state:
+#         query = st.text_input("Ask a question about the PDF:")
+#         if query:
+#             state = State(question=query, context=[], answer="")
+#             st.session_state["state"] = state
+
+#             with st.spinner("Retrieving context..."):
+#                 context = st.session_state["graph"].invoke(state)
+#                 st.session_state["state"]["context"] = context["context"]
+
+#             if st.session_state["state"]["context"]:
+#                 st.success(f"Retrieved {len(st.session_state['state']['context'])} documents.")
+
+#                 with st.spinner("Generating answer..."):
+#                     answer = generate(st.session_state["state"])
+#                     st.session_state["state"]["answer"] = answer["answer"]
+
+#                 st.markdown("### Answer:")
+#                 st.write(st.session_state["state"]["answer"])
+#             else:
+#                 st.warning("No relevant context found for the question.")
+
+
+
+# with tab2:
+#     ### FIRST ETAPE ----UPLOAD THE PDF-FILE AND RETURN THE TEXT RESULT ----
+
+#     if file is not None:
+#         st.info(f"Uploaded file: **{file.name}** ({file.size / 1024:.2f} KB)")
+
+#         if st.button("Extract Text"):
+#             temp_file_path = "temp_uploaded_file.pdf"
+
+#             with open(temp_file_path, "wb") as temp_file:
+#                 temp_file.write(file.read())
+
+#             text = pdfminer_pdf_to_text(temp_file_path)
+
+#             if os.path.exists(temp_file_path):
+#                 os.remove(temp_file_path)
+
+#             if text:
+#                 st.success("Text extracted successfully!")
+#                 st.session_state["pdf_text"] = text
+
+#                 if st.checkbox("Show extracted text"):
+#                     st.text_area("Extracted Text", text, height=300)
+
+#                 st.download_button(
+#                     label="Download Extracted Text",
+#                     data=text,
+#                     file_name="extracted_text.txt",
+#                     mime="text/plain"
+#                 )
+#             else:
+#                 st.warning("No text extracted. Please check the PDF.")
+#     else:
+#         st.warning("Please upload a PDF file to proceed.")
+
+
+#     # SECOND ETAPE ---- New button and logic for chunking & embedding ( no mongo db, only session state ) ----
+
+
+#     vector_store = InMemoryVectorStore(embeddings)
+
+
+#     if "pdf_text" in st.session_state:
+#         if st.button("Process and Embed Text"):
+#             with st.spinner("Chunking and embedding text..."):
+#                 chunks, vectors = chunk_and_embed_pdf_text(st.session_state["pdf_text"], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+
+#                 # Initialize vector store and add texts
+#                 vector_store = InMemoryVectorStore(embeddings)
+#                 vector_store.add_texts(chunks)
+
+#                 # Save vector store and graph in session state
+#                 st.session_state["vector_store"] = vector_store
+#                 st.session_state["graph"] = graph_init(vector_store)
+
+#                 st.success(f"Processed {len(chunks)} chunks and created embeddings.")
+#                 for i, chunk in enumerate(chunks[:3]):
+#                     st.markdown(f"**Chunk {i+1}:**")
+#                     st.write(chunk)
+
+
+#     # THIRD ETAPE ---- Add a question and answer logic ----
+
+#     if "graph" in st.session_state:
+#         query = st.text_input("Ask a question about the PDF:")
+#         if query:
+#             state = State(question=query, context=[], answer="")
+#             st.session_state["state"] = state
+
+#             with st.spinner("Retrieving context..."):
+#                 context = st.session_state["graph"].invoke(state)
+#                 st.session_state["state"]["context"] = context["context"]
+
+#             if st.session_state["state"]["context"]:
+#                 st.success(f"Retrieved {len(st.session_state['state']['context'])} documents.")
+
+#                 with st.spinner("Generating answer..."):
+#                     answer = generate(st.session_state["state"])
+#                     st.session_state["state"]["answer"] = answer["answer"]
+
+#                 st.markdown("### Answer:")
+#                 st.write(st.session_state["state"]["answer"])
+#             else:
+#                 st.warning("No relevant context found for the question.")
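Read end to end, the new page boils down to a short pipeline: extract text with pdfminer, chunk it, index the chunks in an in-memory store, then answer questions through the LangGraph graph. The sketch below condenses that flow outside Streamlit; it is illustrative only, assumes the helpers exported by src.functions_pdf and src.functions_langchain behave as in this commit, and the PDF path and question are placeholders.

```python
# Illustrative sketch of the pipeline in pages/Project_6_-_RAG_ED.py, run outside Streamlit.
# Assumes the project's own helpers; the PDF path and the question are placeholders.
from src.functions_pdf import pdfminer_pdf_to_text
from src.functions_langchain import (
    chunk_and_embed_pdf_text, InMemoryVectorStore, embeddings, graph_init, State,
)

text = pdfminer_pdf_to_text("example.pdf")                                       # 1. extract raw text
chunks, _ = chunk_and_embed_pdf_text(text, chunk_size=1000, chunk_overlap=100)   # 2. split into chunks
store = InMemoryVectorStore(embeddings)
store.add_texts(chunks)                                                          # 3. index the chunks
graph = graph_init(store)                                                        # 4. build retrieve/generate graph
result = graph.invoke(State(question="What is this document about?", context=[], answer=""))
print(result["answer"])
```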
pages/archive/Project_2.2_-_Langchain_VectorDB.py
ADDED
@@ -0,0 +1,23 @@
+# import os
+# from dotenv import load_dotenv
+# import streamlit as st
+# from src.functions_langchain import graph_init, initialize_inmemory_vector_store, load_and_split_documents_from_web
+
+# load_dotenv()
+
+# st.title("Langchain VectorDB")
+# st.write("This is a simple demonstration of the Langchain VectorDB.")
+
+# vector_store = initialize_inmemory_vector_store()
+# all_splits = load_and_split_documents_from_web("https://www.gutenberg.org/files/1342/1342-h/1342-h.htm")
+
+# # Index chunks
+# _ = vector_store.add_documents(documents=all_splits)
+
+# graph = graph_init(vector_store)
+
+# question = st.text_input("Enter a question:")
+# if st.button("Ask"):
+#     st.write("Searching for an answer...")
+#     response = graph.invoke({"question": question})
+#     st.write(response["answer"])
pages/archive/Project_3_-_Scrapper.py
ADDED
@@ -0,0 +1,24 @@
+# import streamlit as st
+# import requests
+# from bs4 import BeautifulSoup
+# from src.functions_scrapper import scrape_website
+
+# ################################################################################
+# tab1, tab2 = st.tabs(["Scrapper", "DB_Extraction"])
+
+# st.sidebar.title("App parameters")
+
+# link = st.sidebar.text_input("Enter the link to the website you want to scrape")
+# selector = st.sidebar.selectbox("Select the tag you want to scrape", ["div", "p", "h1", "span", "a", "img"])
+# button = st.sidebar.button("Scrape")
+
+# ####
+# tab1.title("Project 3 - Scrapper")
+
+# if link and button and selector:
+#     result = scrape_website(link, selector=selector)
+
+#     tab1.write(result)
+
+
+
pages/archive/Project_5_-_API.py
ADDED
@@ -0,0 +1,9 @@
+# import streamlit as st
+
+
+# ################################################################################
+
+# st.sidebar.title("App parameters")
+
+# st.write("This is the API page. It is still under construction.")
+# st.write(" Please come back later.")
src/__pycache__/functions_langchain.cpython-311.pyc
CHANGED
Binary files a/src/__pycache__/functions_langchain.cpython-311.pyc and b/src/__pycache__/functions_langchain.cpython-311.pyc differ

src/__pycache__/functions_llm.cpython-311.pyc
CHANGED
Binary files a/src/__pycache__/functions_llm.cpython-311.pyc and b/src/__pycache__/functions_llm.cpython-311.pyc differ

src/__pycache__/functions_nadia_llm.cpython-311.pyc
ADDED
Binary file (743 Bytes)

src/__pycache__/functions_pdf.cpython-311.pyc
CHANGED
Binary files a/src/__pycache__/functions_pdf.cpython-311.pyc and b/src/__pycache__/functions_pdf.cpython-311.pyc differ
src/functions_langchain.py
CHANGED
@@ -20,6 +20,8 @@ from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langgraph.graph import START, StateGraph
 from typing_extensions import List, TypedDict
 from langchain_core.vectorstores import InMemoryVectorStore
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.embeddings import HuggingFaceEmbeddings
 
 load_dotenv()
 
@@ -36,12 +38,32 @@ sentry_sdk.init(
     },
 )
 
-client = MongoClient(mongodb_uri, server_api=ServerApi('1'))
+# client = MongoClient(mongodb_uri, server_api=ServerApi('1'))
 
 llm = init_chat_model("llama3-8b-8192", model_provider="groq")
 embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
 prompt = hub.pull("rlm/rag-prompt")
 
+def chunk_and_embed_pdf_text(text: str, chunk_size=1000, chunk_overlap=100):
+    # 1. Split text into chunks
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,        # size of each chunk in characters
+        chunk_overlap=chunk_overlap,  # overlap to preserve context
+        separators=["\n\n", "\n", ".", " "]
+    )
+    chunks = text_splitter.split_text(text)
+
+    # 2. Create HuggingFace embeddings instance
+    embeddings = HuggingFaceEmbeddings(
+        model_name="sentence-transformers/all-mpnet-base-v2"
+    )
+
+    # 3. Embed chunks
+    vectors = embeddings.embed_documents(chunks)
+
+    # Returning both for further processing
+    return chunks, vectors
+
 @serverless_function
 def initialize_inmemory_vector_store() -> InMemoryVectorStore:
     return InMemoryVectorStore(embeddings)
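For reference, a minimal way to exercise the new chunk_and_embed_pdf_text helper on its own; this is not part of the commit, the sample text is made up, and the 768-dimension comment simply reflects the all-mpnet-base-v2 model the helper hard-codes.

```python
# Minimal sketch: call the new helper directly (sample_text is made up).
from src.functions_langchain import chunk_and_embed_pdf_text

sample_text = "Retrieval-augmented generation pairs a retriever with a language model. " * 40
chunks, vectors = chunk_and_embed_pdf_text(sample_text, chunk_size=300, chunk_overlap=30)

print(len(chunks), "chunks")          # number of chunks produced by the splitter
print(len(vectors[0]), "dimensions")  # 768 for sentence-transformers/all-mpnet-base-v2
```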
src/functions_pdf.py
CHANGED
@@ -2,7 +2,7 @@ import pymupdf
 from PyPDF2 import PdfReader
 from pdfminer.high_level import extract_text
 from langchain.document_loaders import PDFPlumberLoader
-
+import streamlit as st
 
 def pymupdf_pdf_to_text(file_path):
     """
@@ -36,19 +36,27 @@ def pypdf2_pdf_to_text(file_path):
         text += page.extract_text() + "\n"
     return text
 
-def pdfminer_pdf_to_text(file_path):
-    """
-    Extract text from a PDF file using pdfminer.
+# def pdfminer_pdf_to_text(file_path):
+#     """
+#     Extract text from a PDF file using pdfminer.
 
-    Args:
-        file_path (str): Path to the PDF file.
+#     Args:
+#         file_path (str): Path to the PDF file.
 
-    Returns:
-        str: Extracted text from the PDF file.
-    """
-    # Implementation for pdfminer extraction goes here
-    text = extract_text(file_path)
-    return text
+#     Returns:
+#         str: Extracted text from the PDF file.
+#     """
+#     # Implementation for pdfminer extraction goes here
+#     text = extract_text(file_path)
+#     return text
+
+def pdfminer_pdf_to_text(pdf_path: str) -> str:
+    try:
+        text = extract_text(pdf_path)
+        return text.strip()
+    except Exception as e:
+        st.error(f"Error extracting text: {e}")
+        return ""
 
 def pdfplumber_pdf_to_text(file_path):
     """