1mpreccable committed on
Commit
7381c1f
·
1 Parent(s): 44be36b

reworked and updated RAG ED

Browse files
pages/Project_2.2_-_Langchain_VectorDB.py DELETED
@@ -1,23 +0,0 @@
1
- import os
2
- from dotenv import load_dotenv
3
- import streamlit as st
4
- from src.functions_langchain import graph_init, initialize_inmemory_vector_store, load_and_split_documents_from_web
5
-
6
- load_dotenv()
7
-
8
- st.title("Langchain VectorDB")
9
- st.write("This is a simple demonstration of the Langchain VectorDB.")
10
-
11
- vector_store = initialize_inmemory_vector_store()
12
- all_splits = load_and_split_documents_from_web("https://www.gutenberg.org/files/1342/1342-h/1342-h.htm")
13
-
14
- # Index chunks
15
- _ = vector_store.add_documents(documents=all_splits)
16
-
17
- graph = graph_init(vector_store)
18
-
19
- question = st.text_input("Enter a question:")
20
- if st.button("Ask"):
21
- st.write("Searching for an answer...")
22
- response = graph.invoke({"question": question})
23
- st.write(response["answer"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pages/Project_3_-_Scrapper.py DELETED
@@ -1,24 +0,0 @@
1
- import streamlit as st
2
- import requests
3
- from bs4 import BeautifulSoup
4
- from src.functions_scrapper import scrape_website
5
-
6
- ################################################################################
7
- tab1, tab2 = st.tabs(["Scrapper", "DB_Extraction"])
8
-
9
- st.sidebar.title("App parameters")
10
-
11
- link = st.sidebar.text_input("Enter the link to the website you want to scrape")
12
- selector = st.sidebar.selectbox("Select the tag you want to scrape", ["div", "p", "h1", "span", "a", "img"])
13
- button = st.sidebar.button("Scrape")
14
-
15
- ####
16
- tab1.title("Project 3 - Scrapper")
17
-
18
- if link and button and selector:
19
- result = scrape_website(link, selector=selector)
20
-
21
- tab1.write(result)
22
-
23
-
24
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pages/Project_5_-_API.py DELETED
@@ -1,9 +0,0 @@
1
- import streamlit as st
2
-
3
-
4
- ################################################################################
5
-
6
- st.sidebar.title("App parameters")
7
-
8
- st.write("This is the API page. It is still under construction.")
9
- st.write(" Please come back later.")
 
 
 
 
 
 
 
 
 
 
pages/Project_6_-_RAG.py DELETED
@@ -1,20 +0,0 @@
1
- import streamlit as st
2
-
3
-
4
- ################################################################################
5
-
6
- st.sidebar.title("App parameters")
7
-
8
- st.write("This is the RAG page. It is still under construction.")
9
- st.write("Please come back later.")
10
-
11
-
12
- # https://aws.amazon.com/what-is/retrieval-augmented-generation/
13
- # https://medium.com/@dminhk/retrieval-augmented-generation-rag-explained-b1dd89979681
14
- # https://huggingface.co/transformers/model_doc/rag.html
15
- # https://huggingface.co/transformers/model_doc/rag-tokenizer.html
16
-
17
- # (BM25, Dense Passage Retrieval or Sentence Transformers). - need to find a tools for this
18
- # PostgreSQL or MongoDB - need to find a tools for this ( should be vectorial database) for the future use in semantic search
19
- # Testing API of indeed, linkedin, pole emploi
20
- # Testing API of huggingface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pages/Project_6_-_RAG_ED.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ from src.functions_pdf import pdfminer_pdf_to_text
4
+ from src.functions_langchain import chunk_and_embed_pdf_text
5
+ from src.functions_langchain import InMemoryVectorStore, graph_init, embeddings
6
+ from src.functions_langchain import State, generate
7
+
8
+ # https://aws.amazon.com/what-is/retrieval-augmented-generation/
9
+ # https://medium.com/@dminhk/retrieval-augmented-generation-rag-explained-b1dd89979681
10
+ # https://huggingface.co/transformers/model_doc/rag.html
11
+ # https://huggingface.co/transformers/model_doc/rag-tokenizer.html
12
+
13
+ # (BM25, Dense Passage Retrieval or Sentence Transformers) - need to find a tool for this
14
+ # PostgreSQL or MongoDB - need to find a tool for this (should be a vector database) for future use in semantic search
15
+ # Testing API of indeed, linkedin, pole emploi
16
+ # Testing API of huggingface
17
+
18
+ ################################################################################
19
+
20
+ # Sidebar
21
+ st.sidebar.title("App Parameters")
22
+ chunk_size = st.sidebar.slider("Chunk Size", 100, 2000, 1000)
23
+ chunk_overlap = st.sidebar.slider("Chunk Overlap", 0, 500, 100)
24
+
25
+ # Main title
26
+ st.title("RAG chat with PDF")
27
+ st.divider()
28
+
29
+
30
+ file = st.file_uploader("Upload a PDF file", type=["pdf"])
31
+ tab1, tab2 = st.tabs(["RAG", "Debugging"])
32
+
33
+
34
def save_uploaded_file(uploaded_file):
    """Persist an uploaded PDF to a fixed temporary path and return that path.

    Args:
        uploaded_file: Binary file-like object (for example the value returned
            by ``st.file_uploader``). It must expose ``getvalue()`` or
            ``read()``.

    Returns:
        str: The path the bytes were written to (``"temp_uploaded_file.pdf"``).
    """
    path = "temp_uploaded_file.pdf"
    # Prefer getvalue(): it returns the whole buffer regardless of the current
    # stream position, so an upload that has already been read once is not
    # silently saved as an empty file. Fall back to read() for plain streams.
    getvalue = getattr(uploaded_file, "getvalue", None)
    data = getvalue() if callable(getvalue) else uploaded_file.read()
    with open(path, "wb") as f:
        f.write(data)
    return path
39
+
40
def load_and_extract_text(pdf_path):
    """Extract the text of the PDF at ``pdf_path`` and delete the file.

    Args:
        pdf_path: Path of the temporary PDF file to read.

    Returns:
        Whatever ``pdfminer_pdf_to_text`` returns for ``pdf_path``.
    """
    try:
        return pdfminer_pdf_to_text(pdf_path)
    finally:
        # The temporary file is single-use: remove it whether extraction
        # succeeded or raised, and tolerate it already being gone.
        try:
            os.remove(pdf_path)
        except FileNotFoundError:
            pass
45
+
46
def init_vector_store_and_graph(pdf_text, chunk_size, chunk_overlap):
    """Chunk the PDF text, index the chunks, and build the graph over them.

    Args:
        pdf_text: Extracted text of the PDF.
        chunk_size: Forwarded to ``chunk_and_embed_pdf_text`` as ``chunk_size``.
        chunk_overlap: Forwarded to ``chunk_and_embed_pdf_text`` as
            ``chunk_overlap``.

    Returns:
        tuple: ``(vector_store, graph, chunks)``.
    """
    text_chunks, _unused_vectors = chunk_and_embed_pdf_text(
        pdf_text,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # The vectors returned above are discarded; the store is handed the raw
    # chunk texts together with the shared ``embeddings`` object.
    store = InMemoryVectorStore(embeddings)
    store.add_texts(text_chunks)
    return store, graph_init(store), text_chunks
52
+
53
+ # main tab
54
+ with tab1:
55
+ if file is not None:
56
+ if "pdf_path" not in st.session_state or st.session_state["pdf_path"] != file.name:
57
+ st.session_state["pdf_path"] = file.name
58
+ st.session_state["temp_pdf_path"] = save_uploaded_file(file)
59
+ st.session_state["pdf_text"] = None
60
+ st.session_state["vector_store"] = None
61
+ st.session_state["graph"] = None
62
+ st.session_state["chunks"] = None
63
+ st.session_state["state"] = None
64
+
65
+ if st.button("Launch app"):
66
+ with st.spinner("Extracting and processing PDF..."):
67
+ text = load_and_extract_text(st.session_state["temp_pdf_path"])
68
+ if not text:
69
+ st.warning("No text extracted from PDF.")
70
+ else:
71
+ st.session_state["pdf_text"] = text
72
+ vector_store, graph, chunks = init_vector_store_and_graph(text, chunk_size, chunk_overlap)
73
+ st.session_state["vector_store"] = vector_store
74
+ st.session_state["graph"] = graph
75
+ st.session_state["chunks"] = chunks
76
+ st.success(f"Processed PDF with {len(chunks)} chunks.")
77
+
78
+ if "graph" in st.session_state and st.session_state["graph"] is not None:
79
+ query = st.text_input("Ask a question about the PDF:", key="query_tab1")
80
+ if query:
81
+ state = State(question=query, context=[], answer="")
82
+ st.session_state["state"] = state
83
+ with st.spinner("Retrieving context and generating answer..."):
84
+ result_state = st.session_state["graph"].invoke(state)
85
+ st.session_state["state"] = result_state
86
+
87
+ if result_state.get("context"):
88
+ st.success(f"Retrieved {len(result_state['context'])} relevant documents.")
89
+ st.markdown("### Answer:")
90
+ st.write(result_state.get("answer", "No answer generated."))
91
+ else:
92
+ st.warning("No relevant context found for the question.")
93
+
94
+ # Debugging tab
95
+ with tab2:
96
+ if file is not None:
97
+ st.info(f"Uploaded file: **{file.name}** ({file.size / 1024:.2f} KB)")
98
+ if st.button("Extract Text"):
99
+ temp_pdf_path = save_uploaded_file(file)
100
+ text = load_and_extract_text(temp_pdf_path)
101
+ if text:
102
+ st.success("Text extracted successfully!")
103
+ st.session_state["pdf_text"] = text
104
+ st.text_area("Extracted Text", text, height=300)
105
+ st.download_button("Download Extracted Text", text, "extracted_text.txt", "text/plain")
106
+ else:
107
+ st.warning("No text extracted. Please check the PDF.")
108
+
109
+ if "pdf_text" in st.session_state and st.session_state["pdf_text"]:
110
+ if st.button("Process and Embed Text"):
111
+ with st.spinner("Chunking and embedding text..."):
112
+ vector_store, graph, chunks = init_vector_store_and_graph(st.session_state["pdf_text"], chunk_size, chunk_overlap)
113
+ st.session_state["vector_store"] = vector_store
114
+ st.session_state["graph"] = graph
115
+ st.session_state["chunks"] = chunks
116
+ st.success(f"Processed {len(chunks)} chunks and created embeddings.")
117
+ for i, chunk in enumerate(chunks[:3]):
118
+ st.markdown(f"**Chunk {i+1}:**")
119
+ st.write(chunk)
120
+
121
+ if "graph" in st.session_state and st.session_state["graph"] is not None:
122
+ query_debug = st.text_input("Ask a question about the PDF:", key="query_tab2")
123
+ if query_debug:
124
+ state = State(question=query_debug, context=[], answer="")
125
+ st.session_state["state"] = state
126
+ with st.spinner("Retrieving context and generating answer..."):
127
+ result_state = st.session_state["graph"].invoke(state)
128
+ st.session_state["state"] = result_state
129
+ if result_state.get("context"):
130
+ st.success(f"Retrieved {len(result_state['context'])} documents.")
131
+ st.markdown("### Answer:")
132
+ st.write(result_state.get("answer", "No answer generated."))
133
+ else:
134
+ st.warning("No relevant context found for the question.")
135
+
136
+
137
+ # with tab1:
138
+ # # Upload PDF
139
+
140
+ # if file is not None:
141
+ # temp_file_path = "temp_uploaded_file.pdf"
142
+ # with open(temp_file_path, "wb") as temp_file:
143
+ # temp_file.write(file.read())
144
+
145
+ # if st.button("Launch app"):
146
+ # with st.spinner("Preloading information..."):
147
+ # text = pdfminer_pdf_to_text(temp_file_path)
148
+ # st.session_state["pdf_text"] = text
149
+
150
+ # vector_store = InMemoryVectorStore(embeddings)
151
+ # chunks, vectors = chunk_and_embed_pdf_text(st.session_state["pdf_text"], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
152
+
153
+ # vector_store = InMemoryVectorStore(embeddings)
154
+ # vector_store.add_texts(chunks)
155
+
156
+ # st.session_state["vector_store"] = vector_store
157
+ # st.session_state["graph"] = graph_init(vector_store)
158
+
159
+ # st.success("App is ready to use!")
160
+
161
+ # if "graph" in st.session_state:
162
+ # query = st.text_input("Ask a question about the PDF:")
163
+ # if query:
164
+ # state = State(question=query, context=[], answer="")
165
+ # st.session_state["state"] = state
166
+
167
+ # with st.spinner("Retrieving context..."):
168
+ # context = st.session_state["graph"].invoke(state)
169
+ # st.session_state["state"]["context"] = context["context"]
170
+
171
+ # if st.session_state["state"]["context"]:
172
+ # st.success(f"Retrieved {len(st.session_state['state']['context'])} documents.")
173
+
174
+ # with st.spinner("Generating answer..."):
175
+ # answer = generate(st.session_state["state"])
176
+ # st.session_state["state"]["answer"] = answer["answer"]
177
+
178
+ # st.markdown("### Answer:")
179
+ # st.write(st.session_state["state"]["answer"])
180
+ # else:
181
+ # st.warning("No relevant context found for the question.")
182
+
183
+
184
+
185
+ # with tab2:
186
+ # ### FIRST ETAPE ----UPLOAD THE PDF-FILE AND RETURN THE TEXT RESULT ----
187
+
188
+ # if file is not None:
189
+ # st.info(f"Uploaded file: **{file.name}** ({file.size / 1024:.2f} KB)")
190
+
191
+ # if st.button("Extract Text"):
192
+ # temp_file_path = "temp_uploaded_file.pdf"
193
+
194
+ # with open(temp_file_path, "wb") as temp_file:
195
+ # temp_file.write(file.read())
196
+
197
+ # text = pdfminer_pdf_to_text(temp_file_path)
198
+
199
+ # if os.path.exists(temp_file_path):
200
+ # os.remove(temp_file_path)
201
+
202
+ # if text:
203
+ # st.success("Text extracted successfully!")
204
+ # st.session_state["pdf_text"] = text
205
+
206
+ # if st.checkbox("Show extracted text"):
207
+ # st.text_area("Extracted Text", text, height=300)
208
+
209
+ # st.download_button(
210
+ # label="Download Extracted Text",
211
+ # data=text,
212
+ # file_name="extracted_text.txt",
213
+ # mime="text/plain"
214
+ # )
215
+ # else:
216
+ # st.warning("No text extracted. Please check the PDF.")
217
+ # else:
218
+ # st.warning("Please upload a PDF file to proceed.")
219
+
220
+
221
+ # # SECOND ETAPE ---- New button and logic for chunking & embedding ( no mongo db, only session state ) ----
222
+
223
+
224
+ # vector_store = InMemoryVectorStore(embeddings)
225
+
226
+
227
+ # if "pdf_text" in st.session_state:
228
+ # if st.button("Process and Embed Text"):
229
+ # with st.spinner("Chunking and embedding text..."):
230
+ # chunks, vectors = chunk_and_embed_pdf_text(st.session_state["pdf_text"], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
231
+
232
+ # # Initialize vector store and add texts
233
+ # vector_store = InMemoryVectorStore(embeddings)
234
+ # vector_store.add_texts(chunks)
235
+
236
+ # # Save vector store and graph in session state
237
+ # st.session_state["vector_store"] = vector_store
238
+ # st.session_state["graph"] = graph_init(vector_store)
239
+
240
+ # st.success(f"Processed {len(chunks)} chunks and created embeddings.")
241
+ # for i, chunk in enumerate(chunks[:3]):
242
+ # st.markdown(f"**Chunk {i+1}:**")
243
+ # st.write(chunk)
244
+
245
+
246
+ # # THIRD ETAPE ---- Add a question and answer logic ----
247
+
248
+ # if "graph" in st.session_state:
249
+ # query = st.text_input("Ask a question about the PDF:")
250
+ # if query:
251
+ # state = State(question=query, context=[], answer="")
252
+ # st.session_state["state"] = state
253
+
254
+ # with st.spinner("Retrieving context..."):
255
+ # context = st.session_state["graph"].invoke(state)
256
+ # st.session_state["state"]["context"] = context["context"]
257
+
258
+ # if st.session_state["state"]["context"]:
259
+ # st.success(f"Retrieved {len(st.session_state['state']['context'])} documents.")
260
+
261
+ # with st.spinner("Generating answer..."):
262
+ # answer = generate(st.session_state["state"])
263
+ # st.session_state["state"]["answer"] = answer["answer"]
264
+
265
+ # st.markdown("### Answer:")
266
+ # st.write(st.session_state["state"]["answer"])
267
+ # else:
268
+ # st.warning("No relevant context found for the question.")
pages/archive/Project_2.2_-_Langchain_VectorDB.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import os
2
+ # from dotenv import load_dotenv
3
+ # import streamlit as st
4
+ # from src.functions_langchain import graph_init, initialize_inmemory_vector_store, load_and_split_documents_from_web
5
+
6
+ # load_dotenv()
7
+
8
+ # st.title("Langchain VectorDB")
9
+ # st.write("This is a simple demonstration of the Langchain VectorDB.")
10
+
11
+ # vector_store = initialize_inmemory_vector_store()
12
+ # all_splits = load_and_split_documents_from_web("https://www.gutenberg.org/files/1342/1342-h/1342-h.htm")
13
+
14
+ # # Index chunks
15
+ # _ = vector_store.add_documents(documents=all_splits)
16
+
17
+ # graph = graph_init(vector_store)
18
+
19
+ # question = st.text_input("Enter a question:")
20
+ # if st.button("Ask"):
21
+ # st.write("Searching for an answer...")
22
+ # response = graph.invoke({"question": question})
23
+ # st.write(response["answer"])
pages/archive/Project_3_-_Scrapper.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import streamlit as st
2
+ # import requests
3
+ # from bs4 import BeautifulSoup
4
+ # from src.functions_scrapper import scrape_website
5
+
6
+ # ################################################################################
7
+ # tab1, tab2 = st.tabs(["Scrapper", "DB_Extraction"])
8
+
9
+ # st.sidebar.title("App parameters")
10
+
11
+ # link = st.sidebar.text_input("Enter the link to the website you want to scrape")
12
+ # selector = st.sidebar.selectbox("Select the tag you want to scrape", ["div", "p", "h1", "span", "a", "img"])
13
+ # button = st.sidebar.button("Scrape")
14
+
15
+ # ####
16
+ # tab1.title("Project 3 - Scrapper")
17
+
18
+ # if link and button and selector:
19
+ # result = scrape_website(link, selector=selector)
20
+
21
+ # tab1.write(result)
22
+
23
+
24
+
pages/archive/Project_5_-_API.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # import streamlit as st
2
+
3
+
4
+ # ################################################################################
5
+
6
+ # st.sidebar.title("App parameters")
7
+
8
+ # st.write("This is the API page. It is still under construction.")
9
+ # st.write(" Please come back later.")
src/__pycache__/functions_langchain.cpython-311.pyc CHANGED
Binary files a/src/__pycache__/functions_langchain.cpython-311.pyc and b/src/__pycache__/functions_langchain.cpython-311.pyc differ
 
src/__pycache__/functions_llm.cpython-311.pyc CHANGED
Binary files a/src/__pycache__/functions_llm.cpython-311.pyc and b/src/__pycache__/functions_llm.cpython-311.pyc differ
 
src/__pycache__/functions_nadia_llm.cpython-311.pyc ADDED
Binary file (743 Bytes). View file
 
src/__pycache__/functions_pdf.cpython-311.pyc CHANGED
Binary files a/src/__pycache__/functions_pdf.cpython-311.pyc and b/src/__pycache__/functions_pdf.cpython-311.pyc differ
 
src/functions_langchain.py CHANGED
@@ -20,6 +20,8 @@ from langchain_text_splitters import RecursiveCharacterTextSplitter
20
  from langgraph.graph import START, StateGraph
21
  from typing_extensions import List, TypedDict
22
  from langchain_core.vectorstores import InMemoryVectorStore
 
 
23
 
24
  load_dotenv()
25
 
@@ -36,12 +38,32 @@ sentry_sdk.init(
36
  },
37
  )
38
 
39
- client = MongoClient(mongodb_uri, server_api=ServerApi('1'))
40
 
41
  llm = init_chat_model("llama3-8b-8192", model_provider="groq")
42
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
43
  prompt = hub.pull("rlm/rag-prompt")
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  @serverless_function
46
  def initialize_inmemory_vector_store() -> InMemoryVectorStore:
47
  return InMemoryVectorStore(embeddings)
 
20
  from langgraph.graph import START, StateGraph
21
  from typing_extensions import List, TypedDict
22
  from langchain_core.vectorstores import InMemoryVectorStore
23
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
24
+ from langchain_community.embeddings import HuggingFaceEmbeddings
25
 
26
  load_dotenv()
27
 
 
38
  },
39
  )
40
 
41
+ # client = MongoClient(mongodb_uri, server_api=ServerApi('1'))
42
 
43
  llm = init_chat_model("llama3-8b-8192", model_provider="groq")
44
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
45
  prompt = hub.pull("rlm/rag-prompt")
46
 
47
def chunk_and_embed_pdf_text(text: str, chunk_size=1000, chunk_overlap=100):
    """Split ``text`` into overlapping chunks and embed every chunk.

    Args:
        text: Raw text to split.
        chunk_size: Size of each chunk in characters.
        chunk_overlap: Number of characters shared between consecutive chunks
            to preserve context.

    Returns:
        tuple: ``(chunks, vectors)`` where ``chunks`` is the list of chunk
        strings and ``vectors`` is the result of
        ``embeddings.embed_documents(chunks)``.
    """
    # 1. Split text into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " "],
    )
    chunks = text_splitter.split_text(text)
    if not chunks:
        # Nothing to embed; skip the model call entirely.
        return [], []

    # 2. Embed chunks with the module-level ``embeddings`` instance. It is
    #    configured with the same model name this function used to instantiate
    #    on every call, so reusing it avoids building a new embeddings object
    #    per invocation.
    vectors = embeddings.embed_documents(chunks)

    # Returning both for further processing
    return chunks, vectors
66
+
67
  @serverless_function
68
  def initialize_inmemory_vector_store() -> InMemoryVectorStore:
69
  return InMemoryVectorStore(embeddings)
src/functions_pdf.py CHANGED
@@ -2,7 +2,7 @@ import pymupdf
2
  from PyPDF2 import PdfReader
3
  from pdfminer.high_level import extract_text
4
  from langchain.document_loaders import PDFPlumberLoader
5
-
6
 
7
  def pymupdf_pdf_to_text(file_path):
8
  """
@@ -36,19 +36,27 @@ def pypdf2_pdf_to_text(file_path):
36
  text += page.extract_text() + "\n"
37
  return text
38
 
39
- def pdfminer_pdf_to_text(file_path):
40
- """
41
- Extract text from a PDF file using pdfminer.
42
 
43
- Args:
44
- file_path (str): Path to the PDF file.
45
 
46
- Returns:
47
- str: Extracted text from the PDF file.
48
- """
49
- # Implementation for pdfminer extraction goes here
50
- text = extract_text(file_path)
51
- return text
 
 
 
 
 
 
 
 
52
 
53
  def pdfplumber_pdf_to_text(file_path):
54
  """
 
2
  from PyPDF2 import PdfReader
3
  from pdfminer.high_level import extract_text
4
  from langchain.document_loaders import PDFPlumberLoader
5
+ import streamlit as st
6
 
7
  def pymupdf_pdf_to_text(file_path):
8
  """
 
36
  text += page.extract_text() + "\n"
37
  return text
38
 
39
+ # def pdfminer_pdf_to_text(file_path):
40
+ # """
41
+ # Extract text from a PDF file using pdfminer.
42
 
43
+ # Args:
44
+ # file_path (str): Path to the PDF file.
45
 
46
+ # Returns:
47
+ # str: Extracted text from the PDF file.
48
+ # """
49
+ # # Implementation for pdfminer extraction goes here
50
+ # text = extract_text(file_path)
51
+ # return text
52
+
53
def pdfminer_pdf_to_text(pdf_path: str) -> str:
    """Return the stripped text that pdfminer extracts from ``pdf_path``.

    Any exception raised during extraction is reported through ``st.error``
    and an empty string is returned instead.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The extracted text with surrounding whitespace removed, or ``""`` on
        failure.
    """
    try:
        return extract_text(pdf_path).strip()
    except Exception as exc:
        st.error(f"Error extracting text: {exc}")
    return ""
60
 
61
  def pdfplumber_pdf_to_text(file_path):
62
  """