Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
app.py
CHANGED
|
@@ -18,12 +18,22 @@ from llama_index.core.query_engine import SimpleMultiModalQueryEngine
|
|
| 18 |
from llama_index.llms.openai import OpenAI
|
| 19 |
from llama_index.core import load_index_from_storage, get_response_synthesizer
|
| 20 |
import tempfile
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
-
# from dotenv import load_dotenv
|
| 23 |
-
# load_dotenv()
|
| 24 |
|
| 25 |
-
# Read the OpenAI key from the environment instead of committing it to the
# repository. The key that used to be hard-coded here has been published in
# version control and should be revoked.
# The rest of the file constructs OpenAI(...) without passing a key, so the
# client is expected to pick OPENAI_API_KEY up from the environment on its own;
# nothing needs to be written back into os.environ.
# OPENAI_API_KEY stays defined (empty string when unset) for any code that
# still references the module-level name.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
|
| 27 |
|
| 28 |
|
| 29 |
def extract_text_from_pdf(pdf_path):
|
|
@@ -104,20 +114,27 @@ def remove_duplicate_images(data_path) :
|
|
| 104 |
except Exception as e:
|
| 105 |
print(e)
|
| 106 |
pass
|
| 107 |
-
|
| 108 |
-
|
|
|
|
| 109 |
|
| 110 |
-
|
| 111 |
-
# client = qdrant_client.QdrantClient(host = "192.168.0.1" , port = 2401 , https = True)
|
| 112 |
# client = qdrant_client.QdrantClient(url = "http://localhost:2452")
|
| 113 |
-
# Connect to the hosted Qdrant cluster. The API key is read from the
# QDRANT_API_KEY environment variable rather than being embedded in the
# source; the key that used to be hard-coded here is exposed in version
# control and should be rotated. If the variable is unset, api_key is None
# and the cluster is expected to reject the requests.
client = qdrant_client.QdrantClient(
    url="4b0af7be-d5b3-47ac-b215-128ebd6aa495.europe-west3-0.gcp.cloud.qdrant.io:6333",
    api_key=os.environ.get("QDRANT_API_KEY"),
)
|
|
|
|
| 114 |
|
| 115 |
if "vectordatabase" not in st.session_state or not st.session_state.vectordatabase:
|
| 116 |
-
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
storage_context = StorageContext.from_defaults(vector_store=text_store, image_store=image_store)
|
| 119 |
-
documents = SimpleDirectoryReader(os.path.join(temp_dir, "
|
| 120 |
index = MultiModalVectorStoreIndex.from_documents(documents, storage_context=storage_context)
|
|
|
|
| 121 |
st.session_state.vectordatabase = index
|
| 122 |
else :
|
| 123 |
index = st.session_state.vectordatabase
|
|
@@ -153,7 +170,7 @@ def retrieve_and_query(query, retriever_engine):
|
|
| 153 |
)
|
| 154 |
qa_tmpl = PromptTemplate(qa_tmpl_str)
|
| 155 |
|
| 156 |
-
llm = OpenAI(model="gpt-4o
|
| 157 |
response_synthesizer = get_response_synthesizer(response_mode="refine", text_qa_template=qa_tmpl, llm=llm)
|
| 158 |
|
| 159 |
response = response_synthesizer.synthesize(query, nodes=retrieval_results)
|
|
@@ -166,15 +183,16 @@ def retrieve_and_query(query, retriever_engine):
|
|
| 166 |
|
| 167 |
return response, retrieved_image_path_list
|
| 168 |
|
| 169 |
-
def process_pdf(pdf_file):
|
|
|
|
| 170 |
temp_dir = tempfile.TemporaryDirectory()
|
| 171 |
temp_pdf_path = os.path.join(temp_dir.name, pdf_file.name)
|
| 172 |
with open(temp_pdf_path, "wb") as f:
|
| 173 |
f.write(pdf_file.getvalue())
|
| 174 |
|
| 175 |
-
data_path = os.path.join(temp_dir.name, "
|
| 176 |
os.makedirs(data_path , exist_ok=True)
|
| 177 |
-
img_save_path = os.path.join(temp_dir.name, "
|
| 178 |
os.makedirs(img_save_path , exist_ok=True)
|
| 179 |
|
| 180 |
extracted_text = extract_text_from_pdf(temp_pdf_path)
|
|
@@ -185,7 +203,8 @@ def process_pdf(pdf_file):
|
|
| 185 |
moved_count = move_images(img_save_path, data_path)
|
| 186 |
remove_low_size_images(data_path)
|
| 187 |
remove_duplicate_images(data_path)
|
| 188 |
-
|
|
|
|
| 189 |
|
| 190 |
return temp_dir, retriever_engine
|
| 191 |
|
|
@@ -199,13 +218,15 @@ def main():
|
|
| 199 |
st.session_state.vectordatabase = None
|
| 200 |
|
| 201 |
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
|
|
|
|
| 202 |
if uploaded_file is None:
|
| 203 |
st.info("Please upload a PDF file.")
|
| 204 |
else:
|
|
|
|
| 205 |
st.info(f"Uploaded PDF: {uploaded_file.name}")
|
| 206 |
if st.button("Process PDF"):
|
| 207 |
with st.spinner("Processing PDF..."):
|
| 208 |
-
temp_dir, st.session_state.retriever_engine = process_pdf(uploaded_file)
|
| 209 |
|
| 210 |
st.success("PDF processed successfully!")
|
| 211 |
|
|
|
|
| 18 |
from llama_index.llms.openai import OpenAI
|
| 19 |
from llama_index.core import load_index_from_storage, get_response_synthesizer
|
| 20 |
import tempfile
|
| 21 |
+
from qdrant_client import QdrantClient, models
|
| 22 |
+
import getpass
|
| 23 |
+
|
| 24 |
+
curr_user = getpass.getuser()
|
| 25 |
+
# from langchain.vectorstores import Chroma
|
| 26 |
+
# Reuse the already-running event loop so that
|
| 27 |
+
# async calls can run inside a notebook.
|
| 28 |
+
|
| 29 |
+
# import nest_asyncio
|
| 30 |
+
|
| 31 |
+
# nest_asyncio.apply()
|
| 32 |
+
|
| 33 |
+
from dotenv import load_dotenv
|
| 34 |
+
load_dotenv()
|
| 35 |
|
|
|
|
|
|
|
| 36 |
|
|
|
|
|
|
|
| 37 |
|
| 38 |
|
| 39 |
def extract_text_from_pdf(pdf_path):
|
|
|
|
| 114 |
except Exception as e:
|
| 115 |
print(e)
|
| 116 |
pass
|
| 117 |
+
# from langchain_chroma import Chroma
|
| 118 |
+
# import chromadb
|
| 119 |
+
def initialize_qdrant(temp_dir , file_name , user):
|
| 120 |
|
| 121 |
+
client = qdrant_client.QdrantClient(path=f"qdrant_mm_db_pipeline_{user}_{file_name}")
|
|
|
|
| 122 |
# client = qdrant_client.QdrantClient(url = "http://localhost:2452")
|
| 123 |
+
# client = qdrant_client.QdrantClient(url="4b0af7be-d5b3-47ac-b215-128ebd6aa495.europe-west3-0.gcp.cloud.qdrant.io:6333", api_key=os.environ["QDRANT_API_KEY"])
|
| 124 |
+
# client = qdrant_client.AsyncQdrantClient(location = ":memory:")
|
| 125 |
|
| 126 |
if "vectordatabase" not in st.session_state or not st.session_state.vectordatabase:
|
| 127 |
+
|
| 128 |
+
# text_store = client.create_collection(f"text_collection_pipeline_{user}_{file_name}" )
|
| 129 |
+
# image_store = client.create_collection(f"image_collection_pipeline_{user}_{file_name}" )
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
text_store = QdrantVectorStore( client = client , collection_name=f"text_collection_pipeline_{user}_{file_name}" )
|
| 133 |
+
image_store = QdrantVectorStore(client = client , collection_name=f"image_collection_pipeline_{user}_{file_name}")
|
| 134 |
storage_context = StorageContext.from_defaults(vector_store=text_store, image_store=image_store)
|
| 135 |
+
documents = SimpleDirectoryReader(os.path.join(temp_dir, f"my_own_data_{user}_{file_name}")).load_data()
|
| 136 |
index = MultiModalVectorStoreIndex.from_documents(documents, storage_context=storage_context)
|
| 137 |
+
|
| 138 |
st.session_state.vectordatabase = index
|
| 139 |
else :
|
| 140 |
index = st.session_state.vectordatabase
|
|
|
|
| 170 |
)
|
| 171 |
qa_tmpl = PromptTemplate(qa_tmpl_str)
|
| 172 |
|
| 173 |
+
llm = OpenAI(model="gpt-4o", temperature=0)
|
| 174 |
response_synthesizer = get_response_synthesizer(response_mode="refine", text_qa_template=qa_tmpl, llm=llm)
|
| 175 |
|
| 176 |
response = response_synthesizer.synthesize(query, nodes=retrieval_results)
|
|
|
|
| 183 |
|
| 184 |
return response, retrieved_image_path_list
|
| 185 |
|
| 186 |
+
def process_pdf(pdf_file , user):
|
| 187 |
+
import pdb; pdb.set_trace()
|
| 188 |
temp_dir = tempfile.TemporaryDirectory()
|
| 189 |
temp_pdf_path = os.path.join(temp_dir.name, pdf_file.name)
|
| 190 |
with open(temp_pdf_path, "wb") as f:
|
| 191 |
f.write(pdf_file.getvalue())
|
| 192 |
|
| 193 |
+
data_path = os.path.join(temp_dir.name, f"my_own_data_{user}_{os.path.splitext(pdf_file.name)[0]}")
|
| 194 |
os.makedirs(data_path , exist_ok=True)
|
| 195 |
+
img_save_path = os.path.join(temp_dir.name, f"extracted_images_{user}_{os.path.splitext(pdf_file.name)[0]}")
|
| 196 |
os.makedirs(img_save_path , exist_ok=True)
|
| 197 |
|
| 198 |
extracted_text = extract_text_from_pdf(temp_pdf_path)
|
|
|
|
| 203 |
moved_count = move_images(img_save_path, data_path)
|
| 204 |
remove_low_size_images(data_path)
|
| 205 |
remove_duplicate_images(data_path)
|
| 206 |
+
import pdb; pdb.set_trace()
|
| 207 |
+
retriever_engine = initialize_qdrant(temp_dir.name , os.path.splitext(pdf_file.name)[0] , curr_user)
|
| 208 |
|
| 209 |
return temp_dir, retriever_engine
|
| 210 |
|
|
|
|
| 218 |
st.session_state.vectordatabase = None
|
| 219 |
|
| 220 |
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
|
| 221 |
+
# import pdb; pdb.set_trace()
|
| 222 |
if uploaded_file is None:
|
| 223 |
st.info("Please upload a PDF file.")
|
| 224 |
else:
|
| 225 |
+
# import pdb; pdb.set_trace()
|
| 226 |
st.info(f"Uploaded PDF: {uploaded_file.name}")
|
| 227 |
if st.button("Process PDF"):
|
| 228 |
with st.spinner("Processing PDF..."):
|
| 229 |
+
temp_dir, st.session_state.retriever_engine = process_pdf(uploaded_file , curr_user)
|
| 230 |
|
| 231 |
st.success("PDF processed successfully!")
|
| 232 |
|