Spaces:
Build error
Build error
Upload folder using huggingface_hub
Browse files
- app.py +52 -30
- qdrant_mm_db_pipeline/.lock +1 -0
- qdrant_mm_db_pipeline/meta.json +1 -0
app.py
CHANGED
|
@@ -19,6 +19,8 @@ from llama_index.llms.openai import OpenAI
|
|
| 19 |
from llama_index.core import load_index_from_storage, get_response_synthesizer
|
| 20 |
import tempfile
|
| 21 |
|
|
|
|
|
|
|
| 22 |
|
| 23 |
def extract_text_from_pdf(pdf_path):
|
| 24 |
reader = PdfReader(pdf_path)
|
|
@@ -72,11 +74,31 @@ def remove_low_size_images(data_path):
|
|
| 72 |
for one_image in low_size_photo_list[1:]:
|
| 73 |
os.remove(os.path.join(data_path, one_image))
|
| 74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
def initialize_qdrant(temp_dir):
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
if "vectordatabase" not in st.session_state or not st.session_state.vectordatabase:
|
| 81 |
text_store = QdrantVectorStore(client=client, collection_name="text_collection_pipeline")
|
| 82 |
image_store = QdrantVectorStore(client=client, collection_name="image_collection_pipeline")
|
|
@@ -150,7 +172,7 @@ def process_pdf(pdf_file):
|
|
| 150 |
extract_images_from_pdf(temp_pdf_path, img_save_path)
|
| 151 |
moved_count = move_images(img_save_path, data_path)
|
| 152 |
remove_low_size_images(data_path)
|
| 153 |
-
|
| 154 |
retriever_engine = initialize_qdrant(temp_dir.name)
|
| 155 |
|
| 156 |
return temp_dir, retriever_engine
|
|
@@ -175,32 +197,32 @@ def main():
|
|
| 175 |
|
| 176 |
st.success("PDF processed successfully!")
|
| 177 |
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
if st.button("Ask Question"):
|
| 183 |
-
print("running")
|
| 184 |
-
try:
|
| 185 |
-
import pdb; pdb.set_trace()
|
| 186 |
-
|
| 187 |
-
with st.spinner("Retrieving information..."):
|
| 188 |
-
import pdb; pdb.set_trace()
|
| 189 |
-
response, retrieved_image_path_list = retrieve_and_query(query, st.session_state.retriever_engine)
|
| 190 |
-
|
| 191 |
-
st.write("Retrieved Context:")
|
| 192 |
-
for node in response.source_nodes:
|
| 193 |
-
st.code(node.node.get_text())
|
| 194 |
-
|
| 195 |
-
st.write("\nRetrieved Images:")
|
| 196 |
-
plot_images(retrieved_image_path_list)
|
| 197 |
-
st.pyplot()
|
| 198 |
-
|
| 199 |
-
st.write("\nFinal Answer:")
|
| 200 |
-
st.code(response.response)
|
| 201 |
|
| 202 |
-
|
| 203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
|
| 205 |
if __name__ == "__main__":
|
| 206 |
main()
|
|
|
|
| 19 |
from llama_index.core import load_index_from_storage, get_response_synthesizer
|
| 20 |
import tempfile
|
| 21 |
|
| 22 |
+
from dotenv import load_dotenv
|
| 23 |
+
load_dotenv()
|
| 24 |
|
| 25 |
def extract_text_from_pdf(pdf_path):
|
| 26 |
reader = PdfReader(pdf_path)
|
|
|
|
| 74 |
for one_image in low_size_photo_list[1:]:
|
| 75 |
os.remove(os.path.join(data_path, one_image))
|
| 76 |
|
| 77 |
+
def remove_duplicate_images(data_path):
    """Delete images in ``data_path`` that duplicate an earlier image.

    Files whose names end with ``jpeg``, ``png`` or ``jpg`` are sorted by
    name and compared pairwise with ``calc_diff`` (defined elsewhere in this
    file). When the reported difference is ``0`` the later file of the pair
    is deleted, so the first file (in sorted order) of each group of
    identical images is the one that is kept.

    Args:
        data_path: Directory containing the extracted images.

    Returns:
        None. Matching files are removed from disk as a side effect.
    """
    image_names = sorted(
        name
        for name in os.listdir(data_path)
        if name.endswith(("jpeg", "png", "jpg"))
    )

    # Names already deleted in this run; they must not be used as either
    # side of a later comparison because the file no longer exists.
    removed = set()

    for index, kept_name in enumerate(image_names):
        if kept_name in removed:
            continue
        kept_path = os.path.join(data_path, kept_name)

        # Only look at later names: each unordered pair is compared once.
        for candidate_name in image_names[index + 1:]:
            if candidate_name in removed:
                continue
            candidate_path = os.path.join(data_path, candidate_name)
            try:
                if calc_diff(kept_path, candidate_path) == 0:
                    os.remove(candidate_path)
                    removed.add(candidate_name)
            except Exception as e:
                # Best effort: a failed comparison or delete on one pair
                # should not abort the rest of the clean-up.
                print(e)
|
| 96 |
+
|
| 97 |
def initialize_qdrant(temp_dir):
|
| 98 |
+
|
| 99 |
+
client = qdrant_client.QdrantClient(path="qdrant_mm_db_pipeline")
|
| 100 |
+
|
| 101 |
+
|
| 102 |
if "vectordatabase" not in st.session_state or not st.session_state.vectordatabase:
|
| 103 |
text_store = QdrantVectorStore(client=client, collection_name="text_collection_pipeline")
|
| 104 |
image_store = QdrantVectorStore(client=client, collection_name="image_collection_pipeline")
|
|
|
|
| 172 |
extract_images_from_pdf(temp_pdf_path, img_save_path)
|
| 173 |
moved_count = move_images(img_save_path, data_path)
|
| 174 |
remove_low_size_images(data_path)
|
| 175 |
+
remove_duplicate_images(data_path)
|
| 176 |
retriever_engine = initialize_qdrant(temp_dir.name)
|
| 177 |
|
| 178 |
return temp_dir, retriever_engine
|
|
|
|
| 197 |
|
| 198 |
st.success("PDF processed successfully!")
|
| 199 |
|
| 200 |
+
if st.session_state.retriever_engine :
|
| 201 |
+
query = st.text_input("Enter your question:")
|
| 202 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
|
| 204 |
+
if st.button("Ask Question"):
|
| 205 |
+
print("running")
|
| 206 |
+
try:
|
| 207 |
+
import pdb; pdb.set_trace()
|
| 208 |
+
|
| 209 |
+
with st.spinner("Retrieving information..."):
|
| 210 |
+
import pdb; pdb.set_trace()
|
| 211 |
+
response, retrieved_image_path_list = retrieve_and_query(query, st.session_state.retriever_engine)
|
| 212 |
+
|
| 213 |
+
st.write("Retrieved Context:")
|
| 214 |
+
for node in response.source_nodes:
|
| 215 |
+
st.code(node.node.get_text())
|
| 216 |
+
|
| 217 |
+
st.write("\nRetrieved Images:")
|
| 218 |
+
plot_images(retrieved_image_path_list)
|
| 219 |
+
st.pyplot()
|
| 220 |
+
|
| 221 |
+
st.write("\nFinal Answer:")
|
| 222 |
+
st.code(response.response)
|
| 223 |
+
|
| 224 |
+
except Exception as e:
|
| 225 |
+
st.error(f"An error occurred: {e}")
|
| 226 |
|
| 227 |
if __name__ == "__main__":
|
| 228 |
main()
|
qdrant_mm_db_pipeline/.lock
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
tmp lock file
|
qdrant_mm_db_pipeline/meta.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"collections": {}, "aliases": {}}
|