prernajeet01 committed on
Commit
140ce43
·
verified ·
1 Parent(s): a3b1ba3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -14
app.py CHANGED
@@ -83,11 +83,7 @@ def init_astra_db():
83
  astra_db_endpoint = os.getenv("ASTRA_DB_ENDPOINT", "https://8e3fd85c-5f28-4e1f-8538-9dd28a3ea2b0-us-east-2.apps.astra.datastax.com")
84
 
85
  # Initialize the client
86
- client = DataAPIClient(astra_db_application_token)
87
- db = client.get_database_by_api_endpoint(
88
- astra_db_endpoint,
89
- keyspace=astra_db_keyspace
90
- )
91
 
92
  # Get or create collections
93
  product_embeddings = db.get_collection("product_embeddings")
@@ -259,6 +255,57 @@ def process_pdf_catalogs():
259
  print(f"Error processing PDF catalogs: {e}")
260
  return {"status": "error", "message": str(e)}
261
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
  # Function to store text chunks in Astra DB with embeddings
263
  def store_chunks_in_db(chunks, product_type):
264
  """Store text chunks with embeddings in Astra DB"""
@@ -648,22 +695,30 @@ def setup_and_update():
648
 
649
  # Initialize database and other services
650
  global astra_session, astra_keyspace, s3_client, embeddings_model
651
- astra_db_result = init_astra_db()
652
- if astra_db_result:
653
- astra_session = astra_db_result.get("db")
654
- astra_keyspace = astra_db_result.get("keyspace")
 
655
  else:
656
  astra_session = None
657
  astra_keyspace = None
658
-
659
  s3_client = init_s3_client()
660
  embeddings_model = get_embeddings_model()
661
 
662
  # Return status
663
- if openai_initialized and mistral_initialized:
664
- return "System is ready. You can start chatting!"
665
- else:
666
- return "System initialization incomplete. Some features may not work properly."
 
 
 
 
 
 
 
667
 
668
  def create_gradio_app():
669
  # Define CSS styles for a more modern, appealing interface
@@ -824,7 +879,24 @@ def create_gradio_app():
824
  s3_bucket = gr.Textbox(label="S3 Bucket Name")
825
  s3_prefix = gr.Textbox(label="S3 Prefix (folder)", value="catalogs/")
826
  process_btn = gr.Button("Process PDFs from S3", elem_classes="action-button")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
827
  result_text = gr.Textbox(label="Processing Result")
 
828
 
829
  # Set up event handlers
830
  send_btn.click(
@@ -855,6 +927,22 @@ def create_gradio_app():
855
  api_name="process_pdfs"
856
  )
857
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
858
  # Add the system setup to run when the app loads
859
  app.load(setup_and_update, None, status_display)
860
 
 
83
  astra_db_endpoint = os.getenv("ASTRA_DB_ENDPOINT", "https://8e3fd85c-5f28-4e1f-8538-9dd28a3ea2b0-us-east-2.apps.astra.datastax.com")
84
 
85
  # Initialize the client
86
+ client = DataAPIClient(api_endpoint=astra_db_endpoint, token=astra_db_application_token)
 
 
 
 
87
 
88
  # Get or create collections
89
  product_embeddings = db.get_collection("product_embeddings")
 
255
  print(f"Error processing PDF catalogs: {e}")
256
  return {"status": "error", "message": str(e)}
257
 
258
+ # Download a PDF catalog from a URL and process it (text chunks and images)
259
def process_pdf_from_url(url):
    """Download a PDF from a URL, split its text into chunks and index it.

    The product type is guessed from the URL: the first of a fixed list of
    keywords found in the lower-cased URL is used (underscores replaced by
    spaces); otherwise the type is "other".

    When the module-level ``astra_session`` is truthy, the text chunks are
    passed to ``store_chunks_in_db`` and the raw PDF bytes are passed to
    ``extract_images_from_pdf``, whose return value is reported as the image
    count (presumably an int -- confirm against that helper).

    Args:
        url: HTTP(S) URL of the PDF to download.

    Returns:
        A human-readable status string. The function never raises; every
        failure is reported through the returned message.
    """
    # Reject a missing or blank URL up front with a clear message instead of
    # letting the HTTP layer fail with a less helpful one.
    if not url or not url.strip():
        return "Error processing PDF: no URL provided"
    url = url.strip()

    try:
        # A timeout keeps a stalled server from hanging the UI handler
        # forever. The whole body is needed in memory anyway, so the response
        # is not streamed; this also releases the connection on early return.
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            return f"Error downloading PDF: HTTP status code {response.status_code}"

        pdf_content = response.content

        # Determine product type from the URL; first matching keyword wins.
        lowered_url = url.lower()
        known_types = ("circuit_breaker", "motor_starter", "contactor", "switch", "relay")
        product_type = next(
            (pt.replace("_", " ") for pt in known_types if pt in lowered_url),
            "other",
        )

        # Extract text page by page. extract_text() may yield nothing for
        # pages without a text layer, so fall back to an empty string.
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))
        text_content = "".join(
            (page.extract_text() or "") + "\n\n" for page in pdf_reader.pages
        )

        # Split text into smaller, overlapping chunks for embedding.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
        )
        chunks = text_splitter.split_text(text_content)

        # Persist chunks and images only when a database session is available.
        images_count = 0
        if astra_session:
            store_chunks_in_db(chunks, product_type)
            images_count = extract_images_from_pdf(pdf_content, product_type)

        print(f"Processed PDF from URL: {url}: {len(chunks)} text chunks and {images_count} images extracted")
        return f"Successfully processed PDF from URL: {len(chunks)} chunks, {images_count} images"

    except Exception as e:
        # UI boundary: report the failure to the user rather than crash.
        print(f"Error processing PDF from URL: {e}")
        return f"Error processing PDF: {str(e)}"
308
+
309
  # Function to store text chunks in Astra DB with embeddings
310
  def store_chunks_in_db(chunks, product_type):
311
  """Store text chunks with embeddings in Astra DB"""
 
695
 
696
  # Initialize database and other services
697
  global astra_session, astra_keyspace, s3_client, embeddings_model
698
+ astra_result = init_astra_db()
699
+
700
+ if astra_result:
701
+ astra_session = astra_result.get("db")
702
+ astra_keyspace = astra_result.get("keyspace")
703
  else:
704
  astra_session = None
705
  astra_keyspace = None
706
+
707
  s3_client = init_s3_client()
708
  embeddings_model = get_embeddings_model()
709
 
710
  # Return status
711
+ status_msg = "System is ready. "
712
+ if not openai_initialized:
713
+ status_msg += "OpenAI API not initialized. "
714
+ if not mistral_initialized:
715
+ status_msg += "Mistral API not initialized. "
716
+ if not astra_session:
717
+ status_msg += "Astra DB not connected. "
718
+ if not s3_client:
719
+ status_msg += "S3 client not initialized. "
720
+
721
+ return status_msg
722
 
723
  def create_gradio_app():
724
  # Define CSS styles for a more modern, appealing interface
 
879
  s3_bucket = gr.Textbox(label="S3 Bucket Name")
880
  s3_prefix = gr.Textbox(label="S3 Prefix (folder)", value="catalogs/")
881
  process_btn = gr.Button("Process PDFs from S3", elem_classes="action-button")
882
+
883
+ # Add direct PDF URL input
884
+ with gr.Tab("Direct PDF URLs"):
885
+ pdf_url = gr.Textbox(label="PDF URL", placeholder="https://example.com/sample.pdf")
886
+ pdf_dropdown = gr.Dropdown(
887
+ label="ABB Catalog PDFs",
888
+ choices=[
889
+ "https://agent-product-discovery.s3.ap-south-1.amazonaws.com/ABB-catalog/ABB+Ability%E2%84%A2+System+800xA%C2%AE+6.2.pdf",
890
+ "https://agent-product-discovery.s3.ap-south-1.amazonaws.com/ABB-catalog/Enclosed+Softstarters.pdf",
891
+ "https://agent-product-discovery.s3.ap-south-1.amazonaws.com/ABB-catalog/Ex-Solutions.pdf",
892
+ "https://agent-product-discovery.s3.ap-south-1.amazonaws.com/ABB-catalog/Low_power_UPS_catalogue_EN.pdf"
893
+ ],
894
+ interactive=True
895
+ )
896
+ process_url_btn = gr.Button("Process PDF from URL", elem_classes="action-button")
897
+
898
  result_text = gr.Textbox(label="Processing Result")
899
+
900
 
901
  # Set up event handlers
902
  send_btn.click(
 
927
  api_name="process_pdfs"
928
  )
929
 
930
+ # Add this event handler
931
+ process_url_btn.click(
932
+ process_pdf_from_url,
933
+ [pdf_url],
934
+ [result_text],
935
+ api_name="process_pdf_url"
936
+ )
937
+
938
+ # Add this dropdown change event
939
+ pdf_dropdown.change(
940
+ lambda x: x,
941
+ [pdf_dropdown],
942
+ [pdf_url],
943
+ api_name="update_pdf_url"
944
+ )
945
+
946
  # Add the system setup to run when the app loads
947
  app.load(setup_and_update, None, status_display)
948