Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -83,11 +83,7 @@ def init_astra_db():
|
|
| 83 |
astra_db_endpoint = os.getenv("ASTRA_DB_ENDPOINT", "https://8e3fd85c-5f28-4e1f-8538-9dd28a3ea2b0-us-east-2.apps.astra.datastax.com")
|
| 84 |
|
| 85 |
# Initialize the client
|
| 86 |
-
client = DataAPIClient(astra_db_application_token)
|
| 87 |
-
db = client.get_database_by_api_endpoint(
|
| 88 |
-
astra_db_endpoint,
|
| 89 |
-
keyspace=astra_db_keyspace
|
| 90 |
-
)
|
| 91 |
|
| 92 |
# Get or create collections
|
| 93 |
product_embeddings = db.get_collection("product_embeddings")
|
|
@@ -259,6 +255,57 @@ def process_pdf_catalogs():
|
|
| 259 |
print(f"Error processing PDF catalogs: {e}")
|
| 260 |
return {"status": "error", "message": str(e)}
|
| 261 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
# Function to store text chunks in Astra DB with embeddings
|
| 263 |
def store_chunks_in_db(chunks, product_type):
|
| 264 |
"""Store text chunks with embeddings in Astra DB"""
|
|
@@ -648,22 +695,30 @@ def setup_and_update():
|
|
| 648 |
|
| 649 |
# Initialize database and other services
|
| 650 |
global astra_session, astra_keyspace, s3_client, embeddings_model
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
|
|
|
| 655 |
else:
|
| 656 |
astra_session = None
|
| 657 |
astra_keyspace = None
|
| 658 |
-
|
| 659 |
s3_client = init_s3_client()
|
| 660 |
embeddings_model = get_embeddings_model()
|
| 661 |
|
| 662 |
# Return status
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
| 666 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 667 |
|
| 668 |
def create_gradio_app():
|
| 669 |
# Define CSS styles for a more modern, appealing interface
|
|
@@ -824,7 +879,24 @@ def create_gradio_app():
|
|
| 824 |
s3_bucket = gr.Textbox(label="S3 Bucket Name")
|
| 825 |
s3_prefix = gr.Textbox(label="S3 Prefix (folder)", value="catalogs/")
|
| 826 |
process_btn = gr.Button("Process PDFs from S3", elem_classes="action-button")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 827 |
result_text = gr.Textbox(label="Processing Result")
|
|
|
|
| 828 |
|
| 829 |
# Set up event handlers
|
| 830 |
send_btn.click(
|
|
@@ -855,6 +927,22 @@ def create_gradio_app():
|
|
| 855 |
api_name="process_pdfs"
|
| 856 |
)
|
| 857 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 858 |
# Add the system setup to run when the app loads
|
| 859 |
app.load(setup_and_update, None, status_display)
|
| 860 |
|
|
|
|
| 83 |
astra_db_endpoint = os.getenv("ASTRA_DB_ENDPOINT", "https://8e3fd85c-5f28-4e1f-8538-9dd28a3ea2b0-us-east-2.apps.astra.datastax.com")
|
| 84 |
|
| 85 |
# Initialize the client
|
| 86 |
+
client = DataAPIClient(api_endpoint=astra_db_endpoint, token=astra_db_application_token)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
# Get or create collections
|
| 89 |
product_embeddings = db.get_collection("product_embeddings")
|
|
|
|
| 255 |
print(f"Error processing PDF catalogs: {e}")
|
| 256 |
return {"status": "error", "message": str(e)}
|
| 257 |
|
| 258 |
+
# Add this function to process PDFs from URLs
|
| 259 |
+
def process_pdf_from_url(url):
|
| 260 |
+
"""Download and process a PDF from a URL"""
|
| 261 |
+
try:
|
| 262 |
+
# Download the PDF
|
| 263 |
+
response = requests.get(url, stream=True)
|
| 264 |
+
if response.status_code != 200:
|
| 265 |
+
return f"Error downloading PDF: HTTP status code {response.status_code}"
|
| 266 |
+
|
| 267 |
+
# Get the content
|
| 268 |
+
pdf_content = response.content
|
| 269 |
+
|
| 270 |
+
# Determine product type from URL or filename
|
| 271 |
+
product_type = "other"
|
| 272 |
+
for pt in ["circuit_breaker", "motor_starter", "contactor", "switch", "relay"]:
|
| 273 |
+
if pt in url.lower():
|
| 274 |
+
product_type = pt.replace("_", " ")
|
| 275 |
+
break
|
| 276 |
+
|
| 277 |
+
# Process PDF text content
|
| 278 |
+
pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_content))
|
| 279 |
+
text_content = ""
|
| 280 |
+
|
| 281 |
+
# Extract text from each page
|
| 282 |
+
for page in pdf_reader.pages:
|
| 283 |
+
text_content += page.extract_text() + "\n\n"
|
| 284 |
+
|
| 285 |
+
# Split text into smaller chunks for efficient embedding
|
| 286 |
+
text_splitter = RecursiveCharacterTextSplitter(
|
| 287 |
+
chunk_size=1000,
|
| 288 |
+
chunk_overlap=200,
|
| 289 |
+
length_function=len,
|
| 290 |
+
)
|
| 291 |
+
chunks = text_splitter.split_text(text_content)
|
| 292 |
+
|
| 293 |
+
# Store chunks in vector database (if available)
|
| 294 |
+
if astra_session:
|
| 295 |
+
store_chunks_in_db(chunks, product_type)
|
| 296 |
+
|
| 297 |
+
# Extract and store images (if database available)
|
| 298 |
+
images_count = 0
|
| 299 |
+
if astra_session:
|
| 300 |
+
images_count = extract_images_from_pdf(pdf_content, product_type)
|
| 301 |
+
|
| 302 |
+
print(f"Processed PDF from URL: {url}: {len(chunks)} text chunks and {images_count} images extracted")
|
| 303 |
+
return f"Successfully processed PDF from URL: {len(chunks)} chunks, {images_count} images"
|
| 304 |
+
|
| 305 |
+
except Exception as e:
|
| 306 |
+
print(f"Error processing PDF from URL: {e}")
|
| 307 |
+
return f"Error processing PDF: {str(e)}"
|
| 308 |
+
|
| 309 |
# Function to store text chunks in Astra DB with embeddings
|
| 310 |
def store_chunks_in_db(chunks, product_type):
|
| 311 |
"""Store text chunks with embeddings in Astra DB"""
|
|
|
|
| 695 |
|
| 696 |
# Initialize database and other services
|
| 697 |
global astra_session, astra_keyspace, s3_client, embeddings_model
|
| 698 |
+
astra_result = init_astra_db()
|
| 699 |
+
|
| 700 |
+
if astra_result:
|
| 701 |
+
astra_session = astra_result.get("db")
|
| 702 |
+
astra_keyspace = astra_result.get("keyspace")
|
| 703 |
else:
|
| 704 |
astra_session = None
|
| 705 |
astra_keyspace = None
|
| 706 |
+
|
| 707 |
s3_client = init_s3_client()
|
| 708 |
embeddings_model = get_embeddings_model()
|
| 709 |
|
| 710 |
# Return status
|
| 711 |
+
status_msg = "System is ready. "
|
| 712 |
+
if not openai_initialized:
|
| 713 |
+
status_msg += "OpenAI API not initialized. "
|
| 714 |
+
if not mistral_initialized:
|
| 715 |
+
status_msg += "Mistral API not initialized. "
|
| 716 |
+
if not astra_session:
|
| 717 |
+
status_msg += "Astra DB not connected. "
|
| 718 |
+
if not s3_client:
|
| 719 |
+
status_msg += "S3 client not initialized. "
|
| 720 |
+
|
| 721 |
+
return status_msg
|
| 722 |
|
| 723 |
def create_gradio_app():
|
| 724 |
# Define CSS styles for a more modern, appealing interface
|
|
|
|
| 879 |
s3_bucket = gr.Textbox(label="S3 Bucket Name")
|
| 880 |
s3_prefix = gr.Textbox(label="S3 Prefix (folder)", value="catalogs/")
|
| 881 |
process_btn = gr.Button("Process PDFs from S3", elem_classes="action-button")
|
| 882 |
+
|
| 883 |
+
# Add direct PDF URL input
|
| 884 |
+
with gr.Tab("Direct PDF URLs"):
|
| 885 |
+
pdf_url = gr.Textbox(label="PDF URL", placeholder="https://example.com/sample.pdf")
|
| 886 |
+
pdf_dropdown = gr.Dropdown(
|
| 887 |
+
label="ABB Catalog PDFs",
|
| 888 |
+
choices=[
|
| 889 |
+
"https://agent-product-discovery.s3.ap-south-1.amazonaws.com/ABB-catalog/ABB+Ability%E2%84%A2+System+800xA%C2%AE+6.2.pdf",
|
| 890 |
+
"https://agent-product-discovery.s3.ap-south-1.amazonaws.com/ABB-catalog/Enclosed+Softstarters.pdf",
|
| 891 |
+
"https://agent-product-discovery.s3.ap-south-1.amazonaws.com/ABB-catalog/Ex-Solutions.pdf",
|
| 892 |
+
"https://agent-product-discovery.s3.ap-south-1.amazonaws.com/ABB-catalog/Low_power_UPS_catalogue_EN.pdf"
|
| 893 |
+
],
|
| 894 |
+
interactive=True
|
| 895 |
+
)
|
| 896 |
+
process_url_btn = gr.Button("Process PDF from URL", elem_classes="action-button")
|
| 897 |
+
|
| 898 |
result_text = gr.Textbox(label="Processing Result")
|
| 899 |
+
|
| 900 |
|
| 901 |
# Set up event handlers
|
| 902 |
send_btn.click(
|
|
|
|
| 927 |
api_name="process_pdfs"
|
| 928 |
)
|
| 929 |
|
| 930 |
+
# Add this event handler
|
| 931 |
+
process_url_btn.click(
|
| 932 |
+
process_pdf_from_url,
|
| 933 |
+
[pdf_url],
|
| 934 |
+
[result_text],
|
| 935 |
+
api_name="process_pdf_url"
|
| 936 |
+
)
|
| 937 |
+
|
| 938 |
+
# Add this dropdown change event
|
| 939 |
+
pdf_dropdown.change(
|
| 940 |
+
lambda x: x,
|
| 941 |
+
[pdf_dropdown],
|
| 942 |
+
[pdf_url],
|
| 943 |
+
api_name="update_pdf_url"
|
| 944 |
+
)
|
| 945 |
+
|
| 946 |
# Add the system setup to run when the app loads
|
| 947 |
app.load(setup_and_update, None, status_display)
|
| 948 |
|