"""UI components for the Streamlit app."""
import streamlit as st
import time
import pandas as pd
import io
import re
import csv
# Precompiled regex pattern for extracting data from markdown code blocks.
# It matches a fenced block whose tag is `csv` or `excel`: group 1 captures
# the tag and group 2 the block body. re.DOTALL lets `.` match newlines so the
# (non-greedy) body can span several lines.
_DATA_BLOCK_PATTERN = re.compile(r'```(csv|excel)\n(.*?)\n```', re.DOTALL)
def display_info_icons():
    """Render the introductory help text until it expires.

    On the first call the "displayed" flag and a start timestamp are stored in
    ``st.session_state``. While the flag is set the help text is rendered; the
    flag is then cleared once more than 10 seconds have elapsed since the
    stored timestamp, or once ``st.session_state.messages`` is non-empty.
    """
    state = st.session_state

    # First call in this session: switch the help text on and start the clock.
    if "info_icons_displayed" not in state:
        state.info_icons_displayed = True
        state.info_icons_time = time.time()

    if not state.info_icons_displayed:
        return

    st.markdown(
        """
💻 Enter URL
Fetch webpage content for extraction.
🔍 Specify Data
Define what data you want to extract.
💾 Save Data
Save in JSON, CSV, or Excel format.
🔄 Convert Data
Convert between different formats.
""",
        unsafe_allow_html=True
    )

    # The text is still drawn on this call; the flag only affects later calls.
    expired = time.time() - state.info_icons_time > 10
    if expired or ("messages" in state and len(state.messages) > 0):
        state.info_icons_displayed = False
def extract_data_from_markdown(text: str | bytes | io.BytesIO) -> str | bytes | io.BytesIO | None:
"""Extract data content from markdown code blocks."""
if isinstance(text, io.BytesIO):
return text
if isinstance(text, bytes):
text = text.decode('utf-8')
if match := _DATA_BLOCK_PATTERN.search(text):
data_type = match.group(1)
data = match.group(2).strip()
if data_type == 'excel':
return io.BytesIO(data.encode())
return data
return None
def format_data(data: str | bytes | io.BytesIO, format_type: str) -> pd.DataFrame | None:
"""Format data into a pandas DataFrame."""
try:
if isinstance(data, io.BytesIO):
if format_type == 'excel':
return pd.read_excel(data, engine='openpyxl')
data.seek(0)
return pd.read_csv(data)
elif isinstance(data, bytes):
if format_type == 'excel':
return pd.read_excel(io.BytesIO(data), engine='openpyxl')
return pd.read_csv(io.BytesIO(data))
else:
if format_type == 'csv':
csv_data = list(csv.reader(io.StringIO(data)))
if not csv_data:
raise ValueError("Empty CSV data")
max_columns = max(len(row) for row in csv_data)
padded_data = [row + [''] * (max_columns - len(row)) for row in csv_data]
headers = padded_data[0]
unique_headers = []
for i, header in enumerate(headers):
if header == '' or header in unique_headers:
unique_headers.append(f'Column_{i+1}')
else:
unique_headers.append(header)
df = pd.DataFrame(padded_data[1:], columns=unique_headers)
# Remove empty columns
return df.loc[:, (df != '').any(axis=0)]
elif format_type == 'excel':
return pd.read_excel(io.BytesIO(data.encode()), engine='openpyxl')
except Exception as e:
st.error(f"Error formatting data: {str(e)}")
return None
def display_message(message):
    """Render one chat message, showing embedded tabular data as a table.

    Args:
        message: Mapping with a ``"content"`` entry. ``str``, ``bytes`` and
            ``io.BytesIO`` content is searched for a data block via
            ``extract_data_from_markdown``; any other content is rendered as
            markdown after conversion with ``str``.

    Raises:
        KeyError: if *message* has no ``"content"`` key.
    """
    content = message["content"]

    if not isinstance(content, (str, bytes, io.BytesIO)):
        st.markdown(str(content))
        return

    data = extract_data_from_markdown(content)
    if data is None:
        # No data block: show the content as ordinary markdown.
        st.markdown(content)
        return

    # extract_data_from_markdown returns a BytesIO exactly when the payload is
    # an `excel` block (or was a BytesIO to begin with), so the type alone
    # decides the format. Searching the whole message for the word "excel"
    # would misclassify CSV blocks in messages that merely mention Excel.
    format_type = 'excel' if isinstance(data, io.BytesIO) else 'csv'
    df = format_data(data, format_type)

    if df is not None:
        st.dataframe(df)
        return

    st.warning("Failed to display data as a table. Showing raw content:")
    st.code(content)