Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -89,8 +89,10 @@ class WebpageContentProcessor:
|
|
| 89 |
structured_chunks = []
|
| 90 |
for i, node in enumerate(nodes):
|
| 91 |
content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
|
|
|
|
|
|
|
| 92 |
title_match = re.match(r"^(#+)\s*(.*)", content)
|
| 93 |
-
title = title_match.group(2).strip() if title_match else (content.split('\n')[0][:70] + "...")
|
| 94 |
structured_chunks.append({"id": i, "title": title, "content": content})
|
| 95 |
return structured_chunks
|
| 96 |
|
|
@@ -159,6 +161,7 @@ class ChunkManager:
|
|
| 159 |
|
| 160 |
def delete_chunk(self, chunk_id: int):
|
| 161 |
self._chunks = [c for c in self._chunks if c["id"] != chunk_id]
|
|
|
|
| 162 |
for i, chunk in enumerate(self._chunks):
|
| 163 |
chunk['id'] = i
|
| 164 |
|
|
@@ -172,19 +175,26 @@ class ChunkManager:
|
|
| 172 |
self.target_grade_max = grade_max
|
| 173 |
self.target_min_chunk_words = min_words
|
| 174 |
self.target_max_chunk_words = max_words
|
| 175 |
-
|
|
|
|
| 176 |
|
|
|
|
| 177 |
st.set_page_config(layout="wide", page_title="Webpage Content Editor")
|
| 178 |
|
| 179 |
-
# Initialize session state variables
|
| 180 |
-
|
| 181 |
-
st.session_state
|
| 182 |
-
|
| 183 |
-
st.session_state
|
| 184 |
-
|
| 185 |
-
st.session_state
|
| 186 |
-
|
| 187 |
-
st.session_state
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
|
| 189 |
processor = st.session_state.content_processor
|
| 190 |
manager = st.session_state.chunk_manager
|
|
@@ -198,29 +208,31 @@ st.info(
|
|
| 198 |
icon="ℹ️"
|
| 199 |
)
|
| 200 |
|
| 201 |
-
url_input = st.text_input("Enter a webpage URL to begin", key="
|
| 202 |
|
| 203 |
if st.button("Process URL", use_container_width=True):
|
| 204 |
-
|
|
|
|
| 205 |
with st.spinner("Fetching and processing content..."):
|
| 206 |
-
markdown = processor.fetch_and_convert_to_markdown(url_input)
|
| 207 |
if "Error" in markdown:
|
| 208 |
st.session_state.status_message = markdown
|
| 209 |
manager.set_chunks([])
|
|
|
|
| 210 |
else:
|
| 211 |
chunks = processor.parse_markdown_into_chunks(markdown)
|
| 212 |
manager.set_chunks(chunks)
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
st.rerun()
|
| 220 |
|
| 221 |
if st.session_state.status_message:
|
| 222 |
st.toast(st.session_state.status_message)
|
| 223 |
-
st.session_state.status_message = ""
|
| 224 |
|
| 225 |
tab1, tab2 = st.tabs(["Chunk Editor", "Settings & Overview"])
|
| 226 |
|
|
@@ -229,23 +241,28 @@ with tab1:
|
|
| 229 |
if not chunks:
|
| 230 |
st.write("Process a URL to start editing chunks.")
|
| 231 |
else:
|
|
|
|
| 232 |
# Ensure selected_chunk_id is valid
|
| 233 |
-
if st.session_state.selected_chunk_id not in
|
| 234 |
-
st.session_state.selected_chunk_id =
|
| 235 |
|
| 236 |
if st.session_state.selected_chunk_id is not None:
|
| 237 |
chunk_options = {c['id']: f"Chunk {c['id']}: {c['title']}" for c in chunks}
|
| 238 |
|
| 239 |
-
#
|
| 240 |
-
# When
|
| 241 |
-
st.selectbox(
|
| 242 |
"Select a chunk to edit",
|
| 243 |
-
options=
|
| 244 |
format_func=lambda x: chunk_options.get(x, "Invalid Chunk"),
|
| 245 |
-
|
| 246 |
-
index=list(chunk_options.keys()).index(st.session_state.selected_chunk_id)
|
| 247 |
)
|
| 248 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
selected_chunk = manager.get_chunk_by_id(st.session_state.selected_chunk_id)
|
| 250 |
|
| 251 |
if selected_chunk:
|
|
@@ -255,7 +272,7 @@ with tab1:
|
|
| 255 |
"Chunk Content",
|
| 256 |
value=selected_chunk['content'],
|
| 257 |
height=300,
|
| 258 |
-
key=f"editor_{selected_chunk['id']}" # Unique key forces re-render
|
| 259 |
)
|
| 260 |
|
| 261 |
col1, col2, _ = st.columns([1, 1, 4])
|
|
@@ -265,10 +282,8 @@ with tab1:
|
|
| 265 |
st.rerun()
|
| 266 |
|
| 267 |
if col2.button("Delete Chunk", use_container_width=True, key=f"delete_{selected_chunk['id']}"):
|
| 268 |
-
|
| 269 |
-
manager.delete_chunk(old_id)
|
| 270 |
st.session_state.status_message = "Chunk deleted!"
|
| 271 |
-
# Select the next available chunk or none if empty
|
| 272 |
remaining_chunks = manager.get_chunks()
|
| 273 |
st.session_state.selected_chunk_id = remaining_chunks[0]['id'] if remaining_chunks else None
|
| 274 |
st.rerun()
|
|
|
|
| 89 |
structured_chunks = []
|
| 90 |
for i, node in enumerate(nodes):
|
| 91 |
content = node.get_content(metadata_mode=MetadataMode.NONE).strip()
|
| 92 |
+
if not content:
|
| 93 |
+
continue
|
| 94 |
title_match = re.match(r"^(#+)\s*(.*)", content)
|
| 95 |
+
title = title_match.group(2).strip() if title_match and title_match.group(2).strip() else (content.split('\n')[0][:70] + "...")
|
| 96 |
structured_chunks.append({"id": i, "title": title, "content": content})
|
| 97 |
return structured_chunks
|
| 98 |
|
|
|
|
| 161 |
|
| 162 |
def delete_chunk(self, chunk_id: int):
|
| 163 |
self._chunks = [c for c in self._chunks if c["id"] != chunk_id]
|
| 164 |
+
# Re-index remaining chunks to maintain sequential IDs
|
| 165 |
for i, chunk in enumerate(self._chunks):
|
| 166 |
chunk['id'] = i
|
| 167 |
|
|
|
|
| 175 |
self.target_grade_max = grade_max
|
| 176 |
self.target_min_chunk_words = min_words
|
| 177 |
self.target_max_chunk_words = max_words
|
| 178 |
+
# Recalculate stats for all chunks with new targets
|
| 179 |
+
self.set_chunks(self.get_chunks())
|
| 180 |
|
| 181 |
+
# --- Streamlit UI ---
|
| 182 |
st.set_page_config(layout="wide", page_title="Webpage Content Editor")
|
| 183 |
|
| 184 |
+
# Initialize session state variables if they don't exist
|
| 185 |
+
def init_session_state():
|
| 186 |
+
if 'chunk_manager' not in st.session_state:
|
| 187 |
+
st.session_state.chunk_manager = ChunkManager()
|
| 188 |
+
if 'content_processor' not in st.session_state:
|
| 189 |
+
st.session_state.content_processor = WebpageContentProcessor()
|
| 190 |
+
if 'selected_chunk_id' not in st.session_state:
|
| 191 |
+
st.session_state.selected_chunk_id = None
|
| 192 |
+
if 'status_message' not in st.session_state:
|
| 193 |
+
st.session_state.status_message = ""
|
| 194 |
+
if 'url_input' not in st.session_state:
|
| 195 |
+
st.session_state.url_input = ""
|
| 196 |
+
|
| 197 |
+
init_session_state()
|
| 198 |
|
| 199 |
processor = st.session_state.content_processor
|
| 200 |
manager = st.session_state.chunk_manager
|
|
|
|
| 208 |
icon="ℹ️"
|
| 209 |
)
|
| 210 |
|
| 211 |
+
url_input = st.text_input("Enter a webpage URL to begin", value=st.session_state.url_input, key="url_input_widget")
|
| 212 |
|
| 213 |
if st.button("Process URL", use_container_width=True):
|
| 214 |
+
st.session_state.url_input = st.session_state.url_input_widget
|
| 215 |
+
if st.session_state.url_input:
|
| 216 |
with st.spinner("Fetching and processing content..."):
|
| 217 |
+
markdown = processor.fetch_and_convert_to_markdown(st.session_state.url_input)
|
| 218 |
if "Error" in markdown:
|
| 219 |
st.session_state.status_message = markdown
|
| 220 |
manager.set_chunks([])
|
| 221 |
+
st.session_state.selected_chunk_id = None
|
| 222 |
else:
|
| 223 |
chunks = processor.parse_markdown_into_chunks(markdown)
|
| 224 |
manager.set_chunks(chunks)
|
| 225 |
+
if chunks:
|
| 226 |
+
st.session_state.status_message = f"Successfully processed {len(chunks)} chunks."
|
| 227 |
+
st.session_state.selected_chunk_id = chunks[0]['id']
|
| 228 |
+
else:
|
| 229 |
+
st.session_state.status_message = "Could not extract content chunks."
|
| 230 |
+
st.session_state.selected_chunk_id = None
|
| 231 |
st.rerun()
|
| 232 |
|
| 233 |
if st.session_state.status_message:
|
| 234 |
st.toast(st.session_state.status_message)
|
| 235 |
+
st.session_state.status_message = ""
|
| 236 |
|
| 237 |
tab1, tab2 = st.tabs(["Chunk Editor", "Settings & Overview"])
|
| 238 |
|
|
|
|
| 241 |
if not chunks:
|
| 242 |
st.write("Process a URL to start editing chunks.")
|
| 243 |
else:
|
| 244 |
+
chunk_ids = [c['id'] for c in chunks]
|
| 245 |
# Ensure selected_chunk_id is valid
|
| 246 |
+
if st.session_state.selected_chunk_id not in chunk_ids:
|
| 247 |
+
st.session_state.selected_chunk_id = chunk_ids[0] if chunk_ids else None
|
| 248 |
|
| 249 |
if st.session_state.selected_chunk_id is not None:
|
| 250 |
chunk_options = {c['id']: f"Chunk {c['id']}: {c['title']}" for c in chunks}
|
| 251 |
|
| 252 |
+
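# The selectbox is created without a key, so it is not bound to session state; its displayed option is derived from st.session_state.selected_chunk_id via index=.
|
| 253 |
+
# When the user selects a new option, Streamlit reruns the script and the selectbox returns the new id, which is copied into st.session_state.selected_chunk_id below.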
|
| 254 |
+
selected_id = st.selectbox(
|
| 255 |
"Select a chunk to edit",
|
| 256 |
+
options=chunk_ids,
|
| 257 |
format_func=lambda x: chunk_options.get(x, "Invalid Chunk"),
|
| 258 |
+
index=chunk_ids.index(st.session_state.selected_chunk_id)
|
|
|
|
| 259 |
)
|
| 260 |
|
| 261 |
+
# Update the session state ONLY if the user selection has changed
|
| 262 |
+
if selected_id != st.session_state.selected_chunk_id:
|
| 263 |
+
st.session_state.selected_chunk_id = selected_id
|
| 264 |
+
st.rerun()
|
| 265 |
+
|
| 266 |
selected_chunk = manager.get_chunk_by_id(st.session_state.selected_chunk_id)
|
| 267 |
|
| 268 |
if selected_chunk:
|
|
|
|
| 272 |
"Chunk Content",
|
| 273 |
value=selected_chunk['content'],
|
| 274 |
height=300,
|
| 275 |
+
key=f"editor_{selected_chunk['id']}" # Unique key forces widget to re-render on selection change
|
| 276 |
)
|
| 277 |
|
| 278 |
col1, col2, _ = st.columns([1, 1, 4])
|
|
|
|
| 282 |
st.rerun()
|
| 283 |
|
| 284 |
if col2.button("Delete Chunk", use_container_width=True, key=f"delete_{selected_chunk['id']}"):
|
| 285 |
+
manager.delete_chunk(selected_chunk['id'])
|
|
|
|
| 286 |
st.session_state.status_message = "Chunk deleted!"
|
|
|
|
| 287 |
remaining_chunks = manager.get_chunks()
|
| 288 |
st.session_state.selected_chunk_id = remaining_chunks[0]['id'] if remaining_chunks else None
|
| 289 |
st.rerun()
|