Spaces:
Sleeping
Sleeping
New regex matching for years and fixed UI issues
Browse files
- src/document_processor.py +14 -3
- streamlit_app.py +1 -27
src/document_processor.py
CHANGED
|
@@ -126,9 +126,20 @@ class DocumentProcessor:
|
|
| 126 |
metadata["authors"] = possible_authors
|
| 127 |
|
| 128 |
# crude heuristic: find year (e.g., 2023, 2024)
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
|
| 133 |
except Exception:
|
| 134 |
pass
|
|
|
|
| 126 |
metadata["authors"] = possible_authors
|
| 127 |
|
| 128 |
# crude heuristic: find year (e.g., 2023, 2024)
|
| 129 |
+
year_patterns = [
|
| 130 |
+
r"\b(19|20)\d{2}\b", # Basic year
|
| 131 |
+
r"©\s*(19|20)\d{2}", # Copyright year
|
| 132 |
+
r"\((19|20)\d{2}\)", # Year in parentheses
|
| 133 |
+
r"(19|20)\d{2}[,.)]", # Year followed by comma/period
|
| 134 |
+
]
|
| 135 |
+
|
| 136 |
+
for pattern in year_patterns:
|
| 137 |
+
year_match = re.search(pattern, first_page)
|
| 138 |
+
if year_match:
|
| 139 |
+
year_text = re.search(r"(19|20)\d{2}", year_match.group(0))
|
| 140 |
+
if year_text:
|
| 141 |
+
metadata["year"] = year_text.group(0)
|
| 142 |
+
break
|
| 143 |
|
| 144 |
except Exception:
|
| 145 |
pass
|
streamlit_app.py
CHANGED
|
@@ -39,7 +39,7 @@ with st.sidebar:
|
|
| 39 |
"What is paternalism in bioethics?",
|
| 40 |
"What are the ethical issues with genetic testing?",
|
| 41 |
"How should AI bias in healthcare be addressed?",
|
| 42 |
-
"
|
| 43 |
"What does it mean for women’s autonomy to be respected?"
|
| 44 |
]
|
| 45 |
|
|
@@ -119,32 +119,6 @@ if question and st.session_state.query_count < 30:
|
|
| 119 |
# Display the final answer
|
| 120 |
response_placeholder.markdown(answer)
|
| 121 |
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
"""
|
| 125 |
-
with debug_col:
|
| 126 |
-
st.markdown("### 🔍 Debug Info")
|
| 127 |
-
|
| 128 |
-
# Show search results info
|
| 129 |
-
if 'bot' in st.session_state:
|
| 130 |
-
# Get search results for debug display
|
| 131 |
-
search_results = st.session_state.bot.vector_store.search(question, k=3)
|
| 132 |
-
with st.expander("📊 Search Results", expanded=True):
|
| 133 |
-
for i, r in enumerate(search_results):
|
| 134 |
-
st.write(f"**Result {i + 1}** (Score: {r.get('similarity_score', 0):.3f})")
|
| 135 |
-
st.write(f"Source: {r['metadata'].get('filename', 'Unknown')}")
|
| 136 |
-
st.write(f"Preview: {r['content'][:200]}...")
|
| 137 |
-
st.write("---")
|
| 138 |
-
|
| 139 |
-
# Show response metadata
|
| 140 |
-
st.metric("Response Time", f"{response_time:.2f}s")
|
| 141 |
-
st.metric("Model", "GPT-4o-mini")
|
| 142 |
-
|
| 143 |
-
# Show conversation history count
|
| 144 |
-
if hasattr(st.session_state.bot, 'history'):
|
| 145 |
-
st.metric("Conversation Turn", len(st.session_state.bot.history))
|
| 146 |
-
"""
|
| 147 |
-
|
| 148 |
# Show source information
|
| 149 |
with st.expander("📚 About the Sources"):
|
| 150 |
st.markdown("""
|
|
|
|
| 39 |
"What is paternalism in bioethics?",
|
| 40 |
"What are the ethical issues with genetic testing?",
|
| 41 |
"How should AI bias in healthcare be addressed?",
|
| 42 |
+
"Is antinatalism rational?",
|
| 43 |
"What does it mean for women’s autonomy to be respected?"
|
| 44 |
]
|
| 45 |
|
|
|
|
| 119 |
# Display the final answer
|
| 120 |
response_placeholder.markdown(answer)
|
| 121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
# Show source information
|
| 123 |
with st.expander("📚 About the Sources"):
|
| 124 |
st.markdown("""
|