ciorant committed on
Commit
4f2c271
·
1 Parent(s): dac9d02

New regex matching for years and fixed UI issues

Browse files
Files changed (2) hide show
  1. src/document_processor.py +14 -3
  2. streamlit_app.py +1 -27
src/document_processor.py CHANGED
@@ -126,9 +126,20 @@ class DocumentProcessor:
126
  metadata["authors"] = possible_authors
127
 
128
  # crude heuristic: find year (e.g., 2023, 2024)
129
- year_match = re.search(r"\b(19|20)\d{2}\b", first_page)
130
- if year_match:
131
- metadata["year"] = year_match.group(0)
 
 
 
 
 
 
 
 
 
 
 
132
 
133
  except Exception:
134
  pass
 
126
  metadata["authors"] = possible_authors
127
 
128
  # crude heuristic: find year (e.g., 2023, 2024)
129
+ year_patterns = [
130
+ r"\b(19|20)\d{2}\b", # Basic year
131
+ r"©\s*(19|20)\d{2}", # Copyright year
132
+ r"\((19|20)\d{2}\)", # Year in parentheses
133
+ r"(19|20)\d{2}[,.)]", # Year followed by comma/period
134
+ ]
135
+
136
+ for pattern in year_patterns:
137
+ year_match = re.search(pattern, first_page)
138
+ if year_match:
139
+ year_text = re.search(r"(19|20)\d{2}", year_match.group(0))
140
+ if year_text:
141
+ metadata["year"] = year_text.group(0)
142
+ break
143
 
144
  except Exception:
145
  pass
streamlit_app.py CHANGED
@@ -39,7 +39,7 @@ with st.sidebar:
39
  "What is paternalism in bioethics?",
40
  "What are the ethical issues with genetic testing?",
41
  "How should AI bias in healthcare be addressed?",
42
- "What are arguments raised by antinatalists?",
43
  "What does it mean for women’s autonomy to be respected?"
44
  ]
45
 
@@ -119,32 +119,6 @@ if question and st.session_state.query_count < 30:
119
  # Display the final answer
120
  response_placeholder.markdown(answer)
121
 
122
-
123
-
124
- """
125
- with debug_col:
126
- st.markdown("### 🔍 Debug Info")
127
-
128
- # Show search results info
129
- if 'bot' in st.session_state:
130
- # Get search results for debug display
131
- search_results = st.session_state.bot.vector_store.search(question, k=3)
132
- with st.expander("📊 Search Results", expanded=True):
133
- for i, r in enumerate(search_results):
134
- st.write(f"**Result {i + 1}** (Score: {r.get('similarity_score', 0):.3f})")
135
- st.write(f"Source: {r['metadata'].get('filename', 'Unknown')}")
136
- st.write(f"Preview: {r['content'][:200]}...")
137
- st.write("---")
138
-
139
- # Show response metadata
140
- st.metric("Response Time", f"{response_time:.2f}s")
141
- st.metric("Model", "GPT-4o-mini")
142
-
143
- # Show conversation history count
144
- if hasattr(st.session_state.bot, 'history'):
145
- st.metric("Conversation Turn", len(st.session_state.bot.history))
146
- """
147
-
148
  # Show source information
149
  with st.expander("📚 About the Sources"):
150
  st.markdown("""
 
39
  "What is paternalism in bioethics?",
40
  "What are the ethical issues with genetic testing?",
41
  "How should AI bias in healthcare be addressed?",
42
+ "Is antinatalism rational?",
43
  "What does it mean for women’s autonomy to be respected?"
44
  ]
45
 
 
119
  # Display the final answer
120
  response_placeholder.markdown(answer)
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  # Show source information
123
  with st.expander("📚 About the Sources"):
124
  st.markdown("""