forzen committed on
Commit
634b5dc
·
verified ·
1 Parent(s): 86ce4e4

Upload 11 files

Browse files
Files changed (11) hide show
  1. .dockerignore +38 -0
  2. Dockerfile +27 -0
  3. app.py +346 -0
  4. chat_processor.py +92 -0
  5. config.py +24 -0
  6. db_manager.py +134 -0
  7. feedback_generator.py +120 -0
  8. llm_handler.py +135 -0
  9. prompts.py +129 -0
  10. rag_manager.py +132 -0
  11. requirements.txt +6 -0
.dockerignore ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Git
2
+ .git
3
+ .gitignore
4
+
5
+ # Python virtual environment
6
+ venv/
7
+ venv_feedback/
8
+ *.egg-info/
9
+ __pycache__/
10
+ *.pyc
11
+ *.pyo
12
+
13
+ # Docker specific
14
+ Dockerfile
15
+ .dockerignore
16
+
17
+ # IDE / OS specific
18
+ .vscode/
19
+ .idea/
20
+ .DS_Store
21
+ Thumbs.db
22
+
23
+ # Local configuration not for image (API key via HF Secrets)
24
+ .env
25
+
26
+ # Logs and other local artifacts
27
+ *.log
28
+ dist/
29
+ build/
30
+ *.local
31
+
32
+ # 如果你决定在git中提交空的数据库目录/文件以确保路径存在于持久化存储中,
33
+ # 那么不要在这里忽略它们。否则,如果应用可以自动创建,则可以忽略。
34
+ # 对于HF持久化存储,最好是让应用在运行时按需创建这些文件/目录在持久化卷上。
35
+ # 所以,通常不在这里忽略它们,但要确保初始提交时它们是空的或不存在,
36
+ # 避免将本地测试数据打入镜像或仓库。
37
+ # chroma_db/
38
+ # students.db
Dockerfile ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 1. 使用官方Python基础镜像
2
+ FROM python:3.9-slim-buster
3
+
4
+ # 2. 设置环境变量
5
+ ENV PYTHONDONTWRITEBYTECODE 1
6
+ ENV PYTHONUNBUFFERED 1
7
+
8
+ # 3. 设置工作目录
9
+ WORKDIR /app
10
+
11
+ # 4. (可选) 安装系统依赖 - 根据需要取消注释
12
+ # RUN apt-get update && apt-get install -y --no-install-recommends gcc && rm -rf /var/lib/apt/lists/*
13
+
14
+ # 5. 复制依赖文件
15
+ COPY requirements.txt .
16
+
17
+ # 6. 安装Python依赖
18
+ RUN pip install --no-cache-dir -r requirements.txt
19
+
20
+ # 7. 复制项目所有文件到工作目录 (确保.dockerignore配置正确)
21
+ COPY . .
22
+
23
+ # 8. 暴露Streamlit运行的端口 (HF Spaces会自动处理端口映射)
24
+ EXPOSE 8501
25
+
26
+ # 9. 定义容器启动时运行的命令
27
+ CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.headless=true"]
app.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import streamlit as st
3
+ import datetime
4
+ import os
5
+
6
+ # Import all necessary modules from your project
7
+ from config import GOOGLE_API_KEY, STUDENT_DB_PATH, CHROMA_DB_PATH, RAG_COLLECTION_NAME
8
+ from db_manager import init_student_db, get_all_student_names, get_student_characteristics, add_or_update_student
9
+ from rag_manager import add_documents_to_rag, query_rag # get_all_student_observations_from_rag is used by chat_processor
10
+ from chat_processor import extract_info_from_chat, update_student_characteristics_from_rag, batch_update_all_students_characteristics
11
+ from feedback_generator import (
12
+ generate_boss_feedback,
13
+ generate_public_feedback,
14
+ generate_parent_feedback,
15
+ get_events_summary_for_day
16
+ )
17
+ # import prompts # Prompts are used by other modules, not directly here typically
18
+
19
+ # --- Page Configuration and Initialization ---
20
# --- Page configuration (must be the first Streamlit call) ---
st.set_page_config(
    page_title="晚托反馈助手",
    layout="wide",
    initial_sidebar_state="expanded",
)

# --- API key check ---
# The key comes from Hugging Face Secrets in deployment, or from .env / the
# environment locally. The app keeps running without it, with reduced features.
if not GOOGLE_API_KEY:
    st.error("错误:GOOGLE_API_KEY 未配置。请在Hugging Face Space的Secrets中设置该值,或在本地的.env文件中配置。应用功能将受限。")

# --- Database initialisation ---
# Safe to call on every rerun: the table is created only if it is missing.
init_student_db()

# --- Session-state defaults ---
# Each entry maps a session key to a zero-argument factory, so a default is
# computed only when the key is actually missing (the student list hits the DB).
_session_defaults = {
    "processed_chat_extracts": list,  # list of {"student_name": ..., "observation": ...}
    "current_processing_date": datetime.date.today,
    "student_list_cache": get_all_student_names,
    "feedback_boss_text": str,
    "feedback_public_text": str,
    "feedback_parent_text": str,
    "selected_student_for_parent_fb": lambda: None,
}
for _state_key, _make_default in _session_defaults.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _make_default()
47
+
48
+
49
+ # --- Helper Functions for UI ---
50
def refresh_student_list_cache():
    """Reload the cached student-name list from the student database and show a toast."""
    st.session_state.student_list_cache = get_all_student_names()
    st.toast("学生列表已刷新。")
53
+
54
+ # --- Main Application UI ---
55
st.title("🚀 晚托反馈自动化助手")

# Sidebar: page navigation plus a small system-status panel.
with st.sidebar:
    st.header("导航")
    menu_options = ["处理聊天记录", "生成反馈报告", "学生特点管理"]
    choice = st.radio("选择功能:", menu_options, key="nav_menu")

    st.markdown("---")
    st.subheader("系统状态")
    if not GOOGLE_API_KEY:
        st.warning("Gemini API Key 未配置。")
    else:
        st.success("Gemini API Key 已加载。")

    # Cheap existence checks only; the DB / RAG managers do their own validation.
    # Paths are resolved inside the container / Space file system.
    student_db_exists = os.path.exists(STUDENT_DB_PATH)
    # Truthy only when the directory exists AND contains at least one entry.
    chroma_dir_exists = os.path.exists(CHROMA_DB_PATH) and os.listdir(CHROMA_DB_PATH)

    if student_db_exists:
        st.markdown(f"✔️ 学生库: `{STUDENT_DB_PATH}`")
    else:
        st.markdown(f"⚠️ 学生库未找到: `{STUDENT_DB_PATH}`")

    if chroma_dir_exists:
        st.markdown(f"✔️ RAG库: `{CHROMA_DB_PATH}` (集合: {RAG_COLLECTION_NAME})")
    else:
        st.markdown(f"⚠️ RAG库未找到: `{CHROMA_DB_PATH}`")

    if st.button("🔄 刷新学生列表", key="sidebar_refresh_students"):
        refresh_student_list_cache()
83
+
84
+
85
+ # --- Page 1: 处理聊天记录 ---
86
if choice == "处理聊天记录":
    # Page 1: paste a day's chat log, let the LLM extract per-student
    # observations, preview them, then persist them to the RAG store and
    # make sure every mentioned student exists in the SQLite database.

    # st.experimental_rerun() was deprecated in favour of st.rerun() and is
    # absent from recent Streamlit releases; use whichever one is available.
    rerun_app = getattr(st, "rerun", None) or st.experimental_rerun

    st.header("💬 聊天记录处理与数据构建")
    st.markdown("在此粘贴每日微信聊天记录,AI将提取关键信息并存入知识库。")

    # A rerun discards everything rendered before it, so the confirmation for a
    # successful store is carried over in session state and shown here instead.
    if "chat_store_notice" in st.session_state:
        st.success(st.session_state["chat_store_notice"])
        st.info("数据已存储。您可以前往“学生特点管理”页面更新这些学生的特点总结。")
        del st.session_state["chat_store_notice"]

    # Date the pasted chat log belongs to (persisted across reruns).
    selected_date_for_processing = st.date_input(
        "请选择聊天记录对应的日期",
        value=st.session_state.current_processing_date,
        key="chat_date_input",
    )
    if selected_date_for_processing != st.session_state.current_processing_date:
        st.session_state.current_processing_date = selected_date_for_processing
        # Extracts belong to the previously selected date; drop them.
        st.session_state.processed_chat_extracts = []
        rerun_app()

    chat_log_text = st.text_area(
        "在此粘贴聊天记录内容:",
        height=250,
        key="chat_log_input_area",
        help="输入聊天内容后,点击“分析聊天记录”。",
    )

    if st.button("🤖 使用AI分析聊天记录", type="primary", key="analyze_chat_button"):
        if not chat_log_text.strip():
            st.warning("请输入聊天记录内容。")
        elif not GOOGLE_API_KEY:
            st.error("API Key未配置,无法分析。")
        else:
            with st.spinner("AI正在分析聊天记录,提取信息中..."):
                st.session_state.processed_chat_extracts = extract_info_from_chat(chat_log_text)

            if st.session_state.processed_chat_extracts:
                st.success(f"AI成功提取到 {len(st.session_state.processed_chat_extracts)} 条信息!")
            else:
                # An empty result is not necessarily an error: the chat may
                # simply contain nothing about students.
                st.info("AI分析完成,但未能从聊天记录中提取到格式化信息。")

    if st.session_state.processed_chat_extracts:
        st.subheader("提取到的信息预览:")
        preview_container = st.container()
        with preview_container:
            for item in st.session_state.processed_chat_extracts:
                st.markdown(f"- **{item.get('student_name', 'N/A')}**: {item.get('observation', 'N/A')}")

        st.markdown("---")
        if st.button("➕ 确认并存入数据库和RAG知识库", key="store_extracted_data_button"):
            with st.spinner("正在存储数据到RAG和学生数据库..."):
                docs_to_rag = []
                metadatas_to_rag = []
                date_str = st.session_state.current_processing_date.strftime("%Y-%m-%d")

                for item in st.session_state.processed_chat_extracts:
                    s_name = item.get("student_name")
                    obs = item.get("observation")
                    if not s_name or not obs:
                        st.warning(f"跳过不完整的提取项: {item}")
                        continue

                    docs_to_rag.append(f"{s_name} 在 {date_str} 的表现: {obs}")
                    metadatas_to_rag.append(
                        {"student_name": str(s_name), "date": str(date_str), "source": "chat_log"}
                    )
                    add_or_update_student(s_name)  # make sure the student row exists

                storage_successful = False
                if docs_to_rag:
                    # Pass None (not an empty list) for the ids so the RAG
                    # manager generates them itself.
                    # NOTE(review): rag_manager is not visible here; this relies
                    # on the "generates IDs if None" contract stated in the
                    # original comments - confirm against rag_manager.py.
                    storage_successful = bool(
                        add_documents_to_rag(docs_to_rag, metadatas_to_rag, None)
                    )
                else:
                    st.info("没有有效的提取信息可供存储。")

            if storage_successful:
                refresh_student_list_cache()
                # Shown at the top of the page after the rerun below.
                st.session_state["chat_store_notice"] = (
                    f"成功将 {len(docs_to_rag)} 条信息存入RAG。学生列表已更新。"
                )
                st.session_state.processed_chat_extracts = []  # clear the preview
                rerun_app()
            elif docs_to_rag:
                # There was something to store but the RAG write failed.
                st.error("数据存入RAG失败。请检查日志。")
169
+
170
+
171
+ # --- Page 2: 生成反馈报告 ---
172
+ elif choice == "生成反馈报告":
173
+ st.header("📝 生成每日反馈报告")
174
+ st.markdown("根据已处理的信息或学生特点,选择不同模式生成反馈。")
175
+
176
+ feedback_target_date = st.date_input(
177
+ "请选择生成反馈对应的日期",
178
+ value=st.session_state.current_processing_date,
179
+ key="feedback_date_selector"
180
+ )
181
+ feedback_date_str = feedback_target_date.strftime("%Y-%m-%d")
182
+
183
+ # Determine summary for Boss/Public feedback
184
+ # Use extracts if date matches and extracts exist, otherwise query RAG
185
+ daily_summary_for_general_feedback = ""
186
+ processed_extracts_for_feedback_date = []
187
+
188
+ if feedback_target_date == st.session_state.current_processing_date and st.session_state.processed_chat_extracts:
189
+ processed_extracts_for_feedback_date = st.session_state.processed_chat_extracts
190
+ st.info(f"将使用为 {feedback_date_str} 刚处理的聊天记录生成反馈。")
191
+ temp_summary_parts = []
192
+ for item in processed_extracts_for_feedback_date:
193
+ temp_summary_parts.append(f"- {item.get('student_name', 'N/A')}: {item.get('observation', 'N/A')}")
194
+ if temp_summary_parts:
195
+ daily_summary_for_general_feedback = "\n".join(temp_summary_parts)
196
+ else:
197
+ daily_summary_for_general_feedback = get_events_summary_for_day(feedback_date_str) # Fallback
198
+ else:
199
+ with st.spinner(f"正在为日期 {feedback_date_str} 从知识库获取信息摘要..."):
200
+ daily_summary_for_general_feedback = get_events_summary_for_day(feedback_date_str)
201
+
202
+ st.markdown("---")
203
+ col1, col2 = st.columns(2)
204
+
205
+ with col1:
206
+ st.subheader("👔 给老板的反馈")
207
+ if st.button("生成老板反馈", key="generate_boss_fb"):
208
+ if not GOOGLE_API_KEY: st.error("API Key未配置。"); st.stop()
209
+ with st.spinner("正在生成老板反馈..."):
210
+ st.session_state.feedback_boss_text = generate_boss_feedback(daily_summary_for_general_feedback)
211
+ if st.session_state.feedback_boss_text: st.success("老板反馈生成成功!")
212
+ else: st.error("生成老板反馈失败或无内容返回。")
213
+ if st.session_state.feedback_boss_text:
214
+ st.text_area("老板反馈内容:", value=st.session_state.feedback_boss_text, height=200, key="boss_feedback_display")
215
+
216
+ with col2:
217
+ st.subheader("📢 公共反馈")
218
+ if st.button("生成公共反馈", key="generate_public_fb"):
219
+ if not GOOGLE_API_KEY: st.error("API Key未配置。"); st.stop()
220
+ with st.spinner("正在生成公共反馈..."):
221
+ st.session_state.feedback_public_text = generate_public_feedback(daily_summary_for_general_feedback)
222
+ if st.session_state.feedback_public_text: st.success("公共反馈生成成功!")
223
+ else: st.error("生成公共反馈失败或无内容返回。")
224
+ if st.session_state.feedback_public_text:
225
+ st.text_area("公共反馈内容:", value=st.session_state.feedback_public_text, height=200, key="public_feedback_display")
226
+
227
+ st.markdown("---")
228
+ st.subheader("👨‍👩‍👧‍👦 给家长的反馈")
229
+
230
+ if not st.session_state.student_list_cache:
231
+ st.warning("学生列表为空。请先通过“处理聊天记录”功能添加学生并处理数据。")
232
+ else:
233
+ st.session_state.selected_student_for_parent_fb = st.selectbox(
234
+ "选择学生:",
235
+ options=[""] + st.session_state.student_list_cache, # Add empty option for placeholder
236
+ index=0, # Default to empty
237
+ format_func=lambda x: "请选择..." if x == "" else x,
238
+ key="parent_feedback_student_selector"
239
+ )
240
+
241
+ feedback_modes_map = {
242
+ "正常模式 (基于当日记录)": "normal",
243
+ "偷懒模式 (组合历史事件)": "lazy",
244
+ "LLM特点生成 (创意发挥)": "llm_direct"
245
+ }
246
+ selected_mode_display_name = st.radio(
247
+ "选择反馈模式:",
248
+ options=list(feedback_modes_map.keys()),
249
+ key="parent_feedback_mode_selector"
250
+ )
251
+ mode_value = feedback_modes_map[selected_mode_display_name]
252
+
253
+ if st.button(f"为选定学生生成家长反馈", key="generate_parent_fb"):
254
+ if not GOOGLE_API_KEY: st.error("API Key未配置。"); st.stop()
255
+ if not st.session_state.selected_student_for_parent_fb:
256
+ st.warning("请先选择一个学生。")
257
+ else:
258
+ student_name = st.session_state.selected_student_for_parent_fb
259
+ with st.spinner(f"正在为 {student_name} ({selected_mode_display_name}) 生成家长反馈..."):
260
+ # Pass today's extracted data for the student if available (for "normal" mode)
261
+ student_specific_extracts_today = []
262
+ if feedback_target_date == st.session_state.current_processing_date and st.session_state.processed_chat_extracts:
263
+ student_specific_extracts_today = [
264
+ item for item in st.session_state.processed_chat_extracts if item.get("student_name") == student_name
265
+ ]
266
+
267
+ st.session_state.feedback_parent_text = generate_parent_feedback(
268
+ student_name,
269
+ mode_value,
270
+ feedback_date_str,
271
+ student_specific_extracts_today # Pass specific extracts for normal mode
272
+ )
273
+ if st.session_state.feedback_parent_text:
274
+ st.success(f"为 {student_name} 生成家长反馈成功!")
275
+ else:
276
+ st.error(f"为 {student_name} 生成家长反馈失败或无内容返回。")
277
+
278
+ if st.session_state.feedback_parent_text and st.session_state.selected_student_for_parent_fb:
279
+ st.text_area(
280
+ f"给 {st.session_state.selected_student_for_parent_fb} 家长的反馈:",
281
+ value=st.session_state.feedback_parent_text,
282
+ height=300,
283
+ key="parent_feedback_display"
284
+ )
285
+
286
+ # --- Page 3: 学生特点管理 ---
287
+ elif choice == "学生特点管理":
288
+ st.header("🧑‍🎓 学生特点数据库管理")
289
+ st.markdown("查看和更新AI总结的学生特点。特点会基于RAG中的历史记录生成。")
290
+
291
+ if st.button("🔄 强制刷新学生列表和显示", key="admin_refresh_students_btn"):
292
+ refresh_student_list_cache()
293
+ st.experimental_rerun()
294
+
295
+
296
+ if not st.session_state.student_list_cache:
297
+ st.info("当前没有学生数据。请先通过“处理聊天记录”功能添加并存储学生相关信息。")
298
+ else:
299
+ st.subheader("当前学生列表及特点:")
300
+
301
+ num_students = len(st.session_state.student_list_cache)
302
+ cols_per_row = 3 # Adjust number of columns for display
303
+
304
+ for i in range(0, num_students, cols_per_row):
305
+ cols = st.columns(cols_per_row)
306
+ for j in range(cols_per_row):
307
+ student_idx = i + j
308
+ if student_idx < num_students:
309
+ student_name = st.session_state.student_list_cache[student_idx]
310
+ with cols[j]:
311
+ with st.expander(f"{student_name}", expanded=False):
312
+ characteristics = get_student_characteristics(student_name)
313
+ st.markdown(f"**AI总结特点:**\n {characteristics if characteristics else '暂无总结。'}")
314
+ if st.button(f"更新 {student_name} 特点", key=f"update_char_{student_name}_{student_idx}"):
315
+ if not GOOGLE_API_KEY: st.error("API Key未配置。"); st.stop()
316
+ with st.spinner(f"正在为 {student_name} 更新特点..."):
317
+ update_student_characteristics_from_rag(student_name)
318
+ st.success(f"{student_name} 的特点已更新!请重新展开查看。")
319
+ st.experimental_rerun() # Rerun to reflect changes
320
+
321
+ st.markdown("---")
322
+ st.subheader("批量操作")
323
+ if st.button("✨ 批量更新所有学生的特点总结", key="batch_update_all_chars_btn"):
324
+ if not GOOGLE_API_KEY: st.error("API Key未配置。"); st.stop()
325
+ if not st.session_state.student_list_cache:
326
+ st.warning("没有学生可供批量更新。")
327
+ else:
328
+ # Confirmation dialog for safety
329
+ # Using a more explicit confirmation
330
+ placeholder = st.empty()
331
+ with placeholder.container():
332
+ st.warning(f"此操作将为数据库中所有 {len(st.session_state.student_list_cache)} 位学生重新生成特点总结,可能需要较长时间并消耗API额度。")
333
+ if st.button("我确认执行批量更新", key="confirm_batch_update"):
334
+ placeholder.empty() # Remove confirmation message
335
+ with st.spinner("正在批量更新所有学生特点,请耐心等待..."):
336
+ batch_update_all_students_characteristics() # This function has internal st.progress
337
+ st.success("所有学生特点总结批量更新完毕!")
338
+ st.experimental_rerun()
339
+ elif st.button("取消批量更新", key="cancel_batch_update"):
340
+ placeholder.empty()
341
+ st.info("批量更新已取消。")
342
+
343
+
344
+ # --- Footer ---
345
+ st.markdown("---")
346
+ st.markdown("晚托反馈助手 v1.0.0 (HF Dockerized) | 技术支持: Gemini LLM + RAG")
chat_processor.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # chat_processor.py
2
+ import datetime
3
+ from llm_handler import get_gemini_response
4
+ from rag_manager import add_documents_to_rag, get_all_student_observations_from_rag
5
+ from db_manager import add_or_update_student, get_all_student_names
6
+ import prompts
7
+ import re
8
+ import streamlit as st
9
+
10
def extract_info_from_chat(chat_log_text: str) -> list:
    """Extract per-student observations from a raw chat log using the LLM.

    The LLM is expected to answer with one ``name: observation`` pair per line.
    Both the ASCII colon and the full-width colon (``:``) are accepted as the
    separator, and leading list markers / Markdown bold markers around the
    name are stripped, because models answering in Chinese frequently emit them.

    Args:
        chat_log_text: The pasted chat log.

    Returns:
        A list of ``{"student_name": str, "observation": str}`` dicts; empty
        when the input is blank, the LLM returns nothing, or no line parses.
    """
    if not chat_log_text.strip():
        return []

    prompt = prompts.CHAT_EXTRACTION_USER_PROMPT_TEMPLATE.format(chat_log_text=chat_log_text)
    system_instruction = prompts.CHAT_EXTRACTION_SYSTEM_PROMPT

    response_text = get_gemini_response(prompt, system_instruction=system_instruction)
    if not response_text:
        st.warning("AI未能从聊天记录中提取到文本响应。")
        return []

    # name <colon> observation, where the colon may be ASCII or full-width.
    line_pattern = re.compile(r"([^::]+?)\s*[::]\s*(.+)")

    extracted_items = []
    for raw_line in response_text.strip().split('\n'):
        # Drop surrounding whitespace and any leading bullet ("- ", "* ", "• ").
        line = raw_line.strip().lstrip("-*•").strip()
        if not line:
            continue
        match = line_pattern.match(line)
        if not match:
            print(f"Could not parse line from LLM: '{line}'")  # debug log
            continue
        # Strip Markdown bold markers such as "**name**".
        student_name = match.group(1).strip().strip("*").strip()
        observation = match.group(2).strip().lstrip("*").strip()
        if student_name and observation:
            extracted_items.append({"student_name": student_name, "observation": observation})
        else:
            print(f"Skipping partially extracted line: '{line}'")  # debug log

    if not extracted_items:
        st.info("AI分析完成,但未能按预期格式解析出学生信息。可能是聊天内容不包含相关信息,或AI响应格式不符。")
    return extracted_items
43
+
44
def update_student_characteristics_from_rag(student_name: str):
    """Summarise a student's RAG history with the LLM and store the summary.

    Fetches every observation recorded for the student, asks the LLM for a
    characteristics summary (using at most the last 50 observations) and
    writes it to the student database. Progress and failures are reported
    through Streamlit messages; nothing is returned.
    """
    history = get_all_student_observations_from_rag(student_name)
    if not history:
        st.info(f"在RAG中未找到学生 {student_name} 的历史表现记录,无法更新特点。")
        # Still make sure the student row exists (this also bumps its timestamp).
        add_or_update_student(student_name)
        return

    # Cap the number of observations so the prompt stays reasonably short.
    MAX_OBSERVATIONS_FOR_SUMMARY = 50
    selected = history
    if len(history) > MAX_OBSERVATIONS_FOR_SUMMARY:
        st.info(f"学生 {student_name} 有超过 {MAX_OBSERVATIONS_FOR_SUMMARY} 条记录,将使用最新的 {MAX_OBSERVATIONS_FOR_SUMMARY} 条进行特点总结。")
        selected = history[-MAX_OBSERVATIONS_FOR_SUMMARY:]

    # One bullet per observation.
    bullet_text = "\n".join(f"- {entry}" for entry in selected)

    summary = get_gemini_response(
        prompts.STUDENT_CHARACTERISTICS_USER_PROMPT_TEMPLATE.format(
            student_name=student_name,
            observations_text=bullet_text,
        ),
        system_instruction=prompts.STUDENT_CHARACTERISTICS_SYSTEM_PROMPT,
    )

    if not summary:
        st.warning(f"未能为学生 {student_name} 生成特点总结。AI未返回有效内容。")
        return

    saved = add_or_update_student(student_name, characteristics_summary=summary.strip())
    if saved:
        st.success(f"已更新学生 {student_name} 的特点总结。")
    else:
        st.error(f"更新学生 {student_name} 的特点总结到数据库时失败。")
78
+
79
def batch_update_all_students_characteristics():
    """Regenerate the characteristics summary of every student in the database.

    Shows a progress bar and a per-student status line while it works.
    """
    all_names = get_all_student_names()
    if not all_names:
        st.info("学生数据库为空,无法批量更新特点。")
        return

    total = len(all_names)
    st.info(f"开始批量更新 {total} 位学生的特点总结...")
    progress_bar = st.progress(0)
    for done, name in enumerate(all_names, start=1):
        st.write(f"正在处理: {name}...")  # visible feedback during the long run
        update_student_characteristics_from_rag(name)
        progress_bar.progress(done / total)
    st.success("所有学生特点总结批量更新完毕!")
config.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # config.py
2
+ import os
3
+ from dotenv import load_dotenv
4
+
5
+ # Load environment variables from .env file if it exists (for local development)
6
+ # In Hugging Face Spaces, GOOGLE_API_KEY will be set via Secrets.
7
+ load_dotenv()
8
+
9
+ # API Keys and Model Configuration
10
+ GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
11
+ EMBEDDING_MODEL = "models/embedding-001"
12
+ GENERATIVE_MODEL = "gemini-1.5-flash-latest" # Or "gemini-pro" or other preferred model
13
+
14
+ # Database Paths
15
+ # These paths are relative to the WORKDIR defined in Dockerfile (i.e., /app)
16
+ # Hugging Face Spaces persistent storage will store data created at these paths.
17
+ CHROMA_DB_PATH = "./chroma_db" # Will be /app/chroma_db inside the container
18
+ STUDENT_DB_PATH = "./students.db" # Will be /app/students.db inside the container
19
+ RAG_COLLECTION_NAME = "chat_records_v2" # Changed name to avoid conflicts if old data exists
20
+
21
+ # Ensure API key is available (especially for local runs, HF handles missing secrets with errors)
22
+ # if not GOOGLE_API_KEY:
23
+ # print("Warning: GOOGLE_API_KEY not found. Please set it in your environment or .env file.")
24
+ # For HF deployment, if secret is not set, the app might fail at runtime when API is called.
db_manager.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # db_manager.py
2
+ import sqlite3
3
+ from config import STUDENT_DB_PATH
4
+ import streamlit as st
5
+ import os
6
+ import time
7
+
8
+ MAX_RETRIES = 3
9
+ RETRY_DELAY = 2 # seconds
10
+
11
def get_db_connection():
    """Open a connection to the student SQLite database.

    Creates the parent directory of ``STUDENT_DB_PATH`` when it is missing and
    retries up to ``MAX_RETRIES`` times (``RETRY_DELAY`` seconds apart) when
    the database reports that it is locked.

    Returns:
        An open ``sqlite3.Connection``, or ``None`` when the directory cannot
        be created or the connection cannot be established.
    """
    parent_dir = os.path.dirname(STUDENT_DB_PATH)
    if parent_dir and not os.path.exists(parent_dir):
        try:
            os.makedirs(parent_dir, exist_ok=True)
            print(f"Created directory for SQLite DB: {parent_dir}")
        except Exception as exc:
            st.error(f"无法创建SQLite数据库目录 {parent_dir}: {exc}")
            print(f"Could not create SQLite DB directory {parent_dir}: {exc}")
            return None  # nothing to connect to without the directory

    for attempt_no in range(1, MAX_RETRIES + 1):
        try:
            return sqlite3.connect(STUDENT_DB_PATH, timeout=10)
        except sqlite3.OperationalError as exc:
            if "database is locked" not in str(exc):
                st.error(f"连接SQLite数据库时出错: {exc}")
                print(f"Error connecting to SQLite DB: {exc}")
                return None
            print(f"SQLite DB is locked (Attempt {attempt_no}/{MAX_RETRIES}). Retrying in {RETRY_DELAY}s...")
            if attempt_no == MAX_RETRIES:
                # Out of attempts: give up and tell the user.
                st.error("SQLite数据库持续锁定,请稍后再试。")
                print("SQLite DB remains locked after multiple retries.")
                return None
            time.sleep(RETRY_DELAY)
        except Exception as exc:  # anything else is not retried
            st.error(f"连接SQLite数据库时发生未知错误: {exc}")
            print(f"Unknown error connecting to SQLite DB: {exc}")
            return None
    return None
46
+
47
+
48
def init_student_db():
    """Create the ``students`` table if it does not exist yet.

    Does nothing when no connection can be obtained. Errors are reported via
    Streamlit and stdout instead of being raised.
    """
    connection = get_db_connection()
    if connection is None:
        return

    create_table_sql = '''
        CREATE TABLE IF NOT EXISTS students (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT UNIQUE NOT NULL,
            characteristics_summary TEXT,
            last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    '''
    try:
        connection.cursor().execute(create_table_sql)
        connection.commit()
    except Exception as exc:
        st.error(f"初始化学生数据库表时出错: {exc}")
        print(f"Error initializing student DB table: {exc}")
    finally:
        connection.close()
69
+
70
def add_or_update_student(name: str, characteristics_summary: str = None):
    """Insert a student if missing and refresh the row.

    Args:
        name: Student name (unique key of the ``students`` table).
        characteristics_summary: When not ``None``, stored as the student's
            summary; otherwise only ``last_updated`` is refreshed.

    Returns:
        ``True`` when the change was committed, ``False`` when no connection
        was available or the statements failed.
    """
    connection = get_db_connection()
    if connection is None:
        return False

    # Upsert in two steps: insert-if-absent first, then update the row.
    if characteristics_summary is None:
        update_sql = """
            UPDATE students
            SET last_updated = CURRENT_TIMESTAMP
            WHERE name = ? AND EXISTS (SELECT 1 FROM students WHERE name = ?)
        """
        update_args = (name, name)
    else:
        update_sql = """
            UPDATE students
            SET characteristics_summary = ?, last_updated = CURRENT_TIMESTAMP
            WHERE name = ?
        """
        update_args = (characteristics_summary, name)

    try:
        cur = connection.cursor()
        cur.execute("INSERT OR IGNORE INTO students (name) VALUES (?)", (name,))
        cur.execute(update_sql, update_args)
        connection.commit()
        return True
    except Exception as exc:
        st.error(f"添加或更新学生 '{name}' 时出错: {exc}")
        print(f"Error adding/updating student '{name}': {exc}")
        return False
    finally:
        connection.close()
101
+
102
def get_student_characteristics(name: str):
    """Return the stored characteristics summary of a student.

    Returns ``None`` when the student is unknown, no connection is available,
    or the query fails.
    """
    connection = get_db_connection()
    if connection is None:
        return None
    try:
        row = connection.cursor().execute(
            "SELECT characteristics_summary FROM students WHERE name = ?",
            (name,),
        ).fetchone()
        if not row:
            return None
        return row[0]
    except Exception as exc:
        st.error(f"获取学生 '{name}' 特点时出错: {exc}")
        print(f"Error getting characteristics for student '{name}': {exc}")
        return None
    finally:
        connection.close()
118
+
119
def get_all_student_names():
    """Return every student name in ascending order.

    Returns an empty list when no connection is available or the query fails.
    """
    connection = get_db_connection()
    if connection is None:
        return []
    try:
        rows = connection.cursor().execute(
            "SELECT name FROM students ORDER BY name ASC"
        ).fetchall()
        names = []
        for row in rows:
            names.append(row[0])
        return names
    except Exception as exc:
        st.error(f"获取所有学生姓名时出错: {exc}")
        print(f"Error getting all student names: {exc}")
        return []
    finally:
        connection.close()
feedback_generator.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # feedback_generator.py
2
+ from llm_handler import get_gemini_response
3
+ from rag_manager import query_rag
4
+ from db_manager import get_student_characteristics
5
+ import prompts
6
+ import datetime
7
+ import streamlit as st
8
+
9
def get_events_summary_for_day(date_str: str, processed_chat_data: list = None) -> str:
    """Build a bullet-list summary of one day's events.

    Freshly processed chat extracts are used when given; otherwise the RAG
    store is queried for entries whose ``date`` metadata equals ``date_str``.

    Args:
        date_str: The day to summarise, as stored in the RAG metadata.
        processed_chat_data: Optional list of dicts with ``student_name`` and
            ``observation`` keys.

    Returns:
        The summary text, or a "nothing recorded" sentence when RAG returns
        no results.
    """
    if processed_chat_data:
        bullet_lines = [
            "- {}: {}".format(
                entry.get("student_name", "未知学生"),
                entry.get("observation", "无具体描述"),
            )
            for entry in processed_chat_data
        ]
        if bullet_lines:
            return "\n".join(bullet_lines)
        # Nothing usable in the supplied data: report it and fall back to RAG.
        st.info(f"当日处理的聊天数据为空或格式不正确 ({date_str})。")

    # Fallback: ask the RAG store for the day's entries.
    st.info(f"尝试从RAG中检索日期 {date_str} 的整体活动信息...")
    hits = query_rag(
        query_text=f"{date_str} 发生的关键事件和整体情况",
        n_results=10,
        filter_metadata={"date": date_str},
    )
    if hits:
        return f"关于日期 {date_str} 的记录:\n" + "\n".join(f"- {hit}" for hit in hits)
    return f"关于日期 {date_str}:今日无特别记录或未能从RAG中检索到信息。"
39
+
40
+
41
def generate_boss_feedback(today_events_summary: str):
    """Generate the daily report for the boss from the day's summary.

    Returns a fixed "not enough information" sentence, without calling the
    LLM, when the summary is empty or contains the "无特别记录" marker.
    """
    has_material = bool(today_events_summary) and "无特别记录" not in today_events_summary
    if not has_material:
        return "今日无足够信息生成老板反馈。"
    return get_gemini_response(
        prompts.BOSS_FEEDBACK_USER_PROMPT_TEMPLATE.format(today_events_summary=today_events_summary),
        system_instruction=prompts.BOSS_FEEDBACK_SYSTEM_PROMPT,
    )
46
+
47
def generate_public_feedback(today_events_summary: str):
    """Generate the public (group-wide) daily feedback from the day's summary.

    Returns a fixed "not enough information" sentence, without calling the
    LLM, when the summary is empty or contains the "无特别记录" marker.
    """
    has_material = bool(today_events_summary) and "无特别记录" not in today_events_summary
    if not has_material:
        return "今日无足够信息生成公共反馈。"
    return get_gemini_response(
        prompts.PUBLIC_FEEDBACK_USER_PROMPT_TEMPLATE.format(today_events_summary=today_events_summary),
        system_instruction=prompts.PUBLIC_FEEDBACK_SYSTEM_PROMPT,
    )
52
+
53
def generate_parent_feedback(student_name: str, mode: str, date_str: str, processed_student_data_today: list = None):
    """Generate feedback for one student's parents.

    Args:
        student_name: The student the feedback is about.
        mode: ``"normal"`` (based on the given day's records), ``"lazy"``
            (recombines past events) or ``"llm_direct"`` (written from the
            stored characteristics only).
        date_str: Day used for the RAG date filter in ``"normal"`` mode.
        processed_student_data_today: Optional freshly extracted items (dicts
            with ``student_name`` / ``observation``); preferred over RAG in
            ``"normal"`` mode.

    Returns:
        The LLM response (``None`` if the LLM call fails), or an explanatory
        message when the mode is invalid or ``"llm_direct"`` lacks data.
    """
    # Placeholder texts. They are compared against below, so each is defined
    # exactly once; the original compared against a copy ending in an ASCII "."
    # instead of "。", which made the RAG fallback in normal mode unreachable.
    no_characteristics = "暂无该生详细特点记录。"
    no_events_today = "今天没有关于该生的特别记录。"
    no_history = "暂无该生足够的多样化历史表现记录用于此模式。"

    characteristics = get_student_characteristics(student_name) or no_characteristics

    if mode == "normal":
        today_student_specific_events = no_events_today
        if processed_student_data_today:
            # Prefer what was extracted today; tolerate incomplete items.
            student_obs = [
                item.get("observation")
                for item in processed_student_data_today
                if item.get("student_name") == student_name and item.get("observation")
            ]
            if student_obs:
                today_student_specific_events = "\n".join([f"- {obs}" for obs in student_obs])

        if today_student_specific_events == no_events_today:
            # Nothing in today's extracts: look the day up in the RAG store.
            rag_student_events = query_rag(
                query_text=f"{student_name} 在 {date_str} 的具体表现",
                n_results=5,
                filter_metadata={"student_name": student_name, "date": date_str},
            )
            if rag_student_events:
                today_student_specific_events = "\n".join([f"- {r}" for r in rag_student_events])

        user_prompt = prompts.PARENT_NORMAL_USER_PROMPT_TEMPLATE.format(
            student_name=student_name,
            student_characteristics=characteristics,
            today_student_specific_events=today_student_specific_events,
        )
        system_instruction = prompts.PARENT_NORMAL_SYSTEM_PROMPT

    elif mode == "lazy":
        past_events_list = query_rag(
            query_text=f"{student_name} 过往的各种积极表现和活动片段",
            n_results=10,  # more results for variety
            filter_metadata={"student_name": student_name},  # no date filter
        ) or []
        # Prefer the more detailed entries, but whitespace-token counting is
        # unreliable for Chinese text (few spaces), so when the filter would
        # discard everything keep all retrieved entries instead of none.
        detailed_events = [r for r in past_events_list if len(r.split()) > 5]
        usable_events = detailed_events or list(past_events_list)
        past_events_for_student = (
            "\n".join([f"- {r}" for r in usable_events]) if usable_events else no_history
        )

        if not usable_events and characteristics != no_characteristics:
            st.info("偷懒模式:历史具体事件不足,将尝试结合学生特点进行创意生成。")
            # No history at all: fall back to the characteristics-only prompt.
            user_prompt = prompts.PARENT_LLM_DIRECT_USER_PROMPT_TEMPLATE.format(
                student_name=student_name,
                student_characteristics=characteristics,
            )
            system_instruction = prompts.PARENT_LLM_DIRECT_SYSTEM_PROMPT
        else:
            user_prompt = prompts.PARENT_LAZY_USER_PROMPT_TEMPLATE.format(
                student_name=student_name,
                student_characteristics=characteristics,  # extra context for the LLM
                past_events_for_student=past_events_for_student,
            )
            system_instruction = prompts.PARENT_LAZY_SYSTEM_PROMPT

    elif mode == "llm_direct":
        if characteristics == no_characteristics:
            return f"无法使用LLM直接生成模式,学生 {student_name} 的特点数据不足。请先更新其特点。"
        user_prompt = prompts.PARENT_LLM_DIRECT_USER_PROMPT_TEMPLATE.format(
            student_name=student_name,
            student_characteristics=characteristics,
        )
        system_instruction = prompts.PARENT_LLM_DIRECT_SYSTEM_PROMPT

    else:
        st.error("无效的家长反馈模式。")
        return "无效的反馈模式。"

    return get_gemini_response(user_prompt, system_instruction=system_instruction)
llm_handler.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # llm_handler.py
2
+ import google.generativeai as genai
3
+ from config import GOOGLE_API_KEY, GENERATIVE_MODEL, EMBEDDING_MODEL
4
+ import streamlit as st # For displaying errors or warnings if needed
5
+
6
# One-time Gemini client configuration, executed when this module is imported.
if not GOOGLE_API_KEY:
    # Only log here; per the original note, app.py is expected to surface the
    # missing key in the Streamlit UI.
    print("Warning: GOOGLE_API_KEY is not set. LLM features will not work.")
else:
    try:
        genai.configure(api_key=GOOGLE_API_KEY)
    except Exception as config_error:
        failure_text = f"Failed to configure Gemini API: {config_error}"
        st.error(failure_text)  # Visible in the Streamlit page if the app is running
        print(failure_text)  # Also emitted to the console for server logs
16
+
17
+
18
def get_gemini_response(prompt_text, system_instruction=None):
    """Send a prompt to the configured Gemini model and return the reply text.

    Args:
        prompt_text: Prompt passed to ``generate_content``.
        system_instruction: Optional system instruction for the model; any
            falsy value is forwarded as ``None``.

    Returns:
        The response text, or ``None`` when no API key is configured or the
        request fails. Failures are reported through ``st.error`` and printed.
    """
    if not GOOGLE_API_KEY:
        st.error("Gemini API Key未配置,无法获取模型响应。请在Hugging Face Space Secrets中设置 GOOGLE_API_KEY。")
        return None

    try:
        gemini_model = genai.GenerativeModel(
            GENERATIVE_MODEL,
            system_instruction=system_instruction or None,
        )
        reply = gemini_model.generate_content(prompt_text)
        return reply.text
    except Exception as exc:
        failure_text = f"与Gemini通信时出错: {exc}"
        # Swap in a more actionable message when the API reports a rejected key.
        key_rejected = hasattr(exc, 'message') and "API key not valid" in exc.message
        if key_rejected:
            failure_text = "Gemini API Key无效或权限不足。请检查Hugging Face Space Secrets中的GOOGLE_API_KEY。"
        st.error(failure_text)
        print(failure_text)  # For server logs
        return None
37
+
38
+ # Using genai.embed_content directly is often simpler for ChromaDB
39
+ # but if you need a callable for ChromaDB's embedding_functions parameter:
40
class GeminiEmbeddingFunctionForChroma:
    """Callable that embeds a batch of documents with the Gemini embedding API.

    Meant to be usable as a ChromaDB ``embedding_function``: it is called with
    a list of document strings and returns one embedding per document.

    NOTE(review): the previous declaration subclassed
    ``genai.embedding.EmbeddingFunction`` and annotated ``__call__`` with
    ``genai.embedding.EmbedContentRequest`` / ``EmbedContentResponse``. Those
    names do not appear to be exported by google-generativeai, in which case
    merely importing this module raised ``AttributeError``. The class is now a
    plain callable with built-in annotations -- confirm no caller relied on
    the old base class.
    """

    def __call__(self, input: list[str]) -> list:
        """Embed ``input`` and return the embeddings in the same order.

        Args:
            input: Documents to embed. A bare string is treated as a single
                document; non-string items are converted with ``str()``.
                (The name shadows the built-in because ChromaDB's embedding
                function interface names this parameter ``input``.)

        Returns:
            A list with one embedding per document, ``[]`` for empty input,
            or a list of ``None`` placeholders (one per document) when the
            embedding request fails.
        """
        if not input:
            # Always return a list; the old code returned a dict on this path.
            return []

        if isinstance(input, str):
            docs_to_embed = [input]
        else:
            # Strings pass through untouched; only stray non-string items are
            # coerced. The old condition was inverted and did the opposite.
            docs_to_embed = [doc if isinstance(doc, str) else str(doc) for doc in input]

        try:
            # Embed the whole batch in one request.
            result = genai.embed_content(
                model=EMBEDDING_MODEL,
                content=docs_to_embed,
                task_type="RETRIEVAL_DOCUMENT",
            )
            return result['embedding']
        except Exception as e:
            error_message = f"获取文本嵌入时出错: {e}"
            st.error(error_message)
            print(error_message)
            # Keep the output aligned with the input length on failure.
            return [None] * len(docs_to_embed)
74
+
75
+ # --- Alternative simpler embedding function for ChromaDB ---
76
+ # This is often easier to integrate if ChromaDB's embedding_function
77
+ # parameter expects a function that takes a list of texts.
78
+ from chromadb import Documents, EmbeddingFunction, Embeddings
79
+
80
class GeminiChromaEF(EmbeddingFunction):
    """ChromaDB embedding function backed by the Gemini embedding API.

    Whenever a real embedding cannot be produced (missing API key, invalid
    input item, failed request) a zero vector of ``embedding_dim`` floats is
    returned in its place, so the output always has one entry per input.
    """

    def __init__(
        self,
        model_name: str = EMBEDDING_MODEL,
        task_type: str = "RETRIEVAL_DOCUMENT",
        embedding_dim: int = 768,
    ):
        """Store the embedding configuration.

        Args:
            model_name: Embedding model passed to ``genai.embed_content``.
            task_type: Task type passed to ``genai.embed_content``.
            embedding_dim: Length of the zero-vector placeholder. Defaults to
                768, the value the original code hard-coded (its comment
                attributes this to "models/embedding-001"); adjust it if the
                configured model produces a different dimension.
        """
        self._model_name = model_name
        self._task_type = task_type
        self._embedding_dim = embedding_dim
        if not GOOGLE_API_KEY:
            print("Warning: GOOGLE_API_KEY not set. Embedding function might fail.")

    def _placeholder(self) -> list[float]:
        """Return a fresh zero vector used when no real embedding exists."""
        return [0.0] * self._embedding_dim

    def __call__(self, input_texts: Documents) -> Embeddings:
        """Embed ``input_texts`` and return one vector per input, in order.

        Non-string items and any text for which the API returned no vector
        receive the zero-vector placeholder.
        """
        if not GOOGLE_API_KEY:
            st.error("Gemini API Key未配置,无法生成文本嵌入。")
            print("Gemini API Key not configured for embeddings.")
            return [self._placeholder() for _ in input_texts]

        if not input_texts:
            return []

        try:
            # Only strings are sent to the API; other items get placeholders.
            valid_texts = [text for text in input_texts if isinstance(text, str)]
            if not valid_texts:
                return [self._placeholder() for _ in input_texts]

            result = genai.embed_content(
                model=self._model_name,
                content=valid_texts,
                task_type=self._task_type,
            )

            # Embeddings are assumed to come back in the order of
            # ``valid_texts``; walk them in step with the string inputs.
            vectors = iter(result['embedding'])
            final_embeddings = []
            for text in input_texts:
                vector = next(vectors, None) if isinstance(text, str) else None
                final_embeddings.append(
                    vector if vector is not None else self._placeholder()
                )
            return final_embeddings

        except Exception as e:
            error_message = f"获取文本嵌入时出错 (GeminiChromaEF): {e}"
            st.error(error_message)
            print(error_message)
            # Fall back to placeholders for every input on a general failure.
            return [self._placeholder() for _ in input_texts]
prompts.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# prompts.py
# Prompt texts (system prompts and user-prompt templates) sent to the LLM.
# Templates are filled with str.format(); the {placeholders} named in the
# comments below must be supplied by the caller.

# System prompt for chat-log extraction: asks the model to pull out, per
# student mentioned, the student's name plus concrete events and observations.
CHAT_EXTRACTION_SYSTEM_PROMPT = """
你是一个晚托班聊天记录分析助手。你的任务是从提供的聊天记录中,为每个提到的学生提取关键信息。
信息应包括:学生姓名,以及关于该学生的具体事件、学术表现、行为、情绪、社交互动或任何值得注意的观察。
如果一个学生有多条相关信息,请都列出来。
专注于事实和具体描述。
"""

# User prompt for chat-log extraction. Placeholder: {chat_log_text}.
# Requests one "student name: [event]" entry per output line.
CHAT_EXTRACTION_USER_PROMPT_TEMPLATE = """
请分析以下今天的聊天记录,提取每个学生相关的具体事件、表现或评价。
输出格式为:
学生姓名: [事件/表现/评价]
学生姓名: [另一个事件/表现/评价]
...

聊天记录内容如下:
---
{chat_log_text}
---
请严格按照上述格式输出,每条信息占一行。只输出提取结果。
"""
27
+
28
# System prompt for summarising a student's traits (personality, study habits,
# social style, strengths / points needing attention) from daily observations.
STUDENT_CHARACTERISTICS_SYSTEM_PROMPT = """
你是一个资深的儿童教育心理分析师。你的任务是根据提供的一系列关于某个学生的日常表现记录,总结该学生的主要性格特点、学习习惯、社交风格和潜在优势或需要关注的方面。
总结应全面、客观、简洁,并使用积极的语言。
"""

# User prompt for the trait summary.
# Placeholders: {student_name}, {observations_text}.
STUDENT_CHARACTERISTICS_USER_PROMPT_TEMPLATE = """
学生姓名: {student_name}
历史表现记录如下:
---
{observations_text}
---
请基于以上记录,为 {student_name} 总结其主要特点。
"""
41
+
42
+
43
# --- Feedback Generation Prompts ---
# BOSS FEEDBACK
# System prompt for the daily report to the owner: overall summary, notable
# students (positive or negative), issues to follow up, optional suggestions.
BOSS_FEEDBACK_SYSTEM_PROMPT = """
你是一位经验丰富的晚托机构主管助理。你的任务是根据今天收集到的学生表现信息,撰写一份给老板的每日工作反馈。
反馈应简洁明了,突出重点:
1. 今日整体情况概述。
2. 表现特别突出(正面或负面)的学生及其简要事迹。
3. 任何需要老板知晓或跟进的特殊事件或问题。
4. 可以提出简要的工作建议(可选)。
语言需专业、客观。
"""
# User prompt for the owner report. Placeholder: {today_events_summary}.
BOSS_FEEDBACK_USER_PROMPT_TEMPLATE = """
今日学生表现信息汇总:
---
{today_events_summary}
---
请根据以上信息,生成一份给老板的晚托工作反馈。
"""

# PUBLIC FEEDBACK
# System prompt for a public, upbeat activity post; it instructs the model to
# avoid naming individual students except for collective praise.
PUBLIC_FEEDBACK_SYSTEM_PROMPT = """
你是一位活泼且富有创意的晚托机构宣传专员。你的任务是根据今天收集到的学生表现素材,撰写一份公开的、积极正面的晚托活动反馈。
这份反馈将会发布在机构的社交媒体或公告栏。
主要目标是:
1. 展示孩子们在晚托的快乐学习时光和丰富多彩的活动。
2. 传递积极向上的教育理念和氛围。
3. 除非是集体性的表扬,否则避免提及具体学生姓名,可以使用“有的小朋友”、“大家”等代称。
风格应活泼、温馨、吸引人。
"""
# User prompt for the public post. Placeholder: {today_events_summary}.
PUBLIC_FEEDBACK_USER_PROMPT_TEMPLATE = """
今日学生表现素材:
---
{today_events_summary}
---
请根据以上素材,生成一份公开的晚托活动反馈。
"""
79
+
80
# PARENT FEEDBACK (NORMAL MODE)
# System prompt for a per-student message to parents: greeting, study
# progress, behaviour and mood, social interaction, encouragement, and
# gentle suggestions where needed.
PARENT_NORMAL_SYSTEM_PROMPT = """
你是一位经验丰富、充满爱心且专业的晚托班老师。你的任务是给学生家长写一份关于孩子今天在晚托班表现的反馈。
反馈应包含:
1. 问候家长。
2. 具体描述孩子今天的学习情况(如作业完成度、遇到的困难、取得的进步)。
3. 描述孩子的行为表现和情绪状态。
4. 描述孩子的社交互动情况。
5. 基于观察给予积极的肯定和鼓励。
6. 如有必要,可以给出温和的建议或需要家长配合的事项。
语言需亲切、真诚、具体、正面引导。
"""
# User prompt for normal mode, built from today's recorded events.
# Placeholders: {student_name}, {student_characteristics},
# {today_student_specific_events}.
PARENT_NORMAL_USER_PROMPT_TEMPLATE = """
学生姓名: {student_name}
该生一般特点: {student_characteristics}

今天关于 {student_name} 的具体表现记录:
---
{today_student_specific_events}
---
请根据以上信息,为 {student_name} 的家长写一份今日反馈。
"""

# PARENT FEEDBACK (LAZY MODE)
PARENT_LAZY_SYSTEM_PROMPT = PARENT_NORMAL_SYSTEM_PROMPT  # can be reused
# User prompt for lazy mode.
# Placeholders: {student_name}, {student_characteristics},
# {past_events_for_student}.
# NOTE(review): this prompt asks the model to recombine PAST records into
# feedback that "sounds like it happened today"; the output is therefore not
# based on today's observations.
PARENT_LAZY_USER_PROMPT_TEMPLATE = """
学生姓名: {student_name}
该生一般特点: {student_characteristics}

以下是 {student_name} 过去的一些表现记录,请从中挑选几件【不同】的事情,巧妙地组合并略作修改,形成一份【听起来像是今天发生】的反馈给家长。
确保反馈内容积极正面,并且事件之间有一定区隔,不要都揉在一起说。

历史表现记录(供挑选组合):
---
{past_events_for_student}
---
请根据以上要求,为 {student_name} 的家长写一份反馈。
"""

# PARENT FEEDBACK (LLM DIRECT MODE - Based on characteristics)
PARENT_LLM_DIRECT_SYSTEM_PROMPT = PARENT_NORMAL_SYSTEM_PROMPT  # can be reused
# User prompt for LLM-direct mode.
# Placeholders: {student_name}, {student_characteristics}.
# NOTE(review): this prompt asks the model to imagine a plausible performance
# from the student's known traits; the output is not based on recorded events.
PARENT_LLM_DIRECT_USER_PROMPT_TEMPLATE = """
学生姓名: {student_name}
该生一般特点: {student_characteristics}

今天晚托班的常规活动包括:作业辅导、阅读、主题活动(例如手工、科学小实验或小组游戏)、自由活动。
请你基于 {student_name} 的已知特点,并结合今天的常规活动,【设想并生成】一份他/她今天可能的表现,并据此给家长写一份反馈。
例如,如果学生特点是“专注数学”,可以设想他今天在数学作业上表现出色。如果特点是“乐于助人”,可以设想他帮助了同学。
反馈需要听起来自然、具体,就像真实观察到的一样。
"""
rag_manager.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # rag_manager.py
2
import os
import time

import chromadb
import streamlit as st

from config import CHROMA_DB_PATH, RAG_COLLECTION_NAME
from llm_handler import GeminiChromaEF  # Use the robust embedding function
7
+
8
# Build the embedding function once at import time so every RAG call shares it.
gemini_ef = None
try:
    gemini_ef = GeminiChromaEF()
except Exception as e:
    st.error(f"无法初始化Gemini Embedding Function: {e}. RAG功能将受限。")
    print(f"Error initializing GeminiChromaEF: {e}")


# Open the persistent ChromaDB store and the RAG collection.
# Initialisation is retried because, per the original note, transient errors
# such as "database is locked" are common in shared environments.
db_client = None
collection = None
MAX_RETRIES = 3
RETRY_DELAY = 5  # seconds

for attempt in range(MAX_RETRIES):
    try:
        # Requires `os` to be imported at module top. It previously was not,
        # so this line raised NameError, which the broad `except` below
        # swallowed on every attempt and left `collection` as None.
        if not os.path.isdir(CHROMA_DB_PATH):
            os.makedirs(CHROMA_DB_PATH, exist_ok=True)
            print(f"Created ChromaDB directory: {CHROMA_DB_PATH}")

        db_client = chromadb.PersistentClient(path=CHROMA_DB_PATH)

        if gemini_ef:
            collection = db_client.get_or_create_collection(
                name=RAG_COLLECTION_NAME,
                embedding_function=gemini_ef,
            )
            print(f"成功连接到RAG集合 '{RAG_COLLECTION_NAME}' 并使用Gemini embeddings.")
        else:
            # Fallback when the embedding function failed to initialise; the
            # collection exists but is of limited use without Gemini embeddings.
            collection = db_client.get_or_create_collection(name=RAG_COLLECTION_NAME)
            st.warning("RAG集合已创建,但Gemini Embedding Function未成功初始化。语义搜索可能无法正常工作。")
            print(f"RAG collection '{RAG_COLLECTION_NAME}' created without a proper embedding function due to prior errors.")
        break  # Success
    except Exception as e:  # Broad on purpose: storage errors vary by backend
        st.error(f"初始化ChromaDB客户端失败 (尝试 {attempt + 1}/{MAX_RETRIES}): {e}")
        print(f"Error initializing ChromaDB client (Attempt {attempt + 1}/{MAX_RETRIES}): {e}")
        if attempt < MAX_RETRIES - 1:
            time.sleep(RETRY_DELAY)
        else:
            st.error("已达到最大重试次数,ChromaDB可能无法使用。请检查日志。")
            print("Max retries reached for ChromaDB client initialization.")
            # `collection` stays None; the functions below must handle that.
54
+
55
def add_documents_to_rag(documents: list[str], metadatas: list[dict] = None, ids: list[str] = None):
    """Add documents to the RAG collection.

    Args:
        documents: Texts to store.
        metadatas: Optional per-document metadata dicts. When omitted, no
            metadata is sent at all.
        ids: Optional per-document IDs. When omitted or empty, IDs are derived
            from each document's MD5 digest plus its position in the batch.

    Returns:
        True when the documents were added or there was nothing to add;
        False when the collection/embedding function is unavailable or the
        add call fails.
    """
    if collection is None or gemini_ef is None:
        st.error("RAG集合或Embedding Function未初始化,无法添加文档。")
        print("RAG collection or EF not initialized in add_documents_to_rag.")
        return False
    if not documents:
        st.info("没有文档需要添加到RAG。")
        return True  # Not an error, just nothing to do

    if not ids:
        from hashlib import md5
        ids = [f"doc_{md5(doc.encode()).hexdigest()}_{i}" for i, doc in enumerate(documents)]

    # Previously a missing `metadatas` became `[{}] * n`: one shared dict
    # repeated n times, and empty dicts that ChromaDB's metadata validation
    # is understood to reject. Leaving it as None avoids both problems.
    lengths = [len(documents), len(ids)]
    if metadatas is not None:
        lengths.append(len(metadatas))

    # Truncate on ANY mismatch; the old check only fired when a list was
    # shorter than `documents`, so over-long ids/metadatas slipped through.
    min_len = min(lengths)
    if any(length != min_len for length in lengths):
        st.warning(f"文档、元数据或ID列表长度不一致。将使用最短长度: {min_len}")
        documents = documents[:min_len]
        ids = ids[:min_len]
        if metadatas is not None:
            metadatas = metadatas[:min_len]
        if min_len == 0:
            st.info("调整后没有文档可添加。")
            return True

    try:
        collection.add(
            documents=documents,
            metadatas=metadatas,
            ids=ids,
        )
        st.success(f"成功添加 {len(documents)} 个文档到RAG集合 '{RAG_COLLECTION_NAME}'.")
        return True
    except Exception as e:
        st.error(f"添加文档到RAG时出错: {e}")
        print(f"Error adding documents to RAG: {e}")
        return False
95
+
96
def query_rag(query_text: str, n_results: int = 5, filter_metadata: dict = None):
    """Query the RAG collection and return the matching document texts.

    Args:
        query_text: Text to search for; an empty value returns ``[]``.
        n_results: Number of results requested from the collection.
        filter_metadata: Optional metadata filter passed as ``where``; a
            falsy value is forwarded as ``None``.

    Returns:
        The documents returned for the single query, or ``[]`` when RAG is
        unavailable, nothing is found, or the query fails.
    """
    rag_ready = collection is not None and gemini_ef is not None
    if not rag_ready:
        st.error("RAG集合或Embedding Function未初始化,无法查询。")
        print("RAG collection or EF not initialized in query_rag.")
        return []

    if not query_text:
        return []

    try:
        hits = collection.query(
            query_texts=[query_text],
            n_results=n_results,
            where=filter_metadata or None,
        )
        if hits and hits['documents']:
            # One query text was sent, so take the first (only) result list.
            return hits['documents'][0]
        return []
    except Exception as query_error:
        st.error(f"查询RAG时出错: {query_error}")
        print(f"Error querying RAG: {query_error}")
        return []
117
+
118
def get_all_student_observations_from_rag(student_name: str):
    """Return every stored document whose metadata matches ``student_name``.

    Args:
        student_name: Value matched against the ``student_name`` metadata key.

    Returns:
        The list of document texts, or ``[]`` when the collection is
        unavailable, nothing matches, or the lookup fails.
    """
    if collection is None:
        st.error("RAG集合未初始化,无法获取学生观察记录。")
        return []

    try:
        # Filter inside the get call and fetch only the document texts.
        student_filter = {"student_name": student_name}
        entries = collection.get(where=student_filter, include=["documents"])
        if entries and entries['documents']:
            return entries['documents']
        return []
    except Exception as lookup_error:
        st.error(f"从RAG获取学生 {student_name} 的所有观察记录时出错: {lookup_error}")
        print(f"Error getting all observations for {student_name} from RAG: {lookup_error}")
        return []
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ google-generativeai
3
+ python-dotenv # 仍然有用,config.py会尝试加载,即使在HF上主要是为了本地运行或读取非敏感配置
4
+ chromadb>=0.4.22 # 确保版本兼容性,特别是对于PersistentClient和EmbeddingFunctions
5
+ # sentence-transformers # 如果你决定使用它作为 embedding function
6
+ # 其他你项目中可能用到的库