bichnhan2701 commited on
Commit
de7d237
·
1 Parent(s): 061d0f9

Add summary and mindmap logic

Browse files
app/api/transcribe.py CHANGED
@@ -11,12 +11,21 @@ from app.core.audio_utils import save_upload_file, get_audio_info, ensure_wav_16
11
  from app.core.asr_engine import load_model, transcribe_file, transcribe_file_chunks
12
  from app.config import settings
13
  from app.services.text_normalizer import normalize_text
 
 
 
14
  from app.services.note_client import NoteServiceClient
15
  from rq import Queue
16
  from app.infra.redis_client import redis_client
17
  from app.jobs.transcribe_job import transcribe_job
18
  from app.schemas.transcribe import TranscribeResponse
19
- from app.infra.metrics import REQUEST_COUNT, REQUEST_LATENCY, ASR_DURATION, NORMALIZE_DURATION, ERROR_COUNT
 
 
 
 
 
 
20
 
21
  router = APIRouter()
22
 
@@ -32,10 +41,17 @@ async def _startup():
32
 
33
  def _ensure_file_limits(path: str):
34
  if os.path.getsize(path) > settings.MAX_UPLOAD_BYTES:
35
- raise HTTPException(status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE, detail="File size exceeds limit")
 
 
 
36
  info = get_audio_info(path)
37
  if info and info.get("duration", 0) > settings.MAX_DURATION_SECS:
38
- raise HTTPException(status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE, detail="Audio duration exceeds limit")
 
 
 
 
39
 
40
  @router.post("/transcribe", response_model=TranscribeResponse)
41
  async def transcribe(file: UploadFile = File(...)):
@@ -43,9 +59,11 @@ async def transcribe(file: UploadFile = File(...)):
43
  tmp_wav = None
44
  note_service = NoteServiceClient()
45
  note_id = str(uuid.uuid4())
 
46
  start_time = time.perf_counter()
47
  endpoint = "/transcribe"
48
  status_label = "success"
 
49
  with REQUEST_LATENCY.labels(endpoint).time():
50
  try:
51
  # write upload to tmp (blocking) -> run in thread
@@ -61,6 +79,7 @@ async def transcribe(file: UploadFile = File(...)):
61
  info = get_audio_info(tmp_wav) or {}
62
  duration_sec = info.get("duration", 0)
63
  ASYNC_THRESHOLD = 120 # 2 phút, có thể chỉnh
 
64
  if duration_sec > ASYNC_THRESHOLD:
65
  # Enqueue background job bằng RQ
66
  q = Queue("asr", connection=redis_client)
@@ -78,7 +97,7 @@ async def transcribe(file: UploadFile = File(...)):
78
  "status": "queued",
79
  "duration": duration_sec
80
  })
81
-
82
  # Nếu audio ngắn, xử lý sync như cũ
83
  model = ASR_MODEL or await asyncio.to_thread(load_model, 30)
84
  with ASR_DURATION.labels(endpoint).time():
@@ -87,7 +106,13 @@ async def transcribe(file: UploadFile = File(...)):
87
 
88
  # normalize via Gemini (already async safe in your service)
89
  with NORMALIZE_DURATION.labels(endpoint).time():
90
- normalized_text = await normalize_text(text)
 
 
 
 
 
 
91
 
92
  info2 = get_audio_info(tmp_wav) or {}
93
  # persist to Note Service (async HTTP)
@@ -95,30 +120,39 @@ async def transcribe(file: UploadFile = File(...)):
95
  note_id=note_id,
96
  raw_text=text,
97
  normalized_text=normalized_text,
 
 
 
98
  duration=info2.get("duration"),
99
  sample_rate=info2.get("samplerate"),
100
  chunks=chunks,
101
  asr_model="PhoWhisper-base",
102
- normalization_model="gemini-1.5"
103
  )
104
 
105
  duration = time.perf_counter() - start_time
106
  logging.info(f"/transcribe success note_id={note_id} duration={duration:.2f}s audio_dur={info2.get('duration')}")
107
  REQUEST_COUNT.labels(endpoint, status_label).inc()
108
- return JSONResponse(status_code=200, content={
109
- "note_id": note_id,
110
- "status": "transcribed",
111
- "duration": info2.get("duration")
112
- })
 
 
 
 
113
  except HTTPException:
114
  status_label = "http_error"
115
  ERROR_COUNT.labels(endpoint, status_label).inc()
116
  raise
 
117
  except Exception as e:
118
  status_label = "error"
119
  ERROR_COUNT.labels(endpoint, status_label).inc()
120
  logging.exception(f"/transcribe failed note_id={note_id}")
121
  raise HTTPException(status_code=500, detail=f"Transcription failed: {e}")
 
122
  finally:
123
  # cleanup
124
  for p in [tmp_in, tmp_wav]:
@@ -128,11 +162,11 @@ async def transcribe(file: UploadFile = File(...)):
128
  except Exception:
129
  pass
130
 
131
-
132
  @router.post("/transcribe-url", response_model=TranscribeResponse)
133
  async def transcribe_url(payload: dict):
134
  audio_url = payload.get("audio_url")
135
  user_id = payload.get("user_id")
 
136
  if not audio_url:
137
  raise HTTPException(status_code=400, detail="audio_url required")
138
  if not user_id:
@@ -140,52 +174,193 @@ async def transcribe_url(payload: dict):
140
 
141
  tmp_in = make_temp_path(suffix=Path(audio_url).suffix or ".tmp")
142
  tmp_wav = None
143
- note_service = NoteServiceClient()
144
  note_id = str(uuid.uuid4())
 
145
 
 
146
  start_time = time.perf_counter()
147
- try:
148
- # download blocking -> thread
149
- await asyncio.to_thread(download_file_from_url, audio_url, tmp_in)
150
-
151
- _ensure_file_limits(tmp_in)
152
-
153
- tmp_wav = make_temp_path(suffix=".wav")
154
- await asyncio.to_thread(ensure_wav_16k_mono, tmp_in, tmp_wav)
155
-
156
- model = ASR_MODEL or await asyncio.to_thread(load_model, 30)
157
- text = await asyncio.to_thread(transcribe_file, model, tmp_wav, 30.0, 5.0)
158
- chunks = await asyncio.to_thread(transcribe_file_chunks, model, tmp_wav, 30.0, 5.0)
159
- normalized_text = await normalize_text(text)
160
- info2 = get_audio_info(tmp_wav) or {}
161
-
162
- await note_service.save_transcript(
163
- note_id=note_id,
164
- raw_text=text,
165
- normalized_text=normalized_text,
166
- duration=info2.get("duration"),
167
- sample_rate=info2.get("samplerate"),
168
- chunks=chunks,
169
- asr_model="PhoWhisper-base",
170
- normalization_model="gemini-1.5"
171
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
- duration = time.perf_counter() - start_time
174
- logging.info(f"/transcribe-url success note_id={note_id} duration={duration:.2f}s audio_dur={info2.get('duration')}")
175
- return JSONResponse(status_code=200, content={
176
- "note_id": note_id,
177
- "status": "transcribed",
178
- "duration": info2.get("duration")
179
- })
180
- except HTTPException:
181
- raise
182
- except Exception as e:
183
- logging.exception(f"/transcribe-url failed note_id={note_id}")
184
- raise HTTPException(status_code=500, detail=f"Transcription failed: {e}")
185
- finally:
186
- for p in [tmp_in, tmp_wav]:
187
- try:
188
- if p and os.path.exists(p):
189
- os.remove(p)
190
- except Exception:
191
- pass
 
11
  from app.core.asr_engine import load_model, transcribe_file, transcribe_file_chunks
12
  from app.config import settings
13
  from app.services.text_normalizer import normalize_text
14
+ from app.services.nlp_postprocess import normalize_and_extract
15
+ from app.services.summary_service import generate_summary
16
+ from app.services.mindmap_service import generate_mindmap
17
  from app.services.note_client import NoteServiceClient
18
  from rq import Queue
19
  from app.infra.redis_client import redis_client
20
  from app.jobs.transcribe_job import transcribe_job
21
  from app.schemas.transcribe import TranscribeResponse
22
+ from app.infra.metrics import (
23
+ REQUEST_COUNT,
24
+ REQUEST_LATENCY,
25
+ ASR_DURATION,
26
+ NORMALIZE_DURATION,
27
+ ERROR_COUNT,
28
+ )
29
 
30
  router = APIRouter()
31
 
 
41
 
42
  def _ensure_file_limits(path: str):
43
  if os.path.getsize(path) > settings.MAX_UPLOAD_BYTES:
44
+ raise HTTPException(
45
+ status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
46
+ detail="File size exceeds limit",
47
+ )
48
  info = get_audio_info(path)
49
  if info and info.get("duration", 0) > settings.MAX_DURATION_SECS:
50
+ raise HTTPException(
51
+ status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
52
+ detail="Audio duration exceeds limit",
53
+ )
54
+
55
 
56
  @router.post("/transcribe", response_model=TranscribeResponse)
57
  async def transcribe(file: UploadFile = File(...)):
 
59
  tmp_wav = None
60
  note_service = NoteServiceClient()
61
  note_id = str(uuid.uuid4())
62
+
63
  start_time = time.perf_counter()
64
  endpoint = "/transcribe"
65
  status_label = "success"
66
+
67
  with REQUEST_LATENCY.labels(endpoint).time():
68
  try:
69
  # write upload to tmp (blocking) -> run in thread
 
79
  info = get_audio_info(tmp_wav) or {}
80
  duration_sec = info.get("duration", 0)
81
  ASYNC_THRESHOLD = 120 # 2 phút, có thể chỉnh
82
+ # ---------- ASYNC JOB ----------
83
  if duration_sec > ASYNC_THRESHOLD:
84
  # Enqueue background job bằng RQ
85
  q = Queue("asr", connection=redis_client)
 
97
  "status": "queued",
98
  "duration": duration_sec
99
  })
100
+ # ---------- SYNC PIPELINE ----------
101
  # Nếu audio ngắn, xử lý sync như cũ
102
  model = ASR_MODEL or await asyncio.to_thread(load_model, 30)
103
  with ASR_DURATION.labels(endpoint).time():
 
106
 
107
  # normalize via Gemini (already async safe in your service)
108
  with NORMALIZE_DURATION.labels(endpoint).time():
109
+ # normalized_text = await normalize_text(text)
110
+ nlp = await normalize_and_extract(text)
111
+ normalized_text = nlp["normalized_text"]
112
+ keywords = nlp["keywords"]
113
+
114
+ summary = await generate_summary(normalized_text)
115
+ mindmap = await generate_mindmap(normalized_text)
116
 
117
  info2 = get_audio_info(tmp_wav) or {}
118
  # persist to Note Service (async HTTP)
 
120
  note_id=note_id,
121
  raw_text=text,
122
  normalized_text=normalized_text,
123
+ keywords=keywords,
124
+ summary=summary,
125
+ mindmap=mindmap,
126
  duration=info2.get("duration"),
127
  sample_rate=info2.get("samplerate"),
128
  chunks=chunks,
129
  asr_model="PhoWhisper-base",
130
+ normalization_model="gemini-1.5",
131
  )
132
 
133
  duration = time.perf_counter() - start_time
134
  logging.info(f"/transcribe success note_id={note_id} duration={duration:.2f}s audio_dur={info2.get('duration')}")
135
  REQUEST_COUNT.labels(endpoint, status_label).inc()
136
+ return JSONResponse(
137
+ status_code=200,
138
+ content={
139
+ "note_id": note_id,
140
+ "status": "transcribed",
141
+ "duration": info2.get("duration"),
142
+ },
143
+ )
144
+
145
  except HTTPException:
146
  status_label = "http_error"
147
  ERROR_COUNT.labels(endpoint, status_label).inc()
148
  raise
149
+
150
  except Exception as e:
151
  status_label = "error"
152
  ERROR_COUNT.labels(endpoint, status_label).inc()
153
  logging.exception(f"/transcribe failed note_id={note_id}")
154
  raise HTTPException(status_code=500, detail=f"Transcription failed: {e}")
155
+
156
  finally:
157
  # cleanup
158
  for p in [tmp_in, tmp_wav]:
 
162
  except Exception:
163
  pass
164
 
 
165
  @router.post("/transcribe-url", response_model=TranscribeResponse)
166
  async def transcribe_url(payload: dict):
167
  audio_url = payload.get("audio_url")
168
  user_id = payload.get("user_id")
169
+
170
  if not audio_url:
171
  raise HTTPException(status_code=400, detail="audio_url required")
172
  if not user_id:
 
174
 
175
  tmp_in = make_temp_path(suffix=Path(audio_url).suffix or ".tmp")
176
  tmp_wav = None
 
177
  note_id = str(uuid.uuid4())
178
+ note_service = NoteServiceClient()
179
 
180
+ endpoint = "/transcribe-url"
181
  start_time = time.perf_counter()
182
+ status_label = "success"
183
+
184
+ with REQUEST_LATENCY.labels(endpoint).time():
185
+ try:
186
+ # 1. Download from Cloudinary (blocking)
187
+ await asyncio.to_thread(download_file_from_url, audio_url, tmp_in)
188
+
189
+ # 2. File & duration limits
190
+ _ensure_file_limits(tmp_in)
191
+
192
+ # 3. Convert to wav 16k mono
193
+ tmp_wav = make_temp_path(suffix=".wav")
194
+ await asyncio.to_thread(ensure_wav_16k_mono, tmp_in, tmp_wav)
195
+
196
+ # 4. Check duration for sync / async
197
+ info = get_audio_info(tmp_wav) or {}
198
+ duration_sec = info.get("duration", 0)
199
+ ASYNC_THRESHOLD = 120 # seconds
200
+
201
+ # ---------- ASYNC JOB ----------
202
+ if duration_sec > ASYNC_THRESHOLD:
203
+ q = Queue("asr", connection=redis_client)
204
+ job = q.enqueue(
205
+ transcribe_job,
206
+ tmp_wav,
207
+ note_id,
208
+ job_timeout=1800,
209
+ )
210
+
211
+ logging.info(
212
+ f"/transcribe-url queued note_id={note_id} "
213
+ f"job_id={job.id} duration={duration_sec:.1f}s"
214
+ )
215
+ REQUEST_COUNT.labels(endpoint, "queued").inc()
216
+
217
+ return JSONResponse(
218
+ status_code=202,
219
+ content={
220
+ "note_id": note_id,
221
+ "job_id": job.id,
222
+ "status": "queued",
223
+ "duration": duration_sec,
224
+ },
225
+ )
226
+
227
+ # ---------- SYNC PIPELINE ----------
228
+ model = ASR_MODEL or await asyncio.to_thread(load_model, 30)
229
+
230
+ with ASR_DURATION.labels(endpoint).time():
231
+ text = await asyncio.to_thread(
232
+ transcribe_file, model, tmp_wav, 30.0, 5.0
233
+ )
234
+ chunks = await asyncio.to_thread(
235
+ transcribe_file_chunks, model, tmp_wav, 30.0, 5.0
236
+ )
237
+
238
+ with NORMALIZE_DURATION.labels(endpoint).time():
239
+ nlp = await normalize_and_extract(text)
240
+ normalized_text = nlp["normalized_text"]
241
+ keywords = nlp["keywords"]
242
+
243
+ summary = await generate_summary(normalized_text)
244
+ mindmap = await generate_mindmap(normalized_text)
245
+
246
+ # 5. Persist to Note Service
247
+ await note_service.save_transcript(
248
+ note_id=note_id,
249
+ raw_text=text,
250
+ normalized_text=normalized_text,
251
+ keywords=keywords,
252
+ summary=summary,
253
+ mindmap=mindmap,
254
+ duration=info.get("duration"),
255
+ sample_rate=info.get("samplerate"),
256
+ chunks=chunks,
257
+ asr_model="PhoWhisper-base",
258
+ normalization_model="gemini-1.5",
259
+ )
260
+
261
+ duration = time.perf_counter() - start_time
262
+ logging.info(
263
+ f"/transcribe-url success note_id={note_id} "
264
+ f"duration={duration:.2f}s audio_dur={info.get('duration')}"
265
+ )
266
+
267
+ REQUEST_COUNT.labels(endpoint, status_label).inc()
268
+ return JSONResponse(
269
+ status_code=200,
270
+ content={
271
+ "note_id": note_id,
272
+ "status": "transcribed",
273
+ "duration": info.get("duration"),
274
+ },
275
+ )
276
+
277
+ except HTTPException:
278
+ status_label = "http_error"
279
+ ERROR_COUNT.labels(endpoint, status_label).inc()
280
+ raise
281
+
282
+ except Exception as e:
283
+ status_label = "error"
284
+ ERROR_COUNT.labels(endpoint, status_label).inc()
285
+ logging.exception(f"/transcribe-url failed note_id={note_id}")
286
+ raise HTTPException(status_code=500, detail=str(e))
287
+
288
+ finally:
289
+ for p in [tmp_in, tmp_wav]:
290
+ try:
291
+ if p and os.path.exists(p):
292
+ os.remove(p)
293
+ except Exception:
294
+ pass
295
+
296
+ # @router.post("/transcribe-url", response_model=TranscribeResponse)
297
+ # async def transcribe_url(payload: dict):
298
+ # audio_url = payload.get("audio_url")
299
+ # user_id = payload.get("user_id")
300
+ # if not audio_url:
301
+ # raise HTTPException(status_code=400, detail="audio_url required")
302
+ # if not user_id:
303
+ # raise HTTPException(status_code=400, detail="user_id required")
304
+
305
+ # tmp_in = make_temp_path(suffix=Path(audio_url).suffix or ".tmp")
306
+ # tmp_wav = None
307
+ # note_service = NoteServiceClient()
308
+ # note_id = str(uuid.uuid4())
309
+
310
+ # start_time = time.perf_counter()
311
+ # try:
312
+ # # download blocking -> thread
313
+ # await asyncio.to_thread(download_file_from_url, audio_url, tmp_in)
314
+
315
+ # _ensure_file_limits(tmp_in)
316
+
317
+ # tmp_wav = make_temp_path(suffix=".wav")
318
+ # await asyncio.to_thread(ensure_wav_16k_mono, tmp_in, tmp_wav)
319
+
320
+ # model = ASR_MODEL or await asyncio.to_thread(load_model, 30)
321
+ # text = await asyncio.to_thread(transcribe_file, model, tmp_wav, 30.0, 5.0)
322
+ # chunks = await asyncio.to_thread(transcribe_file_chunks, model, tmp_wav, 30.0, 5.0)
323
+
324
+ # # NLP pipeline: normalize, extract keywords, then summary and mindmap
325
+ # nlp = await normalize_and_extract(text)
326
+ # normalized_text = nlp.get("normalized_text", text)
327
+ # keywords = nlp.get("keywords", [])
328
+
329
+ # summary = await generate_summary(normalized_text)
330
+ # mindmap = await generate_mindmap(normalized_text)
331
+
332
+ # info2 = get_audio_info(tmp_wav) or {}
333
+
334
+ # await note_service.save_transcript(
335
+ # note_id=note_id,
336
+ # raw_text=text,
337
+ # normalized_text=normalized_text,
338
+ # keywords=keywords,
339
+ # summary=summary,
340
+ # mindmap=mindmap,
341
+ # duration=info2.get("duration"),
342
+ # sample_rate=info2.get("samplerate"),
343
+ # chunks=chunks,
344
+ # asr_model="PhoWhisper-base",
345
+ # normalization_model="gemini-1.5"
346
+ # )
347
 
348
+ # duration = time.perf_counter() - start_time
349
+ # logging.info(f"/transcribe-url success note_id={note_id} duration={duration:.2f}s audio_dur={info2.get('duration')}")
350
+ # return JSONResponse(status_code=200, content={
351
+ # "note_id": note_id,
352
+ # "status": "transcribed",
353
+ # "duration": info2.get("duration")
354
+ # })
355
+ # except HTTPException:
356
+ # raise
357
+ # except Exception as e:
358
+ # logging.exception(f"/transcribe-url failed note_id={note_id}")
359
+ # raise HTTPException(status_code=500, detail=f"Transcription failed: {e}")
360
+ # finally:
361
+ # for p in [tmp_in, tmp_wav]:
362
+ # try:
363
+ # if p and os.path.exists(p):
364
+ # os.remove(p)
365
+ # except Exception:
366
+ # pass
app/jobs/transcribe_job.py CHANGED
@@ -1,27 +1,37 @@
1
  from app.core.asr_engine import load_model, transcribe_file
2
- from app.services.text_normalizer import normalize_text
3
  from app.services.note_client import NoteServiceClient
 
 
 
4
 
5
  # This function will be run by RQ worker
6
  def transcribe_job(tmp_wav: str, note_id: str):
7
  model = load_model()
8
- text = transcribe_file(model, tmp_wav, 30.0, 5.0)
 
 
 
 
 
 
 
 
 
9
  # normalize_text có thể là async, nhưng RQ chỉ chạy sync nên cần chạy event loop nếu cần
10
  import asyncio
11
- if asyncio.iscoroutinefunction(normalize_text):
12
- normalized = asyncio.run(normalize_text(text))
13
- else:
14
- normalized = normalize_text(text)
15
- note_service = NoteServiceClient()
16
- # Gửi transcript sang Note Service
17
- note_service.save_transcript(
18
- note_id=note_id,
19
- raw_text=text,
20
- normalized_text=normalized,
21
- duration=None,
22
- sample_rate=None,
23
- chunks=None,
24
- asr_model="PhoWhisper-base",
25
- normalization_model="gemini-1.5"
26
  )
27
  return True
 
1
  from app.core.asr_engine import load_model, transcribe_file
 
2
  from app.services.note_client import NoteServiceClient
3
+ from app.services.nlp_postprocess import normalize_and_extract
4
+ from app.services.summary_service import generate_summary
5
+ from app.services.mindmap_service import generate_mindmap
6
 
7
  # This function will be run by RQ worker
8
  def transcribe_job(tmp_wav: str, note_id: str):
9
  model = load_model()
10
+ raw_text = transcribe_file(model, tmp_wav, 30.0, 5.0)
11
+ nlp = asyncio.run(normalize_and_extract(raw_text))
12
+ normalized = nlp["normalized_text"]
13
+ keywords = nlp["keywords"]
14
+
15
+ summary = asyncio.run(generate_summary(normalized))
16
+ mindmap = asyncio.run(generate_mindmap(normalized))
17
+
18
+ note_service = NoteServiceClient()
19
+
20
  # normalize_text có thể là async, nhưng RQ chỉ chạy sync nên cần chạy event loop nếu cần
21
  import asyncio
22
+ asyncio.run(
23
+ note_service.save_transcript(
24
+ note_id=note_id,
25
+ raw_text=raw_text,
26
+ normalized_text=normalized,
27
+ keywords=keywords,
28
+ summary=summary,
29
+ mindmap=mindmap,
30
+ duration=None,
31
+ sample_rate=None,
32
+ chunks=None,
33
+ asr_model="PhoWhisper-base",
34
+ normalization_model="gemini-1.5",
35
+ )
 
36
  )
37
  return True
app/services/mindmap_service.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio, json
2
+ from app.config.settings import GEMINI_API_KEY
3
+ import google.generativeai as genai
4
+
5
+ if GEMINI_API_KEY:
6
+ genai.configure(api_key=GEMINI_API_KEY)
7
+ _model = genai.GenerativeModel("gemini-pro")
8
+ else:
9
+ _model = None
10
+
11
+
12
+ async def generate_mindmap(text: str) -> dict:
13
+ if not _model:
14
+ return {}
15
+
16
+ prompt = f"""
17
+ Bạn là chuyên gia tạo Sơ đồ tư duy. Hãy phân tích văn bản sau và tạo cấu trúc JSON Mindmap.
18
+ Yêu cầu:
19
+ 1. Xác định Ý chính làm Root.
20
+ 2. Phân tách ý phụ thành nhánh con (tối đa 3 cấp).
21
+ 3. Nhãn (label) ngắn gọn (< 7 từ).
22
+ 4. Màu sắc (colorHex): Root="#6200EE", Con="#F59E2B", "#2ECF9A", "#2F9BFF".
23
+
24
+ Cấu trúc JSON bắt buộc (Chỉ trả về JSON):
25
+ {{
26
+ "root": {{
27
+ "label": "Chủ đề",
28
+ "colorHex": "#6200EE",
29
+ "children": [
30
+ {{
31
+ "label": "Ý 1",
32
+ "colorHex": "#F59E2B",
33
+ "children": []
34
+ }}
35
+ ]
36
+ }}
37
+ }}
38
+
39
+ Văn bản:
40
+ {text}
41
+ """
42
+
43
+ loop = asyncio.get_event_loop()
44
+
45
+ def call():
46
+ r = _model.generate_content(prompt)
47
+ return r.text
48
+
49
+ raw = await loop.run_in_executor(None, call)
50
+
51
+ start = raw.find("{")
52
+ end = raw.rfind("}")
53
+ if start != -1 and end != -1:
54
+ return json.loads(raw[start:end+1])
55
+
56
+ return {}
app/services/nlp_postprocess.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.infra.redis_client import redis_client
2
+ from app.utils.hashing import sha256
3
+ from app.config.settings import GEMINI_API_KEY
4
+ import google.generativeai as genai
5
+ import asyncio
6
+ import json
7
+
8
+ CACHE_TTL = 60 * 60 * 24 * 3 # 3 days
9
+
10
+ if GEMINI_API_KEY:
11
+ genai.configure(api_key=GEMINI_API_KEY)
12
+ _model = genai.GenerativeModel("gemini-pro")
13
+ else:
14
+ _model = None
15
+
16
+
17
+ async def normalize_and_extract(raw_text: str) -> dict:
18
+ """
19
+ return {
20
+ "normalized_text": "...",
21
+ "keywords": [...]
22
+ }
23
+ """
24
+ cache_key = f"nlp:{sha256(raw_text)}"
25
+ cached = redis_client.get(cache_key)
26
+ if cached:
27
+ return json.loads(cached)
28
+
29
+ prompt = f"""
30
+ Bạn là một hệ thống Xử lý Hậu kỳ NLP (NLP Post-Processing) Tiếng Việt.
31
+
32
+ Đầu vào là văn bản thô (raw transcript), có thể thiếu dấu câu và sai chính tả do nhận dạng giọng nói (ví dụ: 'ăn chứa' -> 'ăn chưa').
33
+
34
+ Nhiệm vụ (Trả về JSON duy nhất):
35
+ 1. [ASR Correction & Punctuation]: Sửa lỗi chính tả ASR, thêm dấu câu, viết hoa chuẩn xác.
36
+
37
+ Văn bản đầu vào: \"\"\"{raw_text}\"\"\"
38
+
39
+ Cấu trúc JSON bắt buộc:
40
+ {{
41
+ "normalizedText": "Văn bản đã sửa hoàn chỉnh...",
42
+ "keywords": ["Từ khóa 1", "Từ khóa 2", "..."]
43
+ }}
44
+ """
45
+
46
+ result = {
47
+ "normalized_text": raw_text,
48
+ "keywords": []
49
+ }
50
+
51
+ if _model:
52
+ loop = asyncio.get_event_loop()
53
+
54
+ def call():
55
+ r = _model.generate_content(prompt)
56
+ return r.text
57
+
58
+ text = await loop.run_in_executor(None, call)
59
+
60
+ # clean JSON
61
+ start = text.find("{")
62
+ end = text.rfind("}")
63
+ if start != -1 and end != -1:
64
+ data = json.loads(text[start:end+1])
65
+ result = {
66
+ "normalized_text": data.get("normalizedText", raw_text),
67
+ "keywords": data.get("keywords", [])
68
+ }
69
+
70
+ redis_client.setex(cache_key, CACHE_TTL, json.dumps(result))
71
+ return result
app/services/note_client.py CHANGED
@@ -18,6 +18,7 @@ class NoteServiceClient:
18
  )
19
  )
20
  async def save_transcript(self, note_id: str, raw_text: str, normalized_text: str,
 
21
  duration: float, sample_rate: int, chunks: list,
22
  asr_model: str = "PhoWhisper-base",
23
  normalization_model: str = "gemini-1.5"):
@@ -25,6 +26,9 @@ class NoteServiceClient:
25
  payload = {
26
  "raw_text": raw_text,
27
  "normalized_text": normalized_text,
 
 
 
28
  "duration": duration,
29
  "sample_rate": sample_rate,
30
  "chunks": chunks,
 
18
  )
19
  )
20
  async def save_transcript(self, note_id: str, raw_text: str, normalized_text: str,
21
+ keywords: list, summary: str, mindmap: dict,
22
  duration: float, sample_rate: int, chunks: list,
23
  asr_model: str = "PhoWhisper-base",
24
  normalization_model: str = "gemini-1.5"):
 
26
  payload = {
27
  "raw_text": raw_text,
28
  "normalized_text": normalized_text,
29
+ "keywords": keywords,
30
+ "summary": summary,
31
+ "mindmap": mindmap,
32
  "duration": duration,
33
  "sample_rate": sample_rate,
34
  "chunks": chunks,
app/services/summary_service.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ from app.config.settings import GEMINI_API_KEY
3
+ import google.generativeai as genai
4
+
5
+ if GEMINI_API_KEY:
6
+ genai.configure(api_key=GEMINI_API_KEY)
7
+ _model = genai.GenerativeModel("gemini-pro")
8
+ else:
9
+ _model = None
10
+
11
+
12
+ async def generate_summary(text: str) -> str:
13
+ if not _model:
14
+ return ""
15
+
16
+ prompt = f"""
17
+ Bạn là chuyên gia tóm tắt. Hãy tóm tắt văn bản sau thành **một đoạn văn duy nhất**.
18
+ Yêu cầu:
19
+ 1. Viết khoảng 3-5 câu, tổng hợp đầy đủ chủ đề và các ý chính.
20
+ 2. Viết liền mạch, KHÔNG xuống dòng, KHÔNG dùng gạch đầu dòng hay đánh số.
21
+ 3. Chỉ dựa trên thông tin được cung cấp, tuyệt đối KHÔNG tự thêm thông tin bên ngoài.
22
+ 4. Trả về văn bản thuần (plain text).
23
+
24
+ Văn bản:
25
+ \"\"\"{text}\"\"\"
26
+ """
27
+
28
+ loop = asyncio.get_event_loop()
29
+
30
+ def call():
31
+ r = _model.generate_content(prompt)
32
+ return r.text.strip()
33
+
34
+ result = await loop.run_in_executor(None, call)
35
+ return result.replace("```", "").strip()