# Meeting-minutes pipeline: faster-whisper transcription, BART summarization,
# Flan-T5 action/decision extraction, and Markdown rendering of the minutes.
import os, json, re, datetime
from typing import Dict, List, Optional
from transformers import pipeline
from faster_whisper import WhisperModel
# --------- Models (lazy init) ---------
# Module-level caches for the heavyweight models: each stays None until its
# matching get_*() accessor is first called, so importing this module is cheap.
_SUMMARIZER = None  # transformers summarization pipeline (BART)
_QA = None          # transformers text2text pipeline (Flan-T5)
_WHISPER = None     # faster-whisper transcription model
def get_summarizer():
    """Return the shared BART summarization pipeline, creating it on first use."""
    global _SUMMARIZER
    if _SUMMARIZER is not None:
        return _SUMMARIZER
    _SUMMARIZER = pipeline("summarization", model="facebook/bart-large-cnn")
    return _SUMMARIZER
def get_extractor():
    """Use Flan-T5 for JSON-style action/decision extraction via text2text pipeline.

    The pipeline is built lazily on first call and cached at module level.
    """
    global _QA
    if _QA is not None:
        return _QA
    _QA = pipeline(
        "text2text-generation",
        model="google/flan-t5-large",
        max_new_tokens=256,
    )
    return _QA
def get_whisper(device: str = "auto"):
    """Return the shared faster-whisper model, loading it on first use.

    NOTE(review): `device` only takes effect on the very first call; later
    calls reuse the cached model regardless of the argument passed.
    """
    global _WHISPER
    if _WHISPER is not None:
        return _WHISPER
    _WHISPER = WhisperModel(
        "Systran/faster-whisper-small", device=device, compute_type="int8"
    )
    return _WHISPER
# --------- Core functions ---------
def transcribe_audio(audio_path: str) -> str:
    """Transcribe the audio file at *audio_path* and return the full text.

    Segment texts are stripped and joined with single spaces.
    """
    segments, _info = get_whisper().transcribe(audio_path, beam_size=1)
    pieces = [segment.text.strip() for segment in segments]
    return " ".join(pieces).strip()
def summarize(text: str) -> str:
    """Summarize *text* with BART, chunking long inputs.

    The transcript is split into ~2200-character sentence-aligned chunks,
    each chunk is summarized independently, and (when there is more than
    one chunk) the concatenated partial summaries are summarized a second
    time into the final result.

    Returns "" for empty or whitespace-only input instead of crashing:
    the summarization pipeline raises on empty text.
    """
    if not text or not text.strip():
        return ""
    summarizer = get_summarizer()
    chunks = _chunk(text, 2200)
    partials = [summarizer(ch, do_sample=False)[0]["summary_text"] for ch in chunks]
    # A single chunk is already a summary; a second pass with min_length=60
    # would only pad or distort it.
    if len(partials) == 1:
        return partials[0]
    merged = " ".join(partials)
    final = summarizer(merged, do_sample=False, max_length=200, min_length=60)
    return final[0]["summary_text"]
def extract_actions_decisions(text: str) -> Dict[str, List[str]]:
    """Extract action items and decisions from a (French) meeting transcript.

    Prompts Flan-T5 for strict JSON. Small instruction models frequently wrap
    the JSON in extra prose, so only the outermost ``{...}`` span of the
    generation is parsed; non-string list items are ignored. If parsing fails
    entirely, falls back to a keyword scan of the raw transcript for
    "action:" / "décision:" style lines.

    Returns ``{"actions": [...], "decisions": [...]}`` with stripped,
    non-empty strings.
    """
    prompt = f"""Tu es un assistant de prise de notes de réunion.
À partir du transcript ci-dessous, extrais :
1) une liste concise de "Points d'action" (qui fait quoi, verbe à l'infinitif, deadline si mentionnée)
2) une liste "Décisions prises" (phrases courtes)
Retourne du JSON strict de la forme :
{{"actions": ["...","..."], "decisions": ["...","..."]}}
Transcript:
{text[:7000]}
"""
    gen = get_extractor()
    out = gen(prompt)[0]["generated_text"]
    try:
        # Keep only the outermost {...} span: the model often surrounds the
        # JSON with extra text, which would make json.loads fail outright.
        match = re.search(r"\{.*\}", out, re.DOTALL)
        data = json.loads(match.group(0) if match else out)
        actions = [
            s.strip() for s in data.get("actions", [])
            if isinstance(s, str) and s.strip()
        ]
        decisions = [
            s.strip() for s in data.get("decisions", [])
            if isinstance(s, str) and s.strip()
        ]
        return {"actions": actions, "decisions": decisions}
    except Exception:
        # Best-effort fallback: scan the transcript itself for tagged lines.
        actions = []
        decisions = []
        for line in text.splitlines():
            if re.search(r"(?i)\b(action|à faire|todo|faire):", line):
                actions.append(re.sub(r"(?i)^.*?:\s*", "", line).strip())
            if re.search(r"(?i)\b(décision|decision):", line):
                decisions.append(re.sub(r"(?i)^.*?:\s*", "", line).strip())
        return {"actions": actions, "decisions": decisions}
def make_minutes_md(title: str, summary: str, actions: List[str], decisions: List[str]) -> str:
    """Render meeting minutes as a Markdown document.

    An empty summary and empty action/decision lists are shown as an
    em-dash placeholder; actions are rendered as unchecked task items.
    """
    stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
    summary_block = summary.strip() if summary else "—"
    action_items = [f"- [ ] {item}" for item in (actions or ["—"])]
    decision_items = [f"- {item}" for item in (decisions or ["—"])]

    doc = [f"# {title} — Compte-rendu", f"_Généré le {stamp}_", ""]
    doc += ["## Résumé", summary_block, ""]
    doc += ["## Points d'action"] + action_items + [""]
    doc += ["## Décisions prises"] + decision_items + [""]
    return "\n".join(doc)
def _chunk(text: str, max_chars: int) -> List[str]:
parts, buf, size = [], [], 0
import re as _re
for sent in _re.split(r'(?<=[\.!\?])\s+', text):
if size + len(sent) > max_chars and buf:
parts.append(" ".join(buf)); buf, size = [], 0
buf.append(sent); size += len(sent) + 1
if buf: parts.append(" ".join(buf))
return parts
|