Spaces:

Corin1998
/

Auto_PPT_Generator

Sleeping

File size: 4,549 Bytes

f73aa58
 
d13b751
 
f73aa58
a19c579
f73aa58
a19c579
 
 
 
f73aa58
 
a19c579
f73aa58
a19c579
f73aa58
a19c579
f73aa58
 
a19c579
 
 
 
 
 
f73aa58
 
 
 
 
a19c579
 
f73aa58
a19c579
f73aa58
 
 
 
 
 
 
a19c579
 
f73aa58
 
 
 
 
a19c579
 
 
f73aa58
a19c579
 
f73aa58
 
 
 
 
a19c579
 
f73aa58
 
 
 
 
 
 
 
 
 
a19c579
 
f73aa58
 
 
 
 
 
a19c579
f73aa58
 
 
 
 
 
 
 
 
 
 
 
a19c579
f73aa58
 
a19c579
 
f73aa58
a19c579
f73aa58
 
 
 
a19c579
f73aa58
 
a19c579
f73aa58
a19c579
f73aa58
 
a19c579
 
f73aa58
 
 
 
 
 
a19c579
 
f73aa58
 
a19c579
f73aa58
 
 
 
a19c579
 
 
 
 
 
 
f73aa58
 
a19c579
 
f73aa58
a19c579
f73aa58

import re
from typing import Dict, List, Tuple, Any
# NOTE: switched from a relative to an absolute import (llm is loaded directly from under the modules directory).
from llm import LLMClient

# ----------------- Regex helpers -----------------
# A list item: a bullet glyph (-, *, •, ・) or a number followed by "." or ")",
# then at least one whitespace character; group 1 is the item text.
LIST_BULLET = re.compile(r"^(?:[-*•・]|\d+\.|\d+\))\s+(.*)")
# A "key: value" line using an ASCII or full-width colon; group 1 is the key
# (which may not contain a colon), group 2 is the remainder of the line.
KEYVAL_LINE = re.compile(r"^\s*([^:：]+?)\s*[:：]\s*([^\n]+?)\s*$")
# Same shape as KEYVAL_LINE, but the value (group 2) must be an optionally
# signed integer or decimal number.
LABEL_NUM = re.compile(r"^\s*([^:：]+?)\s*[:：]\s*([+-]?\d+(?:\.\d+)?)\s*$")
# A heading: one or more "#" or a numbered marker ("1." / "1)"), optional
# whitespace, then the title in group 2. A numbered marker followed by
# whitespace also matches LIST_BULLET.
HEADER = re.compile(r"^(#+|\d+\.|\d+\))\s*(.+)$")


def naive_section_split(text: str, target_chars: int = 1200) -> List[Tuple[str, str]]:
    """Split text into ``(title, content)`` sections.

    A line matching ``HEADER`` (after stripping) closes the current section and
    starts a new one titled with the heading text; the heading line itself is
    not part of any content. Independently of headings, a section is closed as
    soon as its accumulated line lengths exceed ``target_chars``, and the text
    that follows is titled ``セクション<N>`` where N is the next section number.

    Args:
        text: The full document text.
        target_chars: Size threshold (sum of line lengths, newlines excluded)
            above which the current section is closed.

    Returns:
        The sections in document order. Buffers that contain only whitespace
        are dropped, so no section has empty content. If nothing qualifies,
        a single ``("本文", text)`` fallback section is returned.
    """
    sections: List[Tuple[str, str]] = []
    cur_title = "セクション"
    cur_buf: List[str] = []
    # Running size of cur_buf so the threshold check is O(1) per line instead
    # of re-summing the whole buffer each time.
    cur_len = 0

    def flush() -> None:
        nonlocal cur_buf, cur_len
        content = "\n".join(cur_buf).strip()
        # Blank lines before or between headings would otherwise produce a
        # section with empty content.
        if content:
            sections.append((cur_title, content))
        cur_buf = []
        cur_len = 0

    for ln in text.splitlines():
        m = HEADER.match(ln.strip())
        if m:
            flush()
            cur_title = m.group(2).strip()
            continue
        cur_buf.append(ln)
        cur_len += len(ln)
        if cur_len > target_chars:
            flush()
            cur_title = f"セクション{len(sections)+1}"
    flush()

    # Fallback single section
    if not sections:
        sections = [("本文", text)]
    return sections


# Sentence terminators for the fallback path: "。" and half-/full-width "!" and
# "?", plus "." unless it sits between two digits (i.e. is a decimal point).
_SENTENCE_END = re.compile(r"(?:[。!?！？]|(?<!\d)\.|\.(?!\d))\s*")


def extract_bullets(section_text: str, max_items: int = 8) -> List[str]:
    """Return up to ``max_items`` bullet strings for a section.

    Lines matching ``LIST_BULLET`` (after stripping) are used as bullets, in
    order. When the section has no such lines, the text is split into
    sentences instead and sentences of 8 to 120 characters are used.

    Args:
        section_text: The body text of one section.
        max_items: Maximum number of bullets to return.

    Returns:
        The bullet texts, without their list markers, at most ``max_items``
        long.
    """
    line_matches = (LIST_BULLET.match(line.strip()) for line in section_text.splitlines())
    bullets: List[str] = [m.group(1).strip() for m in line_matches if m]

    if not bullets:
        # Heuristic fallback: use concise sentences. Splitting must not break
        # decimals such as "3.5", and must honour full-width "！" / "？".
        for sentence in _SENTENCE_END.split(section_text):
            sentence = sentence.strip()
            if 8 <= len(sentence) <= 120:
                bullets.append(sentence)
                if len(bullets) >= max_items:
                    break
    return bullets[:max_items]


def extract_keyval_table(section_text: str) -> List[Tuple[str, str]]:
    """Collect ``(key, value)`` pairs from lines matching ``KEYVAL_LINE``.

    Both sides are stripped; a pair is kept only when neither side is empty
    after stripping. Pairs are returned in line order.
    """
    hits = (KEYVAL_LINE.match(row) for row in section_text.splitlines())
    candidates = (
        (hit.group(1).strip(), hit.group(2).strip())
        for hit in hits
        if hit
    )
    return [(key, value) for key, value in candidates if key and value]


def extract_chart_data(section_text: str, top_k: int = 10) -> List[Tuple[str, float]]:
    """Extract ``(label, number)`` pairs from lines matching ``LABEL_NUM``.

    When a label occurs more than once, the value from its last occurrence is
    used. The result is ordered by absolute value, largest first, and cut to
    the first ``top_k`` entries.
    """
    latest: Dict[str, float] = {}
    for row in section_text.splitlines():
        hit = LABEL_NUM.match(row)
        if hit is None:
            continue
        try:
            number = float(hit.group(2))
        except ValueError:
            continue
        # Re-assigning an existing label keeps its original position in the
        # dict while taking the most recent value.
        latest[hit.group(1).strip()] = number

    ranked = sorted(latest.items(), key=lambda pair: abs(pair[1]), reverse=True)
    return ranked[:top_k]


def process_text(text: str,
                 use_inference_api: bool,
                 summarizer_model: str,
                 generator_model: str,
                 want_summary: bool,
                 want_tables: bool,
                 want_charts: bool,
                 max_summary_words: int = 200) -> Dict[str, Any]:
    """Turn raw text into a summary, sections, bullets, tables and charts.

    Args:
        text: The document text to process.
        use_inference_api: Passed to ``LLMClient`` as ``use_inference_api``.
        summarizer_model: Model name passed to ``LLMClient.summarize``.
        generator_model: Accepted but not used in this function.
        want_summary: When true, ask the client for a summary of ``text``.
        want_tables: When true, extract key/value tables per section.
        want_charts: When true, extract label/number series per section.
        max_summary_words: Passed to ``LLMClient.summarize`` as ``max_words``.

    Returns:
        A dict with keys ``summary`` (the summarizer result, or ``None`` when
        not requested), ``sections`` (list of ``(title, text)``), ``bullets``
        (section index -> bullet list), ``tables`` and ``charts`` (lists of
        dicts, one per section that produced any data).
    """
    llm = LLMClient(use_inference_api=use_inference_api)

    # Executive summary is optional; None means it was not requested.
    summary = (
        llm.summarize(text, model=summarizer_model, max_words=max_summary_words)
        if want_summary
        else None
    )

    # Sectioning is purely rule-based.
    sections = naive_section_split(text)

    bullets_by_section: Dict[int, List[str]] = {}
    tables: List[Dict[str, Any]] = []
    charts: List[Dict[str, Any]] = []

    for position, (heading, content) in enumerate(sections):
        bullets_by_section[position] = extract_bullets(content)

        pairs = extract_keyval_table(content) if want_tables else None
        if pairs:
            tables.append({"title": f"{heading} — 表", "pairs": pairs})

        series = extract_chart_data(content) if want_charts else None
        if series:
            charts.append({"title": f"{heading} — チャート", "series": series})

    return {
        "summary": summary,
        "sections": sections,
        "bullets": bullets_by_section,
        "tables": tables,
        "charts": charts,
    }