File size: 2,224 Bytes
2219112
 
 
 
 
7eb64ea
2219112
 
 
de5cdf3
2219112
 
 
de5cdf3
2219112
 
de5cdf3
 
 
 
 
 
 
 
 
 
 
 
 
2219112
 
 
 
7eb64ea
 
 
 
 
 
 
 
 
 
de5cdf3
2219112
 
 
 
de5cdf3
 
 
 
7eb64ea
 
de5cdf3
5c389b4
 
2219112
 
de5cdf3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import gradio as gr
import os
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load blog articles from disk and build a FAISS vector index over their bodies.
articles_dir = "articles"
texts, titles, urls = [], [], []
vectors = []  # one embedding per article, collected while reading

# Sentence-embedding model; all-MiniLM-L6-v2 emits 384-dim vectors.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Expected file layout: line 1 "タイトル:<title>", line 2 "URL:<url>",
# line 3 separator (skipped), body from line 4 onward.
# sorted() makes the index order deterministic across runs.
for fname in sorted(os.listdir(articles_dir)):
    with open(os.path.join(articles_dir, fname), "r", encoding="utf-8") as f:
        lines = f.read().splitlines()  # split once, not per field
    if len(lines) < 2:
        # Malformed file: skip instead of crashing with IndexError.
        continue
    titles.append(lines[0].replace("タイトル:", "").strip())
    urls.append(lines[1].replace("URL:", "").strip())
    body_text = "\n".join(lines[3:])
    texts.append(body_text)
    vectors.append(model.encode(body_text))

# L2-distance index; 384 must match the embedding dimension above.
index = faiss.IndexFlatL2(384)
index.add(np.array(vectors))

# Japanese T5 model used for abstractive summarization of retrieved articles.
t5_model = T5ForConditionalGeneration.from_pretrained("sonoisa/t5-base-japanese")
tokenizer = T5Tokenizer.from_pretrained("sonoisa/t5-base-japanese")

def generate_summary(text):
    """Return a T5-generated summary of *text*.

    Newlines are flattened to spaces, the input is truncated to 512
    tokens, and generation is greedy (no sampling), producing between
    32 and 128 tokens.
    """
    prompt = "summarize: " + text.replace("\n", " ")
    encoded = tokenizer.encode(
        prompt, return_tensors="pt", max_length=512, truncation=True
    )
    generated = t5_model.generate(
        encoded, max_length=128, min_length=32, do_sample=False
    )
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# チャットボット関数 — retrieval + summarization entry point for the UI.
def chat(query, k=3):
    """Answer *query* from the indexed blog articles.

    Embeds the query, retrieves the *k* nearest articles from the FAISS
    index, summarizes their concatenated text (truncated to 1000 chars),
    and returns the summary followed by markdown links to the sources.
    """
    query_vec = model.encode([query])
    _, neighbor_ids = index.search(np.array(query_vec), k=k)

    # FAISS pads results with -1 when the index holds fewer than k
    # vectors; drop those so we never index the lists with -1.
    hits = [(titles[i], urls[i], texts[i]) for i in neighbor_ids[0] if i != -1]

    context = "\n\n".join(body for _, _, body in hits)[:1000]
    summary = generate_summary(context)

    links = "\n".join(f"🔗 [{title}]({url})" for title, url, _ in hits)
    return f"{summary}\n\n参考記事:\n{links}"

# Gradio UI — wire the chat function to a simple text-in / text-out interface.
demo = gr.Interface(
    fn=chat,
    inputs="text",
    outputs="text",
    title="ブログ記事から回答する転職チャットボット",
)
demo.launch()