# -*- coding: utf-8 -*-
import os
import requests
import re
import jaconv
import sys
import openai
from janome.tokenizer import Tokenizer
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import langchain
from langchain import OpenAI
from langchain.text_splitter import TokenTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from typing import Any, List, Mapping, Optional
from langchain.chat_models import ChatOpenAI
import cchardet
# Configure the OpenAI client with the API key from the environment.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Three target URLs are supplied as command-line arguments.
url1 = sys.argv[1]
url2 = sys.argv[2]
url3 = sys.argv[3]
urls = [url1, url2, url3]

# File that persists URLs whose extraction failed on previous runs.
error_url_file = "error_urls.txt"
# Load the previously failed URLs (empty list on first run).
try:
    with open(error_url_file, "r") as f:
        error_urls = f.read().splitlines()
except FileNotFoundError:
    error_urls = []

texts = []  # extracted page texts, filled in below
num_topics = 3  # number of LDA topics
tfidf_threshold = 0.1  # minimum TF-IDF score for a keyword to be kept
n_top_words = 10  # number of top keywords extracted per topic
# Japanese stop words (particles, auxiliary verbs, etc.) excluded from
# the TF-IDF / LDA vocabulary.
stop_words = ["こちら","の", "に", "は", "を", "た", "が", "で", "て", "と", "し", "れ", "さ", "ある", "いる", "も", "する", "から", "な", "こと", "として", "い", "や", "れる", "など", "なっ", "ない", "この", "ため", "その", "あっ", "よう", "また", "もの", "という", "あり", "まで", "られ", "なる", "へ", "か", "だ", "これ", "によって", "により", "おり", "より", "による", "ず", "なり", "られる", "において", "ば", "なかっ", "なく", "しかし", "について", "せ", "だっ", "その後", "できる", "それ", "う", "ので", "なお", "のみ", "でき", "き", "つ", "における", "および", "いう", "さらに", "でも", "ら", "たり", "その他", "または", "ながら", "つつ", "とも", "これら", "ところ", "ここ", "です", "ます", "ましょ", "ください"]

# Initialize the janome tokenizer (Japanese morphological analysis).
t = Tokenizer()
def url_to_filepath(url):
    """Convert a URL into a filesystem-safe file name.

    Strips the scheme prefix and replaces the path/query separators
    ("/", "?", "&") with underscores.

    BUGFIX: the original only removed "https://", so plain-HTTP URLs
    kept a "http:" prefix containing characters illegal in file names.
    """
    # Anchored scheme strip: handles both http:// and https://.
    name = re.sub(r"^https?://", "", url)
    for separator in ("/", "?", "&"):
        name = name.replace(separator, "_")
    return name
def extract_text_from_url(url, output_file):
    """Download *url*, extract up to 7500 chars of <p> text, cache and return it.

    The cleaned text is written to *output_file* in UTF-8 (the caller
    reads the cache back with encoding="utf-8").  On any error the URL
    is appended to the module-level ``error_urls`` list, the list is
    persisted to ``error_url_file``, and None is returned.
    """
    try:
        # Timeout added so a hung server cannot stall the whole script.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # raise HTTPError for 4xx/5xx status
        # Detect the page encoding from the raw bytes; keep requests'
        # own guess when detection fails.
        encoding = cchardet.detect(response.content)['encoding']
        if encoding:
            response.encoding = encoding
        text = response.text
        # Drop long digit runs (ids, phone numbers) and whole <table> blocks.
        text = re.sub(r"\d{3,}", "", text)
        text = re.sub(r"<table.*?/table>", "", text, flags=re.DOTALL)
        # Width normalization: net effect is full-width digits with
        # half-width kana and ASCII.
        text = jaconv.h2z(text, kana=False, digit=True, ascii=True)
        text = jaconv.z2h(text, kana=True, digit=False, ascii=True)
        # Replace no-break spaces with ordinary spaces.
        text = text.replace('\xa0', ' ')
        soup = BeautifulSoup(text, "html.parser")
        output_text = ""
        for p in soup.find_all("p"):
            p_text = p.get_text()
            if len(output_text) + len(p_text) > 7500:
                break  # stop once the 7500-character budget would be exceeded
            output_text += p_text
        output_text = output_text.replace("\n", "")
        output_text = output_text.replace('\xa0', ' ')
        output_dir = os.path.dirname(os.path.abspath(output_file))
        os.makedirs(output_dir, exist_ok=True)  # ensure the cache directory exists
        # BUGFIX: always write the cache in UTF-8.  The original wrote
        # with the *detected source* encoding, but the cache is read back
        # elsewhere with encoding="utf-8", corrupting non-UTF-8 pages.
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(output_text)
        return output_text
    except requests.HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
    except Exception as err:
        print(f'Other error occurred: {err}')
    # Reached only on error: record the failing URL so later runs skip it.
    error_urls.append(url)
    with open(error_url_file, "w") as f:
        for error_url in error_urls:
            f.write(error_url + "\n")
    return None
def extract_text_from_urls(urls: List[str]) -> List[str]:
    """Return the extracted text for each URL, reusing cached output files.

    For the i-th URL the cache file is ``output0-{i+1}.txt``; it is read
    back (UTF-8) when present, otherwise the page is fetched via
    ``extract_text_from_url``.  Failed extractions are skipped.
    """
    extracted_texts = []
    for i, url in enumerate(urls):
        output_file = f"output0-{i+1}.txt"
        if not os.path.exists(output_file):
            print(f"Extracting text from: {url}")
            text = extract_text_from_url(url, output_file)
        else:
            print(f"File already exists: {output_file}")
            with open(output_file, "r", encoding="utf-8") as cached:
                text = cached.read()
        # Drop failures (None) and the legacy "エラー" sentinel.
        if text and text != "エラー":
            extracted_texts.append(text)
    print("Extracted texts:", extracted_texts)
    return extracted_texts
# Skip URLs that failed on previous runs.
urls = [url for url in urls if url not in error_urls]
extracted_texts = extract_text_from_urls(urls)

# BUGFIX: the original re-downloaded every URL here, ignoring the cache
# that extract_text_from_urls just built, and appended the raw return
# value — a failed fetch put None into ``texts`` and then crashed the
# string concatenation with a TypeError.  Reuse the validated results.
texts = list(extracted_texts)
combined_text = "".join(text + " " for text in texts)  # combined corpus text

# --- LDA over the combined corpus ---
combined_text = combined_text.lower()  # lowercase the corpus
tokens = [token.surface for token in t.tokenize(combined_text)]  # tokenize
words = [word for word in tokens if word not in stop_words]  # drop stop words
if words:
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    X = vectorizer.fit_transform([' '.join([token.surface for token in t.tokenize(text)]) for text in texts])
    print("Number of texts:", len(texts))
    print("Shape of X:", X.shape)
    feature_names = vectorizer.get_feature_names_out()

    # Fit LDA and extract the top-N keywords for each topic.
    lda = LatentDirichletAllocation(n_components=num_topics)
    X_lda = lda.fit_transform(X)
    topic_keywords = [[] for _ in range(num_topics)]
    for topic_idx, topic in enumerate(lda.components_):
        # argsort ascending, so take the last n_top_words in reverse.
        top_keyword_indices = topic.argsort()[:-n_top_words - 1:-1]
        topic_keywords[topic_idx].extend([feature_names[i] for i in top_keyword_indices])

    # Write the topic keywords to output1.txt (overwrite: first section).
    with open("output1.txt", "w", encoding="utf-8") as f:
        f.write("出現頻度の高いキーワードTOP{} :\n".format(n_top_words))
        f.write("\n".join([", ".join(topic) for topic in topic_keywords]))
        f.write("\n\n")
else:
    print("No words found for LDA processing.")
# --- Per-text TF-IDF keyword extraction ---
# Corpus-level fit retained from the original (its results are replaced
# inside the loop below by per-text vectorizers).
vectorizer = TfidfVectorizer(stop_words=stop_words)
X = vectorizer.fit_transform([' '.join([token.surface for token in t.tokenize(text)]) for text in texts])
feature_names = vectorizer.get_feature_names_out()

# For each text, keep the features whose TF-IDF score meets the threshold.
high_tfidf_features = []
for text_id in range(len(texts)):
    text = texts[text_id].lower()  # lowercase the text
    tokens = [token.surface for token in t.tokenize(text)]  # tokenize
    words = [word for word in tokens if word not in stop_words]  # drop stop words
    if not words:
        continue  # nothing left to vectorize for this text
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    X = vectorizer.fit_transform([' '.join(words)])  # single-document fit
    feature_names = vectorizer.get_feature_names_out()
    feature_index = X.nonzero()[1]
    top_keywords = [feature_names[i] for i in feature_index if X[0, i] >= tfidf_threshold][:n_top_words]
    high_tfidf_features.append(top_keywords)

# Append the important keywords to output1.txt.
# BUGFIX: the original wrote only ``top_keywords`` — the loop variable
# left over from the LAST iteration — instead of the keywords collected
# for every text in ``high_tfidf_features``.
with open("output1.txt", "a", encoding="utf-8") as f:
    f.write("重要なキーワード:\n")
    for keywords in high_tfidf_features:
        f.write(", ".join(keywords))
        f.write("\n")
    f.write("\n")
# --- Extract the subject of each cached text with an LLM ---
model_name = "gpt-3.5-turbo-1106"
llm = ChatOpenAI(model_name=model_name, temperature=0.7)
text_splitter = TokenTextSplitter(chunk_size=5000, chunk_overlap=500)

# Load the cached pages, decoding with the detected encoding, and split
# them into token-bounded documents for the map-reduce chain.
document_splits = []
for file_path in ["output0-1.txt", "output0-2.txt", "output0-3.txt"]:
    with open(file_path, "rb") as file:
        content = file.read()
    encoding = cchardet.detect(content)['encoding']
    if encoding is None:
        print(f"Warning: Could not determine encoding for {file_path}. File might contain binary data. Skipping this file.")
        continue
    try:
        text = content.decode(encoding)
        document_splits.extend(text_splitter.create_documents([text]))
    except UnicodeDecodeError:
        print(f"Error: Failed to decode {file_path} using {encoding}. Skipping this file.")
        continue

prompt_subject = PromptTemplate(
    input_variables=["text"],
    template="""Text: {text}
Textの主題を抽出し、主題:〇〇という形で教えてください。Please tell me in Japanese.:
*主題:
*"""
)
chain_subject = LLMChain(llm=llm, prompt=prompt_subject, verbose=True)
# NOTE(review): MapReduceDocumentsChain is built without an explicit
# document_variable_name; this relies on the single-input prompt being
# resolved automatically — confirm against the installed langchain version.
map_reduce_chain = MapReduceDocumentsChain(
    llm_chain=chain_subject,
    combine_document_chain=StuffDocumentsChain(llm_chain=chain_subject, verbose=True),
    verbose=True
)
subjects = map_reduce_chain.run(input_documents=document_splits, token_max=50000)
print(subjects)

# Append the extracted subject to output1.txt.
with open("output1.txt", "a", encoding="utf-8") as f:
    f.write("主題:\n")
    # BUGFIX: ``run`` returns a single string; the original iterated it
    # character by character, writing "a b c ..." into the report.
    f.write(subjects + " ")
    f.write("\n")