Yasu777 committed on
Commit
882bbf7
·
verified ·
1 Parent(s): 6849f0f

Update first.py

Browse files
Files changed (1) hide show
  1. first.py +25 -225
first.py CHANGED
@@ -1,229 +1,29 @@
1
  # -*- coding: utf-8 -*-
2
 
3
- import os
4
- import requests
5
- import re
6
- import jaconv
7
  import sys
8
- import openai
9
- from janome.tokenizer import Tokenizer
10
- from bs4 import BeautifulSoup
11
- from sklearn.feature_extraction.text import TfidfVectorizer
12
- from sklearn.decomposition import LatentDirichletAllocation
13
- import langchain
14
- from langchain import OpenAI
15
- from langchain.text_splitter import TokenTextSplitter
16
- from langchain.prompts import PromptTemplate
17
- from langchain.chains import LLMChain
18
- from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain
19
- from langchain.chains.combine_documents.stuff import StuffDocumentsChain
20
- from typing import Any, List, Mapping, Optional
21
- from langchain.chat_models import ChatOpenAI
22
- import cchardet
23
-
24
- # APIキーの設定
25
- openai.api_key = os.getenv("OPENAI_API_KEY")
26
-
27
- url1 = sys.argv[1]
28
- url2 = sys.argv[2]
29
- url3 = sys.argv[3]
30
-
31
- urls = [url1, url2, url3]
32
-
33
- # エラーが発生したURLを保存するファイル
34
- error_url_file = "error_urls.txt"
35
-
36
- # エラーが発生したURLを読み込む
37
- try:
38
- with open(error_url_file, "r") as f:
39
- error_urls = f.read().splitlines()
40
- except FileNotFoundError:
41
- error_urls = []
42
-
43
- texts = []
44
- num_topics = 3
45
- tfidf_threshold = 0.1 # TF-IDFの閾値
46
- n_top_words = 10 # 各トピックのトップNのキーワードを抽出
47
-
48
- stop_words = ["こちら","の", "に", "は", "を", "た", "が", "で", "て", "と", "し", "れ", "さ", "ある", "いる", "も", "する", "から", "な", "こと", "として", "い", "や", "れる", "など", "なっ", "ない", "この", "ため", "その", "あっ", "よう", "また", "もの", "という", "あり", "まで", "られ", "なる", "へ", "か", "だ", "これ", "によって", "により", "おり", "より", "による", "ず", "なり", "られる", "において", "ば", "なかっ", "なく", "しかし", "について", "せ", "だっ", "その後", "できる", "それ", "う", "ので", "なお", "のみ", "でき", "き", "つ", "における", "および", "いう", "さらに", "でも", "ら", "たり", "その他", "または", "ながら", "つつ", "とも", "これら", "ところ", "ここ", "です", "ます", "ましょ", "ください"]
49
-
50
- # janomeの初期化
51
- t = Tokenizer()
52
-
53
def url_to_filepath(url):
    """Derive a filesystem-safe name from *url*.

    Strips a leading "https://" and replaces path/query separators
    with underscores so the result can be used as a file name.
    """
    safe_name = url.replace("https://", "")
    for separator in ("/", "?", "&"):
        safe_name = safe_name.replace(separator, "_")
    return safe_name
55
-
56
def extract_text_from_url(url, output_file):
    """Download *url*, clean the HTML, and cache the <p>-tag text in *output_file*.

    Returns the extracted text (capped near 7500 characters), or None on a
    non-HTTP error. On requests.HTTPError only a message is printed and None
    is returned implicitly. Relies on the module-level globals `error_urls`
    and `error_url_file` to record failing URLs.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raises stored HTTPError, if one occurred.
        # Detect the page encoding from the raw bytes, then decode with it.
        encoding = cchardet.detect(response.content)['encoding']
        response.encoding = encoding
        text = response.text
        # Strip runs of 3+ digits and whole <table> sections before parsing.
        text = re.sub(r"\d{3,}", "", text)
        text = re.sub(r"<table.*?/table>", "", text, flags=re.DOTALL)
        # Width normalization: digits/ASCII to full-width, then kana/ASCII
        # back to half-width (net: full-width digits, half-width kana/ASCII).
        text = jaconv.h2z(text, kana=False, digit=True, ascii=True)
        text = jaconv.z2h(text, kana=True, digit=False, ascii=True)

        # Replace no-break spaces with regular spaces.
        text = text.replace('\xa0', ' ')

        soup = BeautifulSoup(text, "html.parser")
        p_tags = soup.find_all("p")
        output_text = ""
        for p in p_tags:
            if len(output_text) + len(p.get_text()) > 7500:
                break  # Stop once the accumulated text would exceed 7500 chars.
            output_text += p.get_text()
        output_text = output_text.replace("\n", "")
        output_text = output_text.replace('\xa0', ' ')

        output_dir = os.path.dirname(os.path.abspath(output_file))
        os.makedirs(output_dir, exist_ok=True)  # Create the cache directory if needed.

        # NOTE(review): the cache is written with the *detected* page encoding,
        # but extract_text_from_urls() reads these files back as UTF-8 —
        # confirm the two agree for non-UTF-8 pages.
        with open(output_file, "w", encoding=encoding) as f:
            f.write(output_text)

        return output_text
    except requests.HTTPError as http_err:
        # HTTP failures are only reported; the URL is NOT added to error_urls.
        print(f'HTTP error occurred: {http_err}')
    except Exception as err:
        print(f'Other error occurred: {err}')
        # Record the failing URL so later runs can skip it.
        error_urls.append(url)
        with open(error_url_file, "w") as f:
            for error_url in error_urls:
                f.write(error_url + "\n")
        return None
98
-
99
def extract_text_from_urls(urls: List[str]) -> List[str]:
    """Return usable page texts for *urls*, using per-URL cache files.

    Each URL i is cached in "output0-{i+1}.txt"; an existing cache file is
    read back instead of re-downloading. Texts that are falsy or equal to
    "エラー" are dropped from the result.
    """
    results: List[str] = []
    for index, page_url in enumerate(urls, start=1):
        cache_path = f"output0-{index}.txt"
        if os.path.exists(cache_path):
            print(f"File already exists: {cache_path}")
            with open(cache_path, "r", encoding="utf-8") as cache:
                content = cache.read()
        else:
            print(f"Extracting text from: {page_url}")
            content = extract_text_from_url(page_url, cache_path)
        if content and content != "エラー":
            results.append(content)
    print("Extracted texts:", results)
    return results
114
-
115
# Skip URLs that failed in a previous run.
urls = [url for url in urls if url not in error_urls]

extracted_texts = extract_text_from_urls(urls)

# NOTE(review): this loop re-downloads every URL even though
# extract_text_from_urls() just populated the per-URL cache files, and
# `output_text` can be None on failure, which would make the `+`
# concatenation below raise TypeError — confirm this is intended.
combined_text = ""  # All page texts joined together
for i, url in enumerate(urls):
    output_file = f"output0-{i+1}.txt"
    output_text = extract_text_from_url(url, output_file)
    texts.append(output_text)
    combined_text += output_text + " "  # Append to the combined text

# LDA for combined text
combined_text = combined_text.lower()  # Lowercase before tokenizing
tokens = [token.surface for token in t.tokenize(combined_text)]  # Tokenize with janome
words = [word for word in tokens if word not in stop_words]  # Drop stop words

if words:
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    # One TF-IDF row per page text (each re-tokenized with janome).
    X = vectorizer.fit_transform([' '.join([token.surface for token in t.tokenize(text)]) for text in texts])
    print("Number of texts:", len(texts))
    print("Shape of X:", X.shape)
    feature_names = vectorizer.get_feature_names_out()

    # LDA topic model over the TF-IDF matrix.
    lda = LatentDirichletAllocation(n_components=num_topics)
    X_lda = lda.fit_transform(X)

    # Extract top keywords for each topic.
    topic_keywords = [[] for _ in range(num_topics)]  # Store topic keywords
    for topic_idx, topic in enumerate(lda.components_):
        # argsort is ascending, so take the last n_top_words in reverse order.
        top_keyword_indices = topic.argsort()[:-n_top_words - 1:-1]
        topic_keywords[topic_idx].extend([feature_names[i] for i in top_keyword_indices])

    # Write topic keywords to output1.txt (overwrites any previous run).
    with open("output1.txt", "w", encoding="utf-8") as f:
        f.write("出現頻度の高いキーワードTOP{} :\n".format(n_top_words))
        f.write("\n".join([", ".join(topic) for topic in topic_keywords]))
        f.write("\n\n")
else:
    print("No words found for LDA processing.")

# TF-IDF Vectorization (per-text keyword extraction).
vectorizer = TfidfVectorizer(stop_words=stop_words)
X = vectorizer.fit_transform([' '.join([token.surface for token in t.tokenize(text)]) for text in texts])
feature_names = vectorizer.get_feature_names_out()

# Collect features whose TF-IDF score clears the threshold.
high_tfidf_features = []
for text_id in range(len(texts)):
    text = texts[text_id].lower()  # Lowercase before tokenizing
    tokens = [token.surface for token in t.tokenize(text)]  # Tokenize with janome
    words = [word for word in tokens if word not in stop_words]  # Drop stop words

    if not words:
        continue

    # A fresh vectorizer per text: scores are within-document only.
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    X = vectorizer.fit_transform([' '.join(words)])
    feature_names = vectorizer.get_feature_names_out()

    feature_index = X.nonzero()[1]
    top_keywords = [feature_names[i] for i in feature_index if X[0, i] >= tfidf_threshold][:n_top_words]
    high_tfidf_features.append(top_keywords)

# NOTE(review): this writes only `top_keywords` from the LAST loop iteration
# (and raises NameError if the loop never appended); it likely meant to write
# `high_tfidf_features` — confirm intent before relying on output1.txt.
with open("output1.txt", "a", encoding="utf-8") as f:
    f.write("重要なキーワード:\n")
    f.write(", ".join(top_keywords))
    f.write("\n\n")

# Extract text subjects with an LLM map-reduce chain.
model_name = "gpt-3.5-turbo-1106"
llm = ChatOpenAI(model_name=model_name, temperature=0.7)
text_splitter = TokenTextSplitter(chunk_size=5000, chunk_overlap=500)
document_splits = []

for file_path in ["output0-1.txt", "output0-2.txt", "output0-3.txt"]:
    with open(file_path, "rb") as file:
        content = file.read()
        # Re-detect each cache file's encoding; skip undecodable files.
        encoding = cchardet.detect(content)['encoding']
        if encoding is None:
            print(f"Warning: Could not determine encoding for {file_path}. File might contain binary data. Skipping this file.")
            continue
        try:
            text = content.decode(encoding)
            document_splits.extend(text_splitter.create_documents([text]))
        except UnicodeDecodeError:
            print(f"Error: Failed to decode {file_path} using {encoding}. Skipping this file.")
            continue

# Prompt asking the model for each chunk's subject (answer in Japanese).
prompt_subject = PromptTemplate(
    input_variables=["text"],
    template="""Text: {text}
Textの主題を抽出し、主題:〇〇という形で教えてください。Please tell me in Japanese.:
*主題:
*"""
)

chain_subject = LLMChain(llm=llm, prompt=prompt_subject, verbose=True)
map_reduce_chain = MapReduceDocumentsChain(
    llm_chain=chain_subject,
    combine_document_chain=StuffDocumentsChain(llm_chain=chain_subject, verbose=True),
    verbose=True
)

subjects = map_reduce_chain.run(input_documents=document_splits, token_max=50000)
print(subjects)

# NOTE(review): `run` returns a string, so this loop iterates `subjects`
# character by character, writing each char followed by a space — confirm
# this is the intended output format.
with open("output1.txt", "a", encoding="utf-8") as f:
    f.write("主題:\n")
    for subject in subjects:
        f.write(subject + " ")
    f.write("\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # -*- coding: utf-8 -*-
2
 
 
 
 
 
3
import os
import re
import sys

from sklearn.feature_extraction.text import CountVectorizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
def process_keywords(text):
    """Generate 1- to 3-gram keyword strings from *text*.

    Commas and newlines are collapsed into single spaces, then
    CountVectorizer extracts all unigrams, bigrams, and trigrams.

    Returns the vectorizer's feature-name array (one entry per n-gram,
    sorted alphabetically by CountVectorizer).
    """
    # Normalize separators: runs of commas/newlines become one space.
    # Fix: `re` was used here without ever being imported.
    text = re.sub(r"[,\n]+", " ", text)
    # Build the n-gram vocabulary over the single normalized document.
    vectorizer = CountVectorizer(ngram_range=(1, 3))
    X = vectorizer.fit_transform([text])
    features = vectorizer.get_feature_names_out()
    return features
16
+
17
def save_keywords(keywords, filename="output1.txt"):
    """Write each keyword to *filename* (UTF-8), one per line."""
    lines = [f"{keyword}\n" for keyword in keywords]
    with open(filename, 'w', encoding='utf-8') as out:
        out.writelines(lines)
22
+
23
if __name__ == "__main__":
    # A single command-line argument carries the raw keyword text.
    if len(sys.argv) <= 1:
        print("エラー: コマンドライン引数としてキーワードが提供されていません。")
    else:
        raw_keywords = sys.argv[1]  # Keyword text from the command line
        # Extract n-grams and persist them to the default output file.
        save_keywords(process_keywords(raw_keywords))