soojeongcrystal commited on
Commit
e765d87
ยท
verified ยท
1 Parent(s): 8c5b6fa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -59
app.py CHANGED
@@ -1,59 +1,78 @@
1
- import streamlit as st
2
- from pororo import Pororo
3
- from sklearn.feature_extraction.text import CountVectorizer
4
- from sklearn.decomposition import LatentDirichletAllocation
5
- from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
6
- import regex as re
7
-
8
- # ๋ถˆ์šฉ์–ด ๋ชฉ๋ก
9
- stopwords = set(["์žˆ๋‹ค", "์—†๋‹ค", "๊ฒƒ", "๊ทธ", "์ด", "ํ•˜๋Š”", "ํ•˜๊ธฐ", "ํ• ", "๋˜", "์ˆ˜", "์ด๋‹ค", "์‹œํ‚ค๋‹ค"])
10
-
11
- # ํ…์ŠคํŠธ ์ „์ฒ˜๋ฆฌ ๋ฐ ๋ช…์‚ฌ ์ถ”์ถœ ํ•จ์ˆ˜
12
- def preprocess_text(text):
13
- tokenizer = Pororo(task="tokenization", lang="ko")
14
- text = re.sub(r'[^\w\s]', '', text) # ํŠน์ˆ˜ ๋ฌธ์ž ์ œ๊ฑฐ
15
- nouns = tokenizer(text, pos=True)
16
- nouns = [word for word, tag in nouns if tag.startswith('N') and len(word) > 1 and word not in stopwords] # ๋ช…์‚ฌ๋งŒ ์ถ”์ถœ, ๋ถˆ์šฉ์–ด ๋ฐ ํ•œ ๊ธ€์ž ๋‹จ์–ด ์ œ๊ฑฐ
17
- cleaned_text = ' '.join(nouns)
18
- return cleaned_text
19
-
20
- # ํ† ํ”ฝ ๋ชจ๋ธ๋ง ํ•จ์ˆ˜
21
- def topic_modeling(texts, n_components=5):
22
- vectorizer = CountVectorizer()
23
- data_vectorized = vectorizer.fit_transform(texts)
24
-
25
- lda_model = LatentDirichletAllocation(n_components=n_components, random_state=0)
26
- lda_model.fit(data_vectorized)
27
- feature_names = vectorizer.get_feature_names_out()
28
- topics = {}
29
- for topic_idx, topic in enumerate(lda_model.components_):
30
- topics[f"Topic {topic_idx}"] = [feature_names[i] for i in topic.argsort()[:-11:-1]] # ์ƒ์œ„ 10๊ฐœ ๋‹จ์–ด
31
- return topics
32
-
33
- # ์„ค๋ช… ๋ฐ ์š”์•ฝ ํŒŒ์ดํ”„๋ผ์ธ ์ดˆ๊ธฐํ™” (PyTorch)
34
- model_name = "t5-small"
35
- tokenizer = AutoTokenizer.from_pretrained(model_name)
36
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
37
- summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, framework='pt')
38
-
39
- # ์ŠคํŠธ๋ฆผ๋ฆฟ UI
40
- st.title('ํ•œ๊ตญ์–ด ํ…์ŠคํŠธ ํ† ํ”ฝ ๋ชจ๋ธ๋ง ๋ฐ ์š”์•ฝ')
41
-
42
- uploaded_file = st.file_uploader("TXT ํŒŒ์ผ ์—…๋กœ๋“œ", type=['txt'])
43
- n_topics = st.sidebar.number_input("ํ† ํ”ฝ ์ˆ˜ ์„ค์ •", min_value=2, max_value=10, value=5)
44
-
45
- if uploaded_file is not None:
46
- with st.spinner('ํŒŒ์ผ์„ ์ฒ˜๋ฆฌ ์ค‘์ž…๋‹ˆ๋‹ค...'):
47
- try:
48
- text = str(uploaded_file.read(), 'utf-8') # ํŒŒ์ผ ๋‚ด์šฉ ์ฝ๊ธฐ
49
- processed_text = preprocess_text(text) # ํ…์ŠคํŠธ ์ „์ฒ˜๋ฆฌ
50
- topic_results = topic_modeling([processed_text], n_components=n_topics) # ํ† ํ”ฝ ๋ชจ๋ธ๋ง
51
-
52
- st.subheader("ํ† ํ”ฝ ๋ชจ๋ธ๋ง ๊ฒฐ๊ณผ")
53
- for topic, words in topic_results.items():
54
- st.write(f"{topic}: {', '.join(words)}")
55
- description = "This topic primarily discusses " + ", ".join(words[:5]) + "."
56
- summary = summarizer(description, max_length=130, min_length=30, do_sample=False)
57
- st.write("Summary:", summary[0]['summary_text'])
58
- except Exception as e:
59
- st.error(f"์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from pykospacing import Spacing
4
+ from hanspell import spell_checker
5
+ import warnings
6
+
7
+ # ๊ฒฝ๊ณ  ๋ฉ”์‹œ์ง€ ๋ฌด์‹œ
8
+ warnings.filterwarnings("ignore")
9
+
10
def correct_text(text):
    """Space-correct then spell-check a Korean string.

    Parameters:
        text: raw caption text.

    Returns:
        (corrected_text, errors) where `corrected_text` is the checked
        string and `errors` is the error mapping reported by hanspell.
    """
    # Reuse a single Spacing instance across calls: constructing it loads
    # a model, which is far too expensive to repeat for every caption.
    spacing = getattr(correct_text, "_spacing", None)
    if spacing is None:
        spacing = Spacing()
        correct_text._spacing = spacing

    # Fix word spacing first so the spell checker sees plausible tokens.
    spaced_text = spacing(text)

    # Grammar/spelling pass (network call to the Naver checker under the hood
    # in py-hanspell — presumably; verify against the hanspell package docs).
    checked_text = spell_checker.check(spaced_text)
    return checked_text.checked, checked_text.errors
18
+
19
def parse_srt(file_path):
    """Parse an SRT subtitle file into a list of caption dicts.

    Each dict carries 'index' (int), 'time' (the raw "start --> end"
    line) and 'text' (caption lines joined with single spaces).
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        stripped = [raw.strip() for raw in handle]

    parsed = []
    current = {'index': None, 'time': None, 'text': ""}

    for entry in stripped:
        if entry.isdigit():
            # A bare number starts the next caption block; flush the
            # caption collected so far, if any.
            if current['index'] is not None:
                parsed.append(current)
                current = {'index': None, 'time': None, 'text': ""}
            current['index'] = int(entry)
        elif '-->' in entry:
            current['time'] = entry
        elif entry:
            # Accumulate multi-line caption text separated by spaces.
            current['text'] = (
                f"{current['text']} {entry}" if current['text'] else entry
            )

    # Flush the final caption (no trailing index line follows it).
    if current['index'] is not None:
        parsed.append(current)

    return parsed
45
+
46
def spell_check_captions(file):
    """Spell-check every caption in an uploaded SRT file.

    Parameters:
        file: filesystem path of the uploaded subtitle file (Gradio
            `filepath` input).

    Returns:
        A (DataFrame, excel_path_or_None, message) tuple matching the
        three Gradio output components.
    """
    captions = parse_srt(file)
    results = []
    for caption in captions:
        corrected_text, errors = correct_text(caption['text'])
        results.append({
            '์‹œ๊ฐ„': caption['time'],
            '์›๋ณธ ์ž๋ง‰': caption['text'],
            '์ˆ˜์ •๋œ ์ž๋ง‰': corrected_text,
            # Only report pairs that actually changed.
            '์ˆ˜์ • ํ•„์š” ๋‚ด์šฉ': ', '.join(
                f"{error}->{correct}"
                for error, correct in errors.items()
                if error != correct
            ),
        })

    if results:
        df = pd.DataFrame(results)
        output_file = 'corrected_captions.xlsx'
        df.to_excel(output_file, index=False, engine='openpyxl')
        return df, output_file, "Check the result in the table and download the file."
    # BUG FIX: the original literal ended in two U+FFFD replacement
    # characters ("...์—†์Šต๋‹ˆ??."); restore the final syllable so the
    # user-facing message is no longer corrupted.
    return pd.DataFrame(), None, "์ˆ˜์ •ํ•  ๋‚ด์šฉ์ด ์—†์Šต๋‹ˆ๋‹ค."
65
+
66
# Gradio UI: upload an SRT file, preview the check results in a table,
# and download the generated Excel report.
_upload_input = gr.File(type="filepath", label="์ž๋ง‰ ํŒŒ์ผ ์—…๋กœ๋“œ")
_result_outputs = [
    gr.Dataframe(label="๊ฒ€์‚ฌ ๊ฒฐ๊ณผ ๋ฏธ๋ฆฌ๋ณด๊ธฐ"),
    gr.File(label="๊ฒฐ๊ณผ ์—‘์…€ ํŒŒ์ผ ๋‹ค์šด๋กœ๋“œ"),
    gr.Textbox(label="๋ฉ”์‹œ์ง€"),
]

iface = gr.Interface(
    fn=spell_check_captions,
    inputs=_upload_input,
    outputs=_result_outputs,
    title="์ž๋ง‰ ๊ฒ€์‚ฌ ๋ฐ ์ˆ˜์ •",
    description="์ž๋ง‰ ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•˜๊ณ , ์ˆ˜์ •ํ•  ๋‚ด์šฉ์ด ์žˆ๋Š” ๊ฒฝ์šฐ ๊ฒฐ๊ณผ๋ฅผ ํ™•์ธํ•˜์„ธ์š”."
)

iface.launch()