# curation/app.py — Streamlit content-curation app (soojeongcrystal, rev 4dea241)
# ํ•„์š”ํ•œ ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ ๋ถˆ๋Ÿฌ์˜ค๊ธฐ
import streamlit as st
import pandas as pd
from dotenv import load_dotenv
import os
import openpyxl
from soynlp.noun import LRNounExtractor_v2
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import openai
# Load environment variables from a local .env file
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
# Configure the OpenAI client (may be None if the env var is unset — API calls will then fail)
openai.api_key = openai_api_key
# Path to the Excel workbook holding the internal contents catalog
file_path = 'contents.xlsx'
# Load the workbook once at app startup (read by load_sheet_data and the UI below)
wb = openpyxl.load_workbook(file_path)
# The noun extractor is initialized only once at app start
noun_extractor = LRNounExtractor_v2()
@st.cache(allow_output_mutation=True)
def calculate_similarity_with_soynlp(text1, text2):
    """Return the TF-IDF cosine similarity between the nouns of two texts.

    Nouns are extracted from each text with soynlp's LRNounExtractor_v2,
    joined into a "bag of nouns" string, vectorized with TF-IDF, and
    compared with cosine similarity.

    Args:
        text1: First text (e.g. the search criteria).
        text2: Second text (e.g. a content introduction).

    Returns:
        float: Cosine similarity in [0, 1]. Returns 0.0 when no nouns
        could be extracted from either text — previously this case
        crashed TfidfVectorizer with "empty vocabulary" ValueError,
        which is common for short inputs.
    """
    # train_extract returns a {noun: score} mapping; short single texts
    # frequently yield an empty mapping.
    nouns1 = noun_extractor.train_extract([text1])
    nouns2 = noun_extractor.train_extract([text2])
    text1_nouns = ' '.join(list(nouns1.keys()))
    text2_nouns = ' '.join(list(nouns2.keys()))
    # Guard: TfidfVectorizer.fit_transform raises ValueError on an
    # empty vocabulary; treat "no nouns" as zero similarity instead.
    if not text1_nouns.strip() or not text2_nouns.strip():
        return 0.0
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text1_nouns, text2_nouns])
    cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return cosine_sim[0][0]
@st.cache
def load_sheet_data(sheet_name):
    """Load one worksheet of the pre-opened workbook into a DataFrame.

    The first row of the sheet is used as the column header; all
    remaining rows become the data.

    Args:
        sheet_name: Name of a worksheet in the module-level workbook ``wb``.

    Returns:
        pd.DataFrame: Sheet contents, or an empty DataFrame when the
        sheet has no rows at all (previously ``data[0]`` raised
        IndexError in that case).
    """
    ws = wb[sheet_name]
    data = list(ws.values)
    # Guard against a completely empty worksheet.
    if not data:
        return pd.DataFrame()
    columns = data[0]
    df = pd.DataFrame(data[1:], columns=columns)
    return df
def filter_internal_contents(df, search_criteria, threshold=0.1):
    """Select rows whose '소개' (introduction) text is similar to the query.

    Args:
        df: DataFrame with at least the columns '제목', '소개', '시간'.
        search_criteria: Query string (expected lowercased by the caller).
        threshold: Minimum cosine similarity for a row to be kept.

    Returns:
        list[dict]: One dict per matching row with keys '제목', '소개', '시간'.
    """
    filtered_contents = []
    for _, row in df.iterrows():
        intro_text = row['소개']
        # Skip empty/None cells; openpyxl may also yield non-string
        # values (numbers, dates), which previously crashed .lower() —
        # coerce to str before lowercasing.
        if intro_text:
            similarity_score = calculate_similarity_with_soynlp(
                search_criteria, str(intro_text).lower()
            )
            if similarity_score >= threshold:
                filtered_contents.append(
                    {'제목': row['제목'], '소개': row['소개'], '시간': row['시간']}
                )
    return filtered_contents
# ์ถ”๊ฐ€๋œ ํ‚ค์›Œ๋“œ ์ถ”์ถœ ํ•จ์ˆ˜
def extract_keywords(purpose, target):
    """Ask GPT for a comma-separated list of learning-content keywords.

    Bug fix: the previous prompt asked GPT to *describe* the needed
    curriculum in prose, yet the reply was parsed with ``split(', ')``
    as if it were a keyword list — the two never matched. The prompt now
    explicitly requests comma-separated keywords, and parsing tolerates
    commas with or without trailing spaces.

    Args:
        purpose: Learning purpose entered by the user.
        target: Learning audience entered by the user.

    Returns:
        list[str]: Extracted keywords; [] on API failure (the error is
        surfaced to the UI via st.error).
    """
    prompt = (
        f"학습 목적 '{purpose}'과 대상 '{target}'을 종합하여, "
        f"필요한 학습 컨텐츠 키워드를 쉼표로 구분하여 나열해주세요."
    )
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4-0613",  # chat model via the ChatCompletion endpoint
            messages=[{"role": "user", "content": prompt}]
        )
        content = response['choices'][0]['message']['content'].strip()
        # Split on commas and drop empty fragments / stray whitespace.
        return [kw.strip() for kw in content.split(',') if kw.strip()]
    except Exception as e:
        # Best-effort: report the failure in the UI and continue with no keywords.
        st.error(f"OpenAI API error: {e}")
        return []
def recommend_with_gpt(sheet_name, purpose, target, time, internal_contents):
    """Ask GPT to recommend internal and external contents for the request.

    Bug fix: ``gpt-4-0613`` is a chat model and is not served by the
    legacy ``openai.Completion`` endpoint, so the previous call failed
    at runtime with an invalid-model error. Use
    ``openai.ChatCompletion.create`` with a messages list (consistent
    with extract_keywords) and read the reply from
    ``choices[0]['message']['content']``.

    Args:
        sheet_name: Selected content category (worksheet name).
        purpose: Learning purpose entered by the user.
        target: Learning audience entered by the user.
        time: Available study time, as a string.
        internal_contents: Output of filter_internal_contents (may be empty).

    Returns:
        str: GPT's recommendation text.
    """
    # Summarize the filtered internal contents for inclusion in the prompt.
    summary = ", ".join(
        f"{content['제목']}: {content['소개']}" for content in internal_contents
    )
    # NOTE: prompt lines are intentionally left-aligned inside the
    # triple-quoted f-string so the prompt text sent to the API is unchanged.
    prompt = f"""
'{sheet_name}' 카테고리 내에서 '{purpose}' 학습 목적을 달성하고자 하는 '{target}' 대상을 위해,
주어진 '{time}' 시간 내에 다룰 수 있는 컨텐츠를 추천해주세요.
현재 필터링된 내부 컨텐츠는 다음과 같습니다: {summary}
추가로, 이 내부 컨텐츠를 보완하거나 확장할 수 있는 외부 컨텐츠(예: 책, 영상, 온라인 강의 등)도 함께 추천해주세요.
"""
    response = openai.ChatCompletion.create(
        model="gpt-4-0613",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.6,
        max_tokens=150,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0
    )
    return response['choices'][0]['message']['content'].strip()
# ---------------------------------------------------------------------------
# Streamlit UI (top-level script body; nesting reconstructed — the pasted
# source had lost all indentation)
# ---------------------------------------------------------------------------
st.set_page_config(page_title="Curation", page_icon="🌷")
st.title("컨텐츠 큐레이션")

# User inputs: purpose, audience, available time, and content category.
purpose = st.text_area("학습 목적을 입력해주세요", placeholder="예: 행복 이해 및 적용")
target = st.text_area("학습 대상을 입력하세요", placeholder="예: 일반 구성원")
time = st.number_input("학습 시간을 입력하세요", min_value=1, max_value=120, value=30, step=1)

sheet_names = wb.sheetnames
sheet_name = st.selectbox("컨텐츠 범위를 선택해주세요", sheet_names)

if st.button('컨텐츠 추천받기'):
    if not purpose or not target or not time:
        st.error("모든 필드를 올바르게 채워주세요.")
    else:
        # Step 1: derive keywords from purpose/target and show them.
        keywords = extract_keywords(purpose, target)
        if keywords:
            st.markdown("### 추출된 키워드")
            st.write(", ".join(keywords))
        else:
            st.write("키워드를 추출하지 못했습니다.")

        # Step 2: filter the selected sheet's internal contents by similarity.
        df = load_sheet_data(sheet_name)
        search_criteria = f"{purpose} {target}".lower()
        internal_contents = filter_internal_contents(df, search_criteria, threshold=0.75)
        if internal_contents:
            st.subheader("내부 컨텐츠 필터링 결과")
            for content in internal_contents:
                with st.expander(f"{content['제목']} ({content['시간']}분)"):
                    st.write(content['소개'])
        else:
            st.write("내부 컨텐츠가 없습니다.")

        # Step 3: GPT recommendation combining internal + external contents.
        external_recommendation = recommend_with_gpt(
            sheet_name, purpose, target, str(time), internal_contents
        )
        st.subheader("추천 컨텐츠 리스트")
        st.write(external_recommendation)