Spaces:

thinh21
/

hethongtomtat

Running

App Files Files Community

thinh21 committed 21 days ago

Commit

a9b0b3a

verified ·

1 Parent(s): 16cccc3

Upload 10 files

Browse files

Files changed (10) hide show

api_keys.py +5 -0
app.py +366 -0
cohere_summarizer.py +18 -0
config.py +9 -0
database.py +263 -0
groq_summarizer.py +28 -0
requirements.txt +0 -0
summarizer_ai.py +69 -0
text_cleaner.py +17 -0
textrank_summarizer.py +72 -0

api_keys.py ADDED Viewed

	@@ -0,0 +1,5 @@

+# File này chỉ lưu trữ trên máy tính của bạn, KHÔNG đẩy lên GitHub
+GEMINI_KEY = "AIzaSyBJWldClO1-5ANcxVfQcHk2oHZxsRUChw4"
+GROQ_KEY = "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" # Dán mã Key bạn vừa tạo vào đây
+COHERE_KEY = "poK0FNYgmnNbYNlwf16SDG6DAlwOlmELKD2gXt6f"
+OPENROUTER_KEY = "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" # Dán key của bạn vào đây

app.py ADDED Viewed

	@@ -0,0 +1,366 @@

+import streamlit as st
+import PyPDF2
+import docx
+import time
+import pandas as pd
+from io import BytesIO
+from rouge_score import rouge_scorer # <-- [MỚI] Thư viện tính điểm học thuật
+import plotly.express as px
+import os
+import smtplib
+from email.message import EmailMessage
+# --- IMPORT CÁC MODULE XỬ LÝ ---
+from summarizer_ai import TextSummarizer
+from textrank_summarizer import TextRankSummarizer
+from text_cleaner import TextPreprocessor
+from groq_summarizer import GroqSummarizer
+from cohere_summarizer import CohereSummarizer
+import database
+import api_keys
+st.set_page_config(page_title="AI Summarizer Pro", page_icon="📝", layout="wide")
+database.init_db()
+@st.cache_resource
+def load_models():
+    return (
+        TextSummarizer(), TextRankSummarizer(), TextPreprocessor(),
+        GroqSummarizer(api_keys.GROQ_KEY), CohereSummarizer(api_keys.COHERE_KEY)
+    )
+def _ensure_auth_state():
+    if "user" not in st.session_state:
+        st.session_state.user = None
+def _mask_email(email: str) -> str:
+    email = (email or "").strip()
+    if "@" not in email:
+        return "***"
+    name, domain = email.split("@", 1)
+    if len(name) <= 2:
+        name_masked = name[:1] + "*"
+    else:
+        name_masked = name[:2] + "*" * (len(name) - 2)
+    return f"{name_masked}@{domain}"
+def _send_reset_email(to_email: str, code: str):
+    smtp_user = os.getenv("SMTP_USER", "").strip() or str(st.secrets.get("SMTP_USER", "")).strip()
+    smtp_app_password = os.getenv("SMTP_APP_PASSWORD", "").strip() or str(st.secrets.get("SMTP_APP_PASSWORD", "")).strip()
+    if not smtp_user or not smtp_app_password:
+        return False, "Chưa cấu hình SMTP. Hãy set SMTP_USER/SMTP_APP_PASSWORD (env hoặc .streamlit/secrets.toml)."
+    msg = EmailMessage()
+    msg["Subject"] = "AI Summarizer Pro - Ma dat lai mat khau"
+    msg["From"] = smtp_user
+    msg["To"] = to_email
+    msg.set_content(
+        "Ban da yeu cau dat lai mat khau.\n\n"
+        f"Ma xac nhan (OTP): {code}\n"
+        "Ma co hieu luc 10 phut. Neu khong phai ban, hay bo qua email nay.\n"
+    )
+    try:
+        with smtplib.SMTP_SSL("smtp.gmail.com", 465) as server:
+            server.login(smtp_user, smtp_app_password)
+            server.send_message(msg)
+        return True, "Da gui ma OTP qua email."
+    except Exception as e:
+        return False, f"Gui email that bai: {e}"
+def _render_auth_sidebar():
+    """Draw the account section of the sidebar and report the login state.
+    When a user is stored in the session, a greeting and a logout button are
+    shown and True is returned. Otherwise three tabs are rendered (login,
+    registration, forgotten password) and False is returned.
+    """
+    st.sidebar.header("👤 Tài khoản")
+    # Logged-in branch: greet the user and offer a logout button.
+    if st.session_state.user:
+        st.sidebar.success(f"Xin chào, {st.session_state.user['username']}")
+        if st.sidebar.button("Đăng xuất"):
+            st.session_state.user = None
+            st.rerun()
+        return True
+    tab_login, tab_register, tab_forgot = st.sidebar.tabs(["Đăng nhập", "Đăng ký", "Quên mật khẩu"])
+    # --- Login tab ---
+    with tab_login:
+        with st.form("login_form", clear_on_submit=False):
+            username = st.text_input("Username", placeholder="vd: thinh")
+            password = st.text_input("Password", type="password")
+            submitted = st.form_submit_button("Đăng nhập", type="primary")
+        if submitted:
+            ok, user, msg = database.authenticate_user(username, password)
+            if ok:
+                # Store the user dict in the session and rerun the script.
+                st.session_state.user = user
+                st.sidebar.success(msg)
+                st.rerun()
+            else:
+                st.sidebar.error(msg)
+    # --- Registration tab ---
+    with tab_register:
+        with st.form("register_form", clear_on_submit=True):
+            username = st.text_input("Username (bắt buộc)")
+            email = st.text_input("Email (tuỳ chọn)", placeholder="name@example.com")
+            password = st.text_input("Password (bắt buộc)", type="password")
+            confirm = st.text_input("Nhập lại password", type="password")
+            submitted = st.form_submit_button("Tạo tài khoản", type="primary")
+        if submitted:
+            # Client-side checks: the two password fields must match and have at least 6 characters.
+            if password != confirm:
+                st.sidebar.error("Password nhập lại không khớp.")
+            elif len((password or "")) < 6:
+                st.sidebar.error("Password tối thiểu 6 ký tự.")
+            else:
+                ok, msg = database.create_user(username=username, password=password, email=email)
+                if ok:
+                    st.sidebar.success(msg)
+                else:
+                    st.sidebar.error(msg)
+    # --- Forgotten-password tab: request an OTP, then redeem it ---
+    with tab_forgot:
+        st.caption("Nhập username hoặc email đã đăng ký để nhận mã OTP.")
+        # SMTP credentials are looked up in the environment first, then in Streamlit secrets.
+        # NOTE(review): st.secrets.get may raise when no secrets file exists — confirm for the deployed Streamlit version.
+        smtp_user_present = bool(os.getenv("SMTP_USER", "").strip() or str(st.secrets.get("SMTP_USER", "")).strip())
+        smtp_pass_present = bool(os.getenv("SMTP_APP_PASSWORD", "").strip() or str(st.secrets.get("SMTP_APP_PASSWORD", "")).strip())
+        if not (smtp_user_present and smtp_pass_present):
+            st.warning("SMTP chưa được cấu hình cho phiên chạy hiện tại.")
+        with st.form("forgot_request_form", clear_on_submit=True):
+            identifier = st.text_input("Username hoặc Email")
+            submitted = st.form_submit_button("Gửi mã OTP", type="primary")
+        if submitted:
+            # On success the third element is the OTP itself; on failure it is a message.
+            ok, email, code_or_msg = database.create_password_reset_code(identifier)
+            if ok:
+                send_ok, send_msg = _send_reset_email(email, code_or_msg)
+                if send_ok:
+                    st.sidebar.success(f"{send_msg} ({_mask_email(email)})")
+                else:
+                    st.sidebar.error(send_msg)
+            else:
+                st.sidebar.info(code_or_msg)
+        st.divider()
+        st.caption("Sau khi nhận OTP, nhập mã và mật khẩu mới.")
+        with st.form("forgot_reset_form", clear_on_submit=True):
+            identifier2 = st.text_input("Username hoặc Email (để đặt lại)")
+            code = st.text_input("Mã OTP (6 số)")
+            new_password = st.text_input("Mật khẩu mới", type="password")
+            confirm = st.text_input("Nhập lại mật khẩu mới", type="password")
+            submitted2 = st.form_submit_button("Đổi mật khẩu", type="primary")
+        if submitted2:
+            if new_password != confirm:
+                st.sidebar.error("Password nhập lại không khớp.")
+            else:
+                ok2, msg2 = database.reset_password_with_code(identifier2, code, new_password)
+                if ok2:
+                    st.sidebar.success(msg2)
+                else:
+                    st.sidebar.error(msg2)
+    return False
+# ==========================================
+# HÀM TÍNH TOÁN CÁC ĐỘ ĐO (METRICS)
+# ==========================================
+def calc_novelty(original_text, summary_text):
+    """Tính tỷ lệ phần trăm từ vựng mới được AI tạo ra (Độ sáng tạo)"""
+    orig_set = set(original_text.lower().split())
+    summ_set = set(summary_text.lower().split())
+    if not summ_set: return 0.0
+    new_words = summ_set - orig_set
+    return round((len(new_words) / len(summ_set)) * 100, 1)
+def calc_rouge_l(reference_text, summary_text):
+    """Tính điểm ROUGE-L (Mức độ hành văn giống với con người)"""
+    if not reference_text.strip(): return 0.0
+    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=False)
+    scores = scorer.score(reference_text, summary_text)
+    return round(scores['rougeL'].fmeasure * 100, 1)
+def extract_text_from_file(uploaded_file):
+    try:
+        filename = uploaded_file.name
+        if filename.endswith('.txt'): return uploaded_file.getvalue().decode("utf-8")
+        elif filename.endswith('.pdf'):
+            pdf_reader = PyPDF2.PdfReader(BytesIO(uploaded_file.read()))
+            return "".join([page.extract_text() + "\n" for page in pdf_reader.pages if page.extract_text()])
+        elif filename.endswith('.docx'):
+            doc = docx.Document(BytesIO(uploaded_file.read()))
+            return "\n".join([para.text for para in doc.paragraphs])
+    except Exception as e:
+        st.error(f"Lỗi đọc file: {e}")
+    return ""
+# ==========================================
+# MAIN UI
+# ==========================================
+# Make sure the session has a "user" slot, then draw the account sidebar.
+_ensure_auth_state()
+is_authed = _render_auth_sidebar()
+st.title("📝 Hệ thống Tóm tắt & Nghiên cứu Đánh giá AI")
+st.markdown("Đồ án chuyên sâu: Phân tích hiệu năng, đo lường độ sáng tạo (Novelty) và điểm chuẩn ROUGE giữa các thuật toán.")
+# Everything below requires a logged-in user; st.stop() ends this script run otherwise.
+if not is_authed:
+    st.info("Vui lòng đăng nhập hoặc đăng ký ở sidebar để sử dụng hệ thống.")
+    st.stop()
+(ai_summarizer, textrank_summarizer, text_cleaner, groq_summarizer, cohere_summarizer) = load_models()
+st.sidebar.header("⚙️ Cấu hình chung")
+# Target summary length in words: minimum 30, maximum 1000, default 100.
+summary_length = st.sidebar.slider("Độ dài tóm tắt mong muốn (số từ):", 30, 1000, 100)
+st.subheader("📥 Dữ liệu đầu vào")
+uploaded_file = st.file_uploader("📂 Tải lên tài liệu (PDF, DOCX, TXT)", type=["pdf", "docx", "txt"])
+# Text extracted from the upload pre-fills the input area below.
+input_content = extract_text_from_file(uploaded_file) if uploaded_file else ""
+c_input, c_ref = st.columns(2)
+with c_input:
+    input_text = st.text_area("Nội dung văn bản cần xử lý (Bắt buộc):", value=input_content, height=200)
+with c_ref:
+    # Optional human-written reference summary, used only for the ROUGE-L score.
+    reference_text = st.text_area("Bản tóm tắt chuẩn của con người (Tùy chọn - Dùng để tính điểm ROUGE):", height=200, placeholder="Nhập bản tóm tắt mẫu vào đây để AI so sánh độ chính xác...")
+cleaned_text = text_cleaner.clean_text(input_text)
+# Word count of the cleaned input (whitespace split); used for the minimum-length check and the compression ratio.
+original_word_count = len(cleaned_text.split())
+tab1, tab2, tab3 = st.tabs(["📝 Tóm tắt Đơn", "⚖️ So sánh Đa mô hình", "📊 Dashboard & Lịch sử DB"])
+# ---------------------------------------------------------
+# TAB 1: SINGLE-MODEL SUMMARY
+# ---------------------------------------------------------
+with tab1:
+    method = st.selectbox("Chọn mô hình AI:", [
+        "Thông minh (AI T5 - Viết lại câu)", "Trích xuất ý chính (TextRank)",
+        "⚡ Siêu tốc độ (Groq Llama 3 API)", "🌟 Tóm tắt chuyên sâu (Cohere API)"
+    ])
+    if st.button("🚀 Chạy Mô hình Đơn", type="primary"):
+        # Inputs shorter than 20 words are rejected.
+        if original_word_count < 20: st.warning("⚠️ Văn bản quá ngắn.")
+        else:
+            with st.spinner(f"🤖 Đang xử lý bằng {method}..."):
+                start_time = time.time()
+                try:
+                    # Dispatch on a keyword contained in the selected label; TextRank is the fallback
+                    # and receives a sentence count derived from the word target (one sentence per 20 words, at least 1).
+                    if "T5" in method: result = ai_summarizer.summarize(cleaned_text, max_len=summary_length)
+                    elif "Groq" in method: result = groq_summarizer.summarize(cleaned_text, max_words=summary_length)
+                    elif "Cohere" in method: result = cohere_summarizer.summarize(cleaned_text, max_words=summary_length)
+                    else: result = textrank_summarizer.summarize(cleaned_text, num_sentences=max(1, summary_length // 20))
+                    p_time = round(time.time() - start_time, 2)
+                    sum_count = len(result.split())
+                    novelty = calc_novelty(cleaned_text, result)
+                    rouge = calc_rouge_l(reference_text, result)
+                    # NOTE(review): the result is shown with st.success even when it is a "⚠️" error string — confirm this is intended.
+                    st.success(result)
+                    # Results starting with "⚠️" (the API wrappers' error prefix) are not stored in the history.
+                    if not result.startswith("⚠️"):
+                        database.save_summary(method, original_word_count, sum_count, p_time, cleaned_text, result, novelty, rouge)
+                    m1, m2, m3, m4 = st.columns(4)
+                    m1.metric("⏱️ Thời gian", f"{p_time}s")
+                    m2.metric("📉 Tỷ lệ nén", f"{round((sum_count/original_word_count)*100, 1)}%")
+                    m3.metric("🧠 Độ sáng tạo (Novelty)", f"{novelty}%")
+                    # ROUGE-L is only meaningful when a reference summary was entered.
+                    m4.metric("🎯 Điểm ROUGE-L", f"{rouge}%" if reference_text else "N/A")
+                except Exception as e:
+                    st.error(f"Lỗi: {e}")
+# ---------------------------------------------------------
+# TAB 2: MULTI-MODEL COMPARISON
+# ---------------------------------------------------------
+with tab2:
+    st.info("Chế độ này sẽ gửi văn bản đến 4 AI cùng lúc. Kèm theo chấm điểm Novelty (Tỷ lệ sinh từ mới) và ROUGE-L.")
+    if st.button("⚖️ Bắt đầu Đại chiến AI (Chạy tất cả)", type="primary"):
+        if original_word_count < 20: st.warning("⚠️ Văn bản quá ngắn.")
+        else:
+            # 2x2 grid, one cell per model.
+            col1, col2 = st.columns(2)
+            col3, col4 = st.columns(2)
+            def render_result(col, title, res, time_taken, method_name):
+                """Show one model's output in *col*; score and store it unless it is a "⚠️" error string."""
+                with col:
+                    st.markdown(f"### {title}")
+                    st.write(res)
+                    if not res.startswith("⚠️"):
+                        sum_cnt = len(res.split())
+                        nov = calc_novelty(cleaned_text, res)
+                        rg = calc_rouge_l(reference_text, res)
+                        st.caption(f"⏱️ {time_taken}s | 📝 {sum_cnt} từ | 🧠 Novelty: {nov}% | 🎯 ROUGE: {rg if reference_text else 'N/A'}")
+                        database.save_summary(method_name, original_word_count, sum_cnt, time_taken, cleaned_text, res, nov, rg)
+            # The four models are called one after another, each timed separately.
+            # 1. Llama 3 (Groq)
+            start_t = time.time()
+            res_groq = groq_summarizer.summarize(cleaned_text, max_words=summary_length)
+            render_result(col1, "⚡ Groq (Llama 3)", res_groq, round(time.time() - start_t, 2), "⚡ Siêu tốc độ (Groq Llama 3 API)")
+            # 2. Cohere
+            start_t = time.time()
+            res_co = cohere_summarizer.summarize(cleaned_text, max_words=summary_length)
+            render_result(col2, "🌟 Cohere API", res_co, round(time.time() - start_t, 2), "🌟 Tóm tắt chuyên sâu (Cohere API)")
+            # 3. T5 Local
+            start_t = time.time()
+            res_t5 = ai_summarizer.summarize(cleaned_text, max_len=summary_length)
+            render_result(col3, "🧠 AI T5 (Offline)", res_t5, round(time.time() - start_t, 2), "Thông minh (AI T5 - Viết lại câu)")
+            # 4. TextRank
+            start_t = time.time()
+            res_tr = textrank_summarizer.summarize(cleaned_text, num_sentences=max(1, summary_length // 20))
+            render_result(col4, "✂️ TextRank", res_tr, round(time.time() - start_t, 2), "Trích xuất ý chính (TextRank)")
+# ---------------------------------------------------------
+# TAB 3: STATISTICS & CHARTS
+# ---------------------------------------------------------
+with tab3:
+    history_data = database.get_history()
+    if len(history_data) == 0:
+        st.write("Chưa có dữ liệu. Hãy chạy tóm tắt vài lần để xem biểu đồ!")
+    else:
+        # Column labels are assigned positionally to the rows returned by get_history().
+        df = pd.DataFrame(history_data, columns=["ID", "Thời gian", "Phương pháp", "Từ (Gốc)", "Từ (Tóm tắt)", "Thời gian xử lý (s)", "Văn bản gốc", "Kết quả", "Novelty (%)", "ROUGE-L (%)"])
+        def shorten_name(name):
+            """Map a stored method label to a short chart label; unknown labels pass through."""
+            if "T5" in name: return "AI T5 (Local)"
+            if "TextRank" in name: return "TextRank"
+            if "Groq" in name: return "Groq Llama 3"
+            if "Cohere" in name: return "Cohere"
+            return name
+        df["Tên rút gọn"] = df["Phương pháp"].apply(shorten_name)
+        # Compression ratio: summary word count as a percentage of the original word count.
+        df["Tỷ lệ nén (%)"] = (df["Từ (Tóm tắt)"] / df["Từ (Gốc)"]) * 100
+        st.subheader("📈 Phân tích Các Chỉ Số Học Thuật")
+        c1, c2 = st.columns(2)
+        with c1:
+            st.markdown("**1. Tốc độ xử lý (giây)**")
+            # Caption explaining the speed chart; bars show the mean processing time per model.
+            st.caption("⏳ Cột càng **THẤP** (thời gian ngắn) chứng tỏ AI chạy càng nhanh. Cột cao thể hiện độ trễ lớn, cần nhiều thời gian chờ đợi.")
+            fig1 = px.bar(df.groupby("Tên rút gọn")["Thời gian xử lý (s)"].mean().reset_index(), x="Tên rút gọn", y="Thời gian xử lý (s)", text_auto='.2f', color="Tên rút gọn")
+            fig1.update_layout(showlegend=False, xaxis_title="")
+            st.plotly_chart(fig1, use_container_width=True)
+        with c2:
+            st.markdown("**2. Độ Sáng tạo - Novelty (%)**")
+            # Caption explaining the novelty chart; bars show the mean novelty per model.
+            st.caption("🧠 Cột càng **CAO** chứng tỏ AI có khả năng dùng từ vựng mới để viết lại câu (Paraphrase) càng tốt. TextRank luôn = 0 vì thuật toán này chỉ copy-paste câu gốc.")
+            fig2 = px.bar(df.groupby("Tên rút gọn")["Novelty (%)"].mean().reset_index(), x="Tên rút gọn", y="Novelty (%)", text_auto='.1f', color="Tên rút gọn")
+            fig2.update_layout(showlegend=False, xaxis_title="")
+            st.plotly_chart(fig2, use_container_width=True)
+        st.markdown("---")
+        c3, c4 = st.columns([2, 1])
+        with c3:
+            st.markdown("**3. Điểm Chuẩn ROUGE-L (%)**")
+            # Caption explaining the ROUGE chart.
+            st.caption("🎯 Thanh càng **DÀI** (tỉ lệ cao) chứng tỏ cách hành văn của AI càng sát với bản tóm tắt chuẩn của con người. (Chỉ vẽ biểu đồ khi bạn có nhập Bản tóm tắt mẫu).")
+            # Only rows with a ROUGE-L score above 0 are charted.
+            df_rouge = df[df["ROUGE-L (%)"] > 0]
+            if not df_rouge.empty:
+                fig3 = px.bar(df_rouge.groupby("Tên rút gọn")["ROUGE-L (%)"].mean().reset_index(), y="Tên rút gọn", x="ROUGE-L (%)", orientation='h', text_auto='.1f', color="Tên rút gọn")
+                fig3.update_layout(showlegend=False, yaxis_title="")
+                st.plotly_chart(fig3, use_container_width=True)
+            else:
+                st.info("💡 Bạn chưa nhập 'Bản tóm tắt chuẩn' lần nào nên chưa có biểu đồ ROUGE.")
+        with c4:
+            st.markdown("**4. Tỷ lệ nén văn bản (%)**")
+            # Caption explaining the compression-ratio chart.
+            st.caption("📦 Phần trăm số từ của bản tóm tắt so với bản gốc. Miếng bánh **NHỎ** nghĩa là AI tóm tắt siêu ngắn gọn. Miếng bánh **TO** là AI giữ lại nhiều chi tiết.")
+            fig4 = px.pie(df.groupby("Tên rút gọn")["Tỷ lệ nén (%)"].mean().reset_index(), values="Tỷ lệ nén (%)", names="Tên rút gọn", hole=0.4)
+            st.plotly_chart(fig4, use_container_width=True)
+        st.markdown("---")
+        st.subheader("📚 Bảng dữ liệu SQLite (Đã lưu điểm học thuật)")
+        # The raw table hides the long text columns and the derived helper columns.
+        st.dataframe(df.drop(columns=["Văn bản gốc", "Kết quả", "Tên rút gọn", "Tỷ lệ nén (%)"], errors='ignore'), use_container_width=True)

cohere_summarizer.py ADDED Viewed

	@@ -0,0 +1,18 @@

+import cohere
+class CohereSummarizer:
+    def __init__(self, api_key):
+        # Sử dụng Client V2 để tương thích với hệ thống mới nhất 2026
+        self.co = cohere.Client(api_key)
+    def summarize(self, text, max_words=100):
+        try:
+            # Chuyển hẳn sang dùng Chat API vì Summarize API cũ không còn được hỗ trợ
+            response = self.co.chat(
+                model='command-r-plus-08-2024', # Dùng phiên bản ổn định nhất hiện nay
+                message=f"Tóm tắt văn bản sau bằng tiếng Việt, khoảng {max_words} từ: {text}",
+            )
+            return response.text.strip()
+        except Exception as e:
+            # Trình bày lỗi gọn gàng cho đồ án
+            return f"⚠️ Lỗi Cohere (New API): {str(e)}"

config.py ADDED Viewed

	@@ -0,0 +1,9 @@

+# ==========================================
+# SYSTEM CONFIGURATION
+# ==========================================
+# Use the model from Hanoi University of Science and Technology (better compatibility).
+# This identifier is loaded by the T5 summarizer in summarizer_ai.py.
+MODEL_NAME = "NlpHUST/t5-small-vi-summarization"
+# NOTE(review): MAX_LENGTH / MIN_LENGTH are not referenced by the code visible in this commit view — confirm they are still needed.
+MAX_LENGTH = 300
+MIN_LENGTH = 30

database.py ADDED Viewed

	@@ -0,0 +1,263 @@

+import sqlite3
+from datetime import datetime, timedelta
+import hashlib
+import hmac
+import os
+import secrets
+_DB_PATH = "history.db"
+_PWD_ITERATIONS = 200_000
+_RESET_CODE_ITERATIONS = 120_000
+_RESET_CODE_TTL_MINUTES = 10
+_RESET_MAX_ATTEMPTS = 5
+def _utc_now_str():
+    return datetime.utcnow().strftime("%d/%m/%Y %H:%M:%S")
+def _hash_password(password: str, salt: bytes) -> bytes:
+    return hashlib.pbkdf2_hmac("sha256", password.encode("utf-8"), salt, _PWD_ITERATIONS)
+def _hash_reset_code(code: str, salt: bytes) -> bytes:
+    return hashlib.pbkdf2_hmac("sha256", code.encode("utf-8"), salt, _RESET_CODE_ITERATIONS)
+def init_db():
+    conn = sqlite3.connect(_DB_PATH)
+    c = conn.cursor()
+    c.execute('''
+        CREATE TABLE IF NOT EXISTS summary_history (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            created_at TEXT,
+            method TEXT,
+            original_length INTEGER,
+            summary_length INTEGER,
+            process_time REAL,
+            original_text TEXT,
+            summary_text TEXT,
+            novelty_score REAL,
+            rouge_l_score REAL
+        )
+    ''')
+    c.execute('''
+        CREATE TABLE IF NOT EXISTS users (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            created_at TEXT NOT NULL,
+            username TEXT NOT NULL UNIQUE,
+            email TEXT,
+            password_salt TEXT NOT NULL,
+            password_hash TEXT NOT NULL
+        )
+    ''')
+    c.execute('''
+        CREATE TABLE IF NOT EXISTS password_reset_codes (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            created_at TEXT NOT NULL,
+            user_id INTEGER NOT NULL,
+            code_salt TEXT NOT NULL,
+            code_hash TEXT NOT NULL,
+            expires_at TEXT NOT NULL,
+            attempts INTEGER NOT NULL DEFAULT 0,
+            used INTEGER NOT NULL DEFAULT 0,
+            FOREIGN KEY(user_id) REFERENCES users(id)
+        )
+    ''')
+    conn.commit()
+    conn.close()
+def save_summary(method, orig_len, sum_len, p_time, orig_text, sum_text, novelty=0.0, rouge_l=0.0):
+    conn = sqlite3.connect(_DB_PATH)
+    c = conn.cursor()
+    date_str = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
+    c.execute('''
+        INSERT INTO summary_history
+        (created_at, method, original_length, summary_length, process_time, original_text, summary_text, novelty_score, rouge_l_score)
+        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+    ''', (date_str, method, orig_len, sum_len, p_time, orig_text, sum_text, novelty, rouge_l))
+    conn.commit()
+    conn.close()
+def get_history():
+    conn = sqlite3.connect(_DB_PATH)
+    c = conn.cursor()
+    c.execute("SELECT * FROM summary_history ORDER BY id DESC LIMIT 50")
+    rows = c.fetchall()
+    conn.close()
+    return rows
+def create_user(username: str, password: str, email: str | None = None):
+    username = (username or "").strip()
+    email = (email or "").strip() or None
+    if not username or not password:
+        return False, "Thiếu username hoặc password."
+    salt = os.urandom(16)
+    pwd_hash = _hash_password(password, salt)
+    try:
+        conn = sqlite3.connect(_DB_PATH)
+        c = conn.cursor()
+        c.execute(
+            "INSERT INTO users (created_at, username, email, password_salt, password_hash) VALUES (?, ?, ?, ?, ?)",
+            (_utc_now_str(), username, email, salt.hex(), pwd_hash.hex()),
+        )
+        conn.commit()
+        return True, "Đăng ký thành công."
+    except sqlite3.IntegrityError:
+        return False, "Username đã tồn tại."
+    finally:
+        try:
+            conn.close()
+        except Exception:
+            pass
+def authenticate_user(username: str, password: str):
+    username = (username or "").strip()
+    if not username or not password:
+        return False, None, "Thiếu username hoặc password."
+    conn = sqlite3.connect(_DB_PATH)
+    c = conn.cursor()
+    c.execute("SELECT id, username, email, password_salt, password_hash FROM users WHERE username = ?", (username,))
+    row = c.fetchone()
+    conn.close()
+    if not row:
+        return False, None, "Sai username hoặc password."
+    user_id, uname, email, salt_hex, hash_hex = row
+    salt = bytes.fromhex(salt_hex)
+    expected = bytes.fromhex(hash_hex)
+    actual = _hash_password(password, salt)
+    if not hmac.compare_digest(expected, actual):
+        return False, None, "Sai username hoặc password."
+    return True, {"id": user_id, "username": uname, "email": email}, "Đăng nhập thành công."
+def _get_user_by_identifier(identifier: str):
+    identifier = (identifier or "").strip()
+    if not identifier:
+        return None
+    conn = sqlite3.connect(_DB_PATH)
+    c = conn.cursor()
+    if "@" in identifier:
+        c.execute("SELECT id, username, email FROM users WHERE email = ?", (identifier,))
+    else:
+        c.execute("SELECT id, username, email FROM users WHERE username = ?", (identifier,))
+    row = c.fetchone()
+    conn.close()
+    if not row:
+        return None
+    user_id, username, email = row
+    return {"id": user_id, "username": username, "email": email}
+def create_password_reset_code(identifier: str):
+    """
+    Create a one-time reset code for a user (by username or email).
+    Returns: (ok: bool, email: str|None, msg: str)
+    """
+    user = _get_user_by_identifier(identifier)
+    if not user:
+        # Don't reveal whether user exists
+        return False, None, "Nếu tài khoản tồn tại và có email, hệ thống sẽ gửi mã đặt lại mật khẩu."
+    email = (user.get("email") or "").strip()
+    if not email:
+        return False, None, "Tài khoản này chưa có email nên không thể đặt lại mật khẩu."
+    code = f"{secrets.randbelow(1_000_000):06d}"
+    salt = os.urandom(16)
+    code_hash = _hash_reset_code(code, salt)
+    now = datetime.utcnow()
+    expires = now + timedelta(minutes=_RESET_CODE_TTL_MINUTES)
+    conn = sqlite3.connect(_DB_PATH)
+    c = conn.cursor()
+    # Invalidate previous unused codes for this user
+    c.execute("UPDATE password_reset_codes SET used = 1 WHERE user_id = ? AND used = 0", (user["id"],))
+    c.execute(
+        "INSERT INTO password_reset_codes (created_at, user_id, code_salt, code_hash, expires_at, attempts, used) VALUES (?, ?, ?, ?, ?, 0, 0)",
+        (_utc_now_str(), user["id"], salt.hex(), code_hash.hex(), expires.isoformat()),
+    )
+    conn.commit()
+    conn.close()
+    return True, email, code
+def reset_password_with_code(identifier: str, code: str, new_password: str):
+    """
+    Verify reset code and update password.
+    Only the most recent reset code of the user is considered. The code is
+    rejected when it is already used, expired, or has reached the attempt
+    limit; a wrong guess increments the attempt counter.
+    Returns: (ok: bool, msg: str)
+    """
+    user = _get_user_by_identifier(identifier)
+    if not user:
+        # Same message as for a wrong or expired code.
+        return False, "Mã không hợp lệ hoặc đã hết hạn."
+    code = (code or "").strip()
+    if not code or not new_password:
+        return False, "Thiếu mã hoặc mật khẩu mới."
+    if len(new_password) < 6:
+        return False, "Password tối thiểu 6 ký tự."
+    conn = sqlite3.connect(_DB_PATH)
+    c = conn.cursor()
+    # Fetch the newest reset-code row for this user.
+    c.execute(
+        """
+        SELECT id, code_salt, code_hash, expires_at, attempts, used
+        FROM password_reset_codes
+        WHERE user_id = ?
+        ORDER BY id DESC
+        LIMIT 1
+        """,
+        (user["id"],),
+    )
+    row = c.fetchone()
+    if not row:
+        conn.close()
+        return False, "Mã không hợp lệ hoặc đã hết hạn."
+    reset_id, salt_hex, hash_hex, expires_at, attempts, used = row
+    if used:
+        conn.close()
+        return False, "Mã không hợp lệ hoặc đã hết hạn."
+    try:
+        expires_dt = datetime.fromisoformat(expires_at)
+    except Exception:
+        # An unparsable expiry is treated as already expired.
+        expires_dt = datetime.utcnow() - timedelta(days=1)
+    if datetime.utcnow() > expires_dt:
+        # Expired: mark the code as used so it cannot be tried again.
+        c.execute("UPDATE password_reset_codes SET used = 1 WHERE id = ?", (reset_id,))
+        conn.commit()
+        conn.close()
+        return False, "Mã không hợp lệ hoặc đã hết hạn."
+    if attempts >= _RESET_MAX_ATTEMPTS:
+        # Attempt limit reached: burn the code and ask for a new one.
+        c.execute("UPDATE password_reset_codes SET used = 1 WHERE id = ?", (reset_id,))
+        conn.commit()
+        conn.close()
+        return False, "Bạn đã nhập sai quá nhiều lần. Vui lòng yêu cầu mã mới."
+    salt = bytes.fromhex(salt_hex)
+    expected = bytes.fromhex(hash_hex)
+    actual = _hash_reset_code(code, salt)
+    # Constant-time comparison of the stored and the freshly derived digest.
+    if not hmac.compare_digest(expected, actual):
+        c.execute("UPDATE password_reset_codes SET attempts = attempts + 1 WHERE id = ?", (reset_id,))
+        conn.commit()
+        conn.close()
+        return False, "Mã không hợp lệ hoặc đã hết hạn."
+    # Update password with a fresh salt, then mark the code as used in the same commit.
+    pwd_salt = os.urandom(16)
+    pwd_hash = _hash_password(new_password, pwd_salt)
+    c.execute(
+        "UPDATE users SET password_salt = ?, password_hash = ? WHERE id = ?",
+        (pwd_salt.hex(), pwd_hash.hex(), user["id"]),
+    )
+    c.execute("UPDATE password_reset_codes SET used = 1 WHERE id = ?", (reset_id,))
+    conn.commit()
+    conn.close()
+    return True, "Đổi mật khẩu thành công. Bạn có thể đăng nhập lại."

groq_summarizer.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from groq import Groq
+class GroqSummarizer:
+    def __init__(self, api_key):
+        self.client = Groq(api_key=api_key)
+        # Sử dụng model Llama 3 mới nhất, rất giỏi tiếng Việt
+        self.model = "llama-3.3-70b-versatile"
+    def summarize(self, text, max_words=100):
+        prompt = f"""
+        Bạn là một chuyên gia tóm tắt văn bản tiếng Việt.
+        Nhiệm vụ: Tóm tắt văn bản dưới đây một cách súc tích, khoảng {max_words} từ.
+        Yêu cầu: Giữ lại thông tin quan trọng nhất, hành văn tự nhiên.
+        Văn bản cần tóm tắt:
+        {text}
+        """
+        try:
+            completion = self.client.chat.completions.create(
+                model=self.model,
+                messages=[{"role": "user", "content": prompt}],
+                temperature=0.5,
+                max_tokens=1024
+            )
+            return completion.choices[0].message.content.strip()
+        except Exception as e:
+            return f"⚠️ Lỗi Groq API: {str(e)}"

requirements.txt ADDED Viewed

Binary file (298 Bytes). View file

summarizer_ai.py ADDED Viewed

	@@ -0,0 +1,69 @@

+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+import config
+import torch
+class TextSummarizer:
+    """Abstractive summarizer backed by the seq2seq checkpoint named in ``config.MODEL_NAME``.
+    The tokenizer and the model are loaded once in ``__init__`` and the model is moved to
+    CUDA when ``torch.cuda.is_available()`` reports a GPU; otherwise it stays on the CPU.
+    """
+    # Characters accepted as a proper ending of the generated summary.
+    _SENTENCE_ENDINGS = ('.', '!', '?', '"', '”')
+    # If the last sentence terminator is closer than this many characters to the end of the
+    # text, the dangling fragment after it is cut off; otherwise "..." is appended instead.
+    _MAX_DANGLING_TAIL = 30
+    def __init__(self):
+        """Load tokenizer and model and place the model on the best available device."""
+        # Do not print non-ASCII text from here: Windows consoles (cp1252) can crash Streamlit on it.
+        # T5 Vietnamese models use SentencePiece (`spiece.model`). Force the slow tokenizer to avoid
+        # the tiktoken conversion path that can mis-detect and crash on Windows.
+        self.tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME, use_fast=False)
+        self.model = AutoModelForSeq2SeqLM.from_pretrained(config.MODEL_NAME)
+        # Use the GPU (CUDA) when one is available so inference runs faster.
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.model.to(self.device)
+    def summarize(self, text, max_len=100):
+        """Return an abstractive summary of ``text``.
+        Args:
+            text: Source text. Empty or whitespace-only input returns ``""`` without running the model.
+            max_len: Target summary length in tokens. Generation is allowed to produce between
+                ``max(20, max_len - 30)`` and ``max_len + 40`` tokens.
+        Returns:
+            The decoded summary, post-processed so that it ends on sentence punctuation or on "...".
+        """
+        # Nothing to summarize: skip the model instead of letting it invent text from the bare prompt.
+        if not text or not str(text).strip():
+            return ""
+        # Task prefix that steers the model towards Vietnamese summarization.
+        prompt_text = f"vietnamese summarization: {text}"
+        inputs = self.tokenizer(
+            prompt_text,
+            max_length=1024,
+            return_tensors="pt",
+            truncation=True
+        ).to(self.device)  # Keep the input tensors on the same device as the model.
+        # Leave some slack around the requested length so beam search can pick better wording.
+        min_target = max(20, max_len - 30)
+        # Guard against a very small/negative max_len making min_length exceed max_length.
+        max_target = max(max_len + 40, min_target + 1)
+        summary_ids = self.model.generate(
+            inputs["input_ids"],
+            # Pass the tokenizer's mask explicitly instead of letting generate() infer it.
+            attention_mask=inputs.get("attention_mask"),
+            max_length=max_target,
+            min_length=min_target,
+            num_beams=5,               # Wider beam to search for a more coherent output.
+            length_penalty=1.2,        # Favour summaries that are not too short.
+            no_repeat_ngram_size=3,    # Never repeat the same 3-token sequence.
+            repetition_penalty=2.5,    # Strongly penalise repeating earlier tokens.
+            early_stopping=True,
+            forced_eos_token_id=self.tokenizer.eos_token_id
+        )
+        summary_text = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+        return self._finish_sentence(summary_text)
+    @staticmethod
+    def _finish_sentence(summary_text):
+        """Trim or mark a summary that stops in the middle of a sentence."""
+        summary_text = summary_text.strip()
+        # An empty result stays empty (it must not become a bare "..."); a properly ended one is kept.
+        if not summary_text or summary_text.endswith(TextSummarizer._SENTENCE_ENDINGS):
+            return summary_text
+        # Position of the last '.', '!' or '?' (-1 when there is none).
+        last_stop = max(summary_text.rfind(mark) for mark in ('.', '!', '?'))
+        if last_stop != -1 and len(summary_text) - last_stop < TextSummarizer._MAX_DANGLING_TAIL:
+            # Short dangling fragment: drop it and end on the previous complete sentence.
+            return summary_text[:last_stop + 1]
+        # No nearby terminator: add an ellipsis to signal that the thought continues.
+        return summary_text + "..."

text_cleaner.py ADDED Viewed

	@@ -0,0 +1,17 @@

+import re
+class TextPreprocessor:
+    """Light text normaliser: strips HTML markup and collapses whitespace."""
+    # Matches HTML comments, declarations and opening/closing tags. A tag must start with a letter
+    # (or "!") directly after "<", so plain comparisons such as "3 < 5 and 7 > 2" are left intact.
+    _HTML_TAG_RE = re.compile(r'<!--.*?-->|</?[A-Za-z!][^>]*>', re.DOTALL)
+    # Any run of spaces, tabs or line breaks.
+    _WHITESPACE_RE = re.compile(r'\s+')
+    def clean_text(self, text):
+        """Return ``text`` without HTML tags and with whitespace runs collapsed to one space.
+        Args:
+            text: Raw input text; ``None`` or an empty string yields ``""``.
+        Returns:
+            The cleaned text with leading and trailing whitespace removed.
+        """
+        if not text:
+            return ""
+        # 1. Remove HTML tags first (e.g. markup copied along from a web page), so that the
+        #    whitespace left around a removed tag is still normalised by the next step.
+        text = self._HTML_TAG_RE.sub('', text)
+        # 2. Collapse repeated spaces, line breaks and tabs into a single space.
+        text = self._WHITESPACE_RE.sub(' ', text)
+        # NOTE: punctuation (, . - / %) is deliberately kept so that dates and scores
+        # do not get glued together; no generic special-character stripping here.
+        return text.strip()

textrank_summarizer.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import nltk
+from nltk.tokenize import sent_tokenize
+from sklearn.feature_extraction.text import TfidfVectorizer
+import networkx as nx
+class TextRankSummarizer:
+    """Extractive summarizer (PageRank over TF-IDF sentence similarity) and TF-IDF keyword extractor."""
+    # Basic Vietnamese stop words ignored when building TF-IDF vectors.
+    _STOPWORDS = [
+        "là", "và", "thì", "mà", "của", "các", "có", "để", "những", "một",
+        "trong", "với", "cho", "không", "này", "được", "về", "từ", "khi",
+        "đã", "đang", "sẽ", "như", "hay", "hoặc", "tại", "nó", "bởi", "ra", "vào"
+    ]
+    # Extra filler words that are additionally ignored for keyword extraction only.
+    _KEYWORD_EXTRA_STOPWORDS = [
+        "nhưng", "cũng", "việc", "đến", "ngày", "năm", "người", "theo", "sau"
+    ]
+    def __init__(self):
+        """Make sure the NLTK sentence-tokenizer data is present, downloading it when missing."""
+        # Newer NLTK releases need "punkt_tab" in addition to "punkt".
+        try:
+            nltk.data.find('tokenizers/punkt')
+            nltk.data.find('tokenizers/punkt_tab')
+        except LookupError:
+            nltk.download('punkt')
+            nltk.download('punkt_tab')
+    def summarize(self, text, num_sentences=2):
+        """Return the ``num_sentences`` highest-ranked sentences of ``text`` joined by spaces.
+        Args:
+            text: Source text; ``None`` or an empty string yields ``""``.
+            num_sentences: Number of sentences to keep.
+        Returns:
+            ``text`` unchanged when it has at most ``num_sentences`` sentences; otherwise the
+            selected sentences in descending score order.
+        """
+        if not text:
+            return ""
+        # 1. Split the text into individual sentences.
+        sentences = sent_tokenize(text)
+        if len(sentences) <= num_sentences:
+            return text
+        # 2. TF-IDF sentence vectors with the Vietnamese stop words removed.
+        vectorizer = TfidfVectorizer(stop_words=self._STOPWORDS)
+        try:
+            X = vectorizer.fit_transform(sentences)
+        except ValueError:
+            # scikit-learn raises ValueError for an empty vocabulary (e.g. only stop words or
+            # single characters); nothing can be ranked, so fall back to the leading sentences.
+            return " ".join(sentences[:num_sentences])
+        # 3. Pairwise sentence similarity as the dot product of the TF-IDF rows.
+        similarity_matrix = (X * X.T).toarray()
+        # 4. Build the similarity graph and score every sentence with PageRank.
+        nx_graph = nx.from_numpy_array(similarity_matrix)
+        scores = nx.pagerank(nx_graph)
+        # 5. Rank the sentences by score and keep the best ones.
+        ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
+        top_sentences = [s for _score, s in ranked_sentences[:num_sentences]]
+        return " ".join(top_sentences)
+    def extract_keywords(self, text, num_keywords=5):
+        """Return up to ``num_keywords`` terms of ``text`` with the highest TF-IDF weight.
+        Args:
+            text: Source text.
+            num_keywords: Maximum number of keywords; zero or a negative value yields ``[]``.
+        Returns:
+            Keywords ordered from highest to lowest weight, or ``[]`` when extraction fails.
+        """
+        # Without this guard a count of 0 would slice with [-0:] and return every word.
+        if not text or num_keywords <= 0:
+            return []
+        try:
+            # Score the words of the single document with TF-IDF, ignoring the extended stop words.
+            vectorizer = TfidfVectorizer(
+                stop_words=self._STOPWORDS + self._KEYWORD_EXTRA_STOPWORDS
+            )
+            X = vectorizer.fit_transform([text])
+            words = vectorizer.get_feature_names_out()
+            scores = X.toarray()[0]
+            # Indices of the highest scores, best first.
+            top_indices = scores.argsort()[-num_keywords:][::-1]
+            return [words[i] for i in top_indices]
+        except Exception:
+            # Deliberate best effort: keywords are optional, so any failure
+            # (e.g. an empty vocabulary) simply yields no keywords.
+            return []