Spaces:
Sleeping
Sleeping
| from flask import Flask, request, render_template, jsonify | |
| import pandas as pd | |
| import requests | |
| import os | |
| import re | |
| import networkx as nx | |
| from nltk.tokenize import word_tokenize, sent_tokenize | |
| from nltk.corpus import stopwords | |
| from bs4 import BeautifulSoup | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import matplotlib.pyplot as plt | |
| import nltk | |
| # Inisialisasi NLTK | |
| nltk.download("stopwords") | |
| nltk.download("punkt") | |
| nltk.download('punkt_tab') | |
| # Inisialisasi Flask | |
| app = Flask(__name__) | |
| # Fungsi untuk scraping berita | |
| def scrape_news(url): | |
| isi = [] | |
| judul = [] | |
| headers = { | |
| "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" | |
| } | |
| try: | |
| response = requests.get(url, headers=headers) | |
| response.raise_for_status() | |
| article_full = BeautifulSoup(response.content, "html.parser") | |
| judul_artikel = article_full.find("h1", class_="mb-4 text-32 font-extrabold") | |
| if judul_artikel: | |
| judul_artikel = judul_artikel.text.strip() | |
| else: | |
| judul_artikel = "Judul tidak ditemukan" | |
| artikel_element = article_full.find("div", class_="detail-text") | |
| if artikel_element: | |
| artikel_teks = [p.get_text(strip=True) for p in artikel_element.find_all("p")] | |
| artikel_content = "\n".join(artikel_teks) | |
| else: | |
| artikel_content = "Konten artikel tidak ditemukan" | |
| isi.append(artikel_content) | |
| judul.append(judul_artikel) | |
| except requests.exceptions.RequestException as e: | |
| judul.append("Error") | |
| isi.append(f"Gagal mengambil data: {e}") | |
| return pd.DataFrame({"judul": judul, "isi": isi}) | |
| # Fungsi preprocessing | |
| def preprocess_text(content): | |
| content = content.lower() | |
| content = re.sub(r"[0-9]|[/(){}\[\]\|@,;_]|[^a-z .]+", " ", content) | |
| content = re.sub(r"\s+", " ", content).strip() | |
| tokens = word_tokenize(content) | |
| stopword = set(stopwords.words("indonesian")) | |
| tokens = [word for word in tokens if word not in stopword] | |
| return " ".join(tokens) | |
| # Fungsi untuk membuat ringkasan dan visualisasi graf | |
| def summarize_and_visualize(content): | |
| kalimat = sent_tokenize(content) | |
| preprocessed_text = preprocess_text(content) | |
| kalimat_preprocessing = sent_tokenize(preprocessed_text) | |
| # TF-IDF dan cosine similarity | |
| tfidf_vectorizer = TfidfVectorizer() | |
| tfidf_matrix = tfidf_vectorizer.fit_transform(kalimat_preprocessing) | |
| cossim_prep = cosine_similarity(tfidf_matrix, tfidf_matrix) | |
| # Analisis jaringan dengan NetworkX | |
| G = nx.DiGraph() | |
| for i in range(len(cossim_prep)): | |
| G.add_node(i) | |
| for j in range(len(cossim_prep)): | |
| if cossim_prep[i][j] > 0.1 and i != j: | |
| G.add_edge(i, j) | |
| # Hitung closeness centrality dan buat ringkasan | |
| closeness_scores = nx.closeness_centrality(G) | |
| sorted_closeness = sorted(closeness_scores.items(), key=lambda x: x[1], reverse=True) | |
| ringkasan = " ".join(kalimat[node] for node, _ in sorted_closeness[:3]) | |
| # Visualisasi graf | |
| plt.figure(figsize=(10, 8)) | |
| pos = nx.spring_layout(G, k=2) | |
| nx.draw_networkx_nodes(G, pos, node_size=500, node_color="b") | |
| nx.draw_networkx_edges(G, pos, edge_color="red", arrows=True) | |
| nx.draw_networkx_labels(G, pos, font_size=10) | |
| plt.title("Graph Representation of Sentence Similarity") | |
| # Periksa apakah file graph.png sudah ada | |
| graph_path = "static/graph.png" | |
| if os.path.exists(graph_path): | |
| os.remove(graph_path) # Hapus file jika sudah ada | |
| # Simpan graf sebagai file baru | |
| plt.savefig(graph_path) | |
| plt.close() | |
| return ringkasan | |
| # Route utama untuk scraping dan analisis | |
| def index(): | |
| if request.method == "POST": | |
| url = request.form.get("url") | |
| if url: | |
| # Scraping berita | |
| df = scrape_news(url) | |
| if not df.empty: | |
| content = df["isi"].iloc[0] | |
| title = df["judul"].iloc[0] | |
| # Preprocessing, summarizing, and visualizing | |
| ringkasan = summarize_and_visualize(content) | |
| return render_template("result.html", title=title, content=content, summary=ringkasan, graph_url="static/graph.png") | |
| else: | |
| return render_template("summary.html", error="Gagal mengambil data dari URL.") | |
| else: | |
| return render_template("summary.html", error="URL tidak boleh kosong.") | |
| return render_template("summary.html") | |
| # Menjalankan aplikasi Flask | |
| if __name__ == "__main__": | |
| app.run(debug=True, port=5002) | |