import heapq
from collections import Counter, defaultdict

import pandas as pd
import streamlit as st
|
|
class Node:
    """A node of a Huffman tree.

    Attributes:
        symbol: The value stored at this node, or ``None`` (the default)
            when no symbol is attached.
        count: The weight of the node (defaults to 0).
        left: Left child, initially ``None``.
        right: Right child, initially ``None``.

    Nodes are ordered by ``count`` only, which is what lets them be stored
    directly in a ``heapq`` priority queue.
    """

    def __init__(self, symbol=None, count=0):
        self.symbol = symbol
        self.count = count
        self.left = None
        self.right = None

    def __lt__(self, other):
        """Return True when this node's count is smaller than *other*'s."""
        # Let Python fall back to the reflected operation / raise TypeError
        # for foreign operands instead of failing with AttributeError.
        if not isinstance(other, Node):
            return NotImplemented
        return self.count < other.count
|
|
def build_frequency_table(data):
    """Count how many times each symbol occurs in *data*.

    Args:
        data: An iterable of hashable symbols, e.g. a string.

    Returns:
        A ``collections.Counter`` (a ``dict`` subclass) mapping each distinct
        symbol to its number of occurrences. Keys appear in first-seen
        order, and looking up an absent symbol yields 0.
    """
    return Counter(data)
|
|
def build_huffman_tree(frequency_table):
    """Build a Huffman tree from a symbol -> count mapping.

    Args:
        frequency_table: Mapping whose ``items()`` yields
            ``(symbol, count)`` pairs.

    Returns:
        The root ``Node`` of the tree, or ``None`` when *frequency_table*
        is empty (there is nothing to build a tree from).
    """
    heap = []
    for symbol, count in frequency_table.items():
        heapq.heappush(heap, Node(symbol, count))

    # Repeatedly merge the two lowest-count nodes under a new symbol-less
    # parent until a single root remains.
    while len(heap) > 1:
        left = heapq.heappop(heap)
        right = heapq.heappop(heap)
        parent = Node(count=left.count + right.count)
        parent.left = left
        parent.right = right
        heapq.heappush(heap, parent)

    # An empty table leaves the heap empty; indexing it would raise
    # IndexError, so report "no tree" explicitly instead.
    return heap[0] if heap else None
|
|
def huffman_compress(data):
    """Huffman-encode *data*.

    Args:
        data: The sequence of symbols to encode, e.g. a string.

    Returns:
        A ``(code_table, compressed_data)`` tuple. ``code_table`` maps each
        distinct symbol to its code string, and ``compressed_data`` is the
        concatenation of the codes of every symbol of *data*, in order.
        Empty input yields ``({}, "")``.
    """
    if not data:
        return {}, ""

    frequency_table = build_frequency_table(data)
    huffman_tree = build_huffman_tree(frequency_table)

    code_table = {}
    build_code_table(huffman_tree, '', code_table)

    # Join once instead of growing the string with += for every symbol.
    compressed_data = ''.join(code_table[char] for char in data)
    return code_table, compressed_data
|
|
def build_code_table(node, code, code_table):
    """Fill *code_table* with the code of every symbol below *node*.

    Walks the tree depth-first, appending ``'0'`` when descending to the
    left child and ``'1'`` when descending to the right child.

    Args:
        node: Current tree node (an object with ``symbol``, ``left`` and
            ``right`` attributes), or ``None``, in which case nothing is
            recorded.
        code: The code prefix accumulated on the way down to *node*.
        code_table: Dict updated in place with ``symbol -> code`` entries.

    Returns:
        None. Results are written into *code_table*.
    """
    if node is None:
        return
    if node.symbol is not None:
        # A leaf reached with an empty prefix is the root itself, i.e. the
        # input had one distinct symbol. Give it a 1-bit code so the encoded
        # output is not empty and the input length stays recoverable.
        code_table[node.symbol] = code or '0'
        return
    build_code_table(node.left, code + '0', code_table)
    build_code_table(node.right, code + '1', code_table)
|
|
# --- Streamlit page: upload a text file and display its Huffman encoding ---

# Named `uploaded_file` rather than `input` so the builtin is not shadowed.
uploaded_file = st.file_uploader("Chose your txt file")

text_data = ''
if uploaded_file is not None:
    try:
        # getvalue() returns the whole upload regardless of the current
        # stream position.
        text_data = uploaded_file.getvalue().decode('utf-8')
    except UnicodeDecodeError:
        # A non-UTF-8 upload should not crash the page; report it and fall
        # through to the "no input" branch below.
        st.error("The uploaded file is not valid UTF-8 text.")
    else:
        st.header("Input: " + text_data)

huffman_table, encoded_data = huffman_compress(text_data)

# One row per symbol with its code string.
df_huffman = pd.DataFrame(list(huffman_table.items()), columns=['characters', 'code'])

if len(text_data) == 0:
    st.header("Please browse your text file")
else:
    st.dataframe(data=df_huffman, width=1000)
    st.header("Compressed Data:")
    st.header(encoded_data)
    # Compare sizes in bits: 8 bits per UTF-8 byte of the input versus one
    # bit per '0'/'1' character of the encoded string.
    original_size = len(text_data.encode('utf-8')) * 8
    compressed_size = len(encoded_data)
    # Percentage of the original size that was saved.
    compression_ratio = (1 - (compressed_size / original_size)) * 100
    # Label is Vietnamese for "Compression efficiency".
    st.header("Hiệu suất nén: " + str(compression_ratio))
|
|
| |