Tarandeep Singh committed on
Commit
16da076
·
1 Parent(s): 2b8afcb

first commit

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__/*
Dockerfile ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# You will also find guides on how best to write your Dockerfile.

FROM python:3.9

WORKDIR /code

# Copy and install the dependency list before the rest of the sources so the
# install layer can be reused when only application code changes.
COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

COPY . .

# The application object is the Flask (WSGI) instance `app` defined in app.py.
# The previous command launched `uvicorn app.main:app`, but uvicorn is an ASGI
# server that is not listed in requirements.txt and there is no app/main.py
# module, so the container could not start. Use Flask's own CLI instead.
CMD ["flask", "--app", "app", "run", "--host", "0.0.0.0", "--port", "7860"]
README copy.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ n_gram_app generates word sequences in the style of William Blake's poems, using data obtained from the Gutenberg corpus file
2
+ blake-poems.txt
3
+
4
+ Steps to the UI:
5
+ 1. Run app.py
6
+ 2. Open web browser on http://127.0.0.1:5000/
7
+ 3. Enter inputs
app.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, render_template, request,jsonify
2
+ from modules.ngram_models_utils import generate_sentence, get_probability, preprocess_new, probability_helper, predict,create_ngrams
3
+ import pickle
4
+ from collections import Counter
5
+
6
app = Flask(__name__)

# Load the data from the pickle file.
# open_resource() resolves the path against the application's root folder
# instead of the current working directory, so the app starts regardless of
# where it is launched from (it opens the file in binary mode by default).
# NOTE(review): pickle.load can execute arbitrary code contained in the file.
# That is acceptable only because this file ships with the repository; never
# point this at user-supplied data.
with app.open_resource('static/file_text.pkl') as pickle_file:
    data = pickle.load(pickle_file)

# Corpus entry used by the generator; presumably the token list produced by
# preprocess_new() for Blake's poems -- confirm against how the pickle was built.
blake = data['blake-poems.txt']
13
+
14
@app.route('/')
def index():
    """Serve the landing page that hosts the poem-generation form."""
    page = render_template('index.html')
    return page
17
+
18
@app.route('/generate_sequence', methods=['POST'])
def generate_sequence():
    """Generate a word sequence from the submitted form and return it as JSON.

    Expected form fields:
        initial_sequence: seed text the generated words are appended to
        n_grams: n-gram size, an integer >= 1
        sentence_length: number of words to generate, an integer >= 0

    Returns:
        ``{"sequence": <generated text>}`` on success, or
        ``{"error": <message>}`` with HTTP status 400 when a field is missing
        or is not a usable integer.
    """
    try:
        initial_sequence = request.form['initial_sequence']
        n_grams = int(request.form['n_grams'])
        sentence_length = int(request.form['sentence_length'])
    except (KeyError, ValueError):
        # A missing field raises a KeyError subclass; an empty or non-numeric
        # value makes int() raise ValueError. Both are client errors, so
        # report them as 400 instead of letting them surface as a 500.
        return jsonify({
            'error': 'initial_sequence, n_grams and sentence_length are '
                     'required; n_grams and sentence_length must be integers'
        }), 400

    if n_grams < 1 or sentence_length < 0:
        return jsonify({
            'error': 'n_grams must be at least 1 and sentence_length must '
                     'not be negative'
        }), 400

    # Build the add-one smoothed n-gram table for the requested n-gram size.
    probs_blake = get_probability(blake, n_grams, type="smooth")

    output_sequence = generate_sentence(
        probs_blake, initial_sequence, n_grams, sentence_length
    )

    return jsonify({'sequence': output_sequence})
33
+
34
if __name__ == '__main__':
    import os

    # The Werkzeug debugger lets anyone who can reach the server run arbitrary
    # Python code, so debug mode is opt-in (FLASK_DEBUG=1) rather than always
    # enabled.
    app.run(debug=os.environ.get('FLASK_DEBUG') == '1')
modules/ngram_models_utils.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import nltk, string
4
+ from nltk.corpus import gutenberg
5
+ from collections import Counter
6
+ from nltk.tokenize import word_tokenize
7
+ import random, string
8
+
9
def preprocess_new(text):
    """
    Normalise raw text and split it into lower-cased tokens.

    text: raw text to process
    return: list of lower-cased tokens; commas are kept, every other
            punctuation character is removed

    The previous version ignored ``text`` and always re-read
    'blake-poems.txt' from the Gutenberg corpus, so every input produced
    the same tokens.
    """
    # Collapse all runs of whitespace (including newlines) to single spaces
    text = ' '.join(text.split())

    # Remove punctuation except for commas
    punctuation_to_remove = string.punctuation.replace(',', '')  # Keep commas
    text = text.translate(str.maketrans('', '', punctuation_to_remove))

    # Tokenize and lower case
    return [word.lower() for word in word_tokenize(text)]
21
+
22
def create_ngrams(tokens, n):
    """
    Return every contiguous n-gram of ``tokens`` as a tuple, in order.

    tokens: sequence of tokens
    n: n-gram size (expected to be >= 1)
    return: list of n-tuples; empty when ``tokens`` has fewer than n items

    A sequence of length L contains L - n + 1 n-grams. The previous loop
    stopped at L - n and silently dropped the final n-gram.
    """
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
27
+
28
def probability_helper(sample, n):
    """
    Count the n-grams of a token list and tabulate them.

    sample: list of tokens
    n: n-gram size
    return: tuple ``(df, df_sorted)``
        df        -- one row per distinct n-gram with columns
                     'sequence' (the first n-1 tokens joined by spaces),
                     'count' (occurrences) and 'nth_word' (the last token)
        df_sorted -- the same rows ordered by 'nth_word', with two extra
                     columns: 'sequence_id' (1-based row position after
                     sorting by 'sequence') and 'prediction_id' (1-based row
                     position after sorting by 'nth_word'). Both ids number
                     rows, so they are not shared between equal values.
    """
    # get frequency of every distinct n-gram (create_ngrams yields tuples)
    ngram_frequency = Counter(create_ngrams(sample, n))

    df = pd.DataFrame({
        'sequence': list(ngram_frequency.keys()),
        'count': list(ngram_frequency.values()),
    })

    # split each n-gram: the nth word is the prediction target ...
    df['nth_word'] = df['sequence'].apply(lambda ngram: ngram[-1])

    # ... and the first n-1 words, space separated, are its context.
    # Joining directly keeps ',' tokens intact; the previous implementation
    # joined with commas and then replaced every comma with a space, which
    # erased comma tokens from the context key.
    df['sequence'] = df['sequence'].apply(lambda ngram: ' '.join(ngram[:-1]))

    # get ids for sequences and predictions
    df_sorted = df.sort_values(by='sequence')
    df_sorted['sequence_id'] = range(1, len(df_sorted) + 1)
    df_sorted = df_sorted.sort_values(by='nth_word')
    df_sorted['prediction_id'] = range(1, len(df_sorted) + 1)

    return df, df_sorted
66
+
67
def get_probability(sample, n, type=None):
    """
    Build the n-gram probability table for a token list.

    sample: list of tokens
    n: n-gram size
    type: None for maximum-likelihood estimates (count / context total), or
          "smooth" for add-one (Laplace) smoothing
    return: the sorted table from probability_helper() with two extra
            columns, 'total' (summed count of all n-grams sharing the row's
            context) and 'probability'
    raises: ValueError for any other ``type`` (this used to surface as an
            UnboundLocalError at the return statement)
    """
    if type is not None and type != "smooth":
        raise ValueError(
            f"type must be None or 'smooth', got {type!r}"
        )

    df, df_sorted = probability_helper(sample, n)

    # total occurrences of each (n-1)-word context
    totals = (
        df.groupby('sequence')['count']
        .sum()
        .reset_index()
        .rename(columns={'count': 'total'})
    )
    df_sorted = df_sorted.merge(totals, how='left', on='sequence')

    if type is None:
        df_sorted['probability'] = df_sorted['count'] / df_sorted['total']
    else:
        # Add-one smoothing adds the vocabulary size (number of distinct
        # words) to the denominator. The previous code used the number of
        # distinct n-grams instead, which is only the same thing for n == 1.
        # Rankings within a context are unaffected because every row of a
        # context shares the same denominator.
        vocabulary_size = df_sorted['nth_word'].nunique()
        df_sorted['probability'] = (
            (df_sorted['count'] + 1) / (df_sorted['total'] + vocabulary_size)
        )

    return df_sorted
83
+
84
def predict(data, sequence):
    """
    Pick the next word for a context from an n-gram probability table.

    data: DataFrame with 'sequence', 'nth_word' and 'probability' columns
          (the result of get_probability())
    sequence: context string; surrounding whitespace is ignored
    return: the 'nth_word' with the highest probability among rows whose
            'sequence' equals the context, or a randomly chosen known word
            when the context does not occur in ``data``

    Only the "context not found" case falls back to a random word. The
    previous bare ``except`` also hid genuine errors such as a missing
    column; those now propagate.
    """
    subset = data[data['sequence'] == sequence.strip()]

    if subset.empty:
        # sequence not detected: choose uniformly among the distinct words
        return random.choice(list(data['nth_word'].unique()))

    # sequence detected: return the word with max probability
    return subset.iloc[subset['probability'].argmax()]['nth_word']
95
+
96
def generate_sentence(data, sequence, n, len):
    """
    Extend a seed text one predicted word at a time.

    data: result of get_probability()
    sequence: seed text; ideally at least n-1 words
    n: n-gram size the table in ``data`` was built with
    len: number of predictions to be made (the name shadows the builtin but
         is kept so keyword callers continue to work)
    return: the stripped seed followed by the predicted words; a predicted
            ',' is attached directly to the preceding word
    """
    sentence = sequence.strip()

    # Context tokens are tracked separately from the display string, in the
    # same form preprocess_new() produces (lower case, ',' as its own token).
    # The previous version re-split the display string, where a comma is
    # glued to the preceding word and so never matched a context again.
    tokens = sentence.lower().replace(',', ' , ').split()
    context_size = n - 1

    for _ in range(len):
        # tokens[-0:] would be the whole list, so a unigram model (n == 1)
        # needs an explicitly empty context.
        if context_size > 0:
            n_minus_1_sequence = ' '.join(tokens[-context_size:])
        else:
            n_minus_1_sequence = ''

        next_word = predict(data, n_minus_1_sequence)
        tokens.append(next_word)

        if next_word != ',':
            sentence = sentence + ' ' + next_word
        else:
            sentence += next_word

    return sentence
113
+
114
+
115
+ '''
116
+ files = gutenberg.fileids()
117
+ text = [gutenberg.raw(fileid) for fileid in gutenberg.fileids()]
118
+ file_text = dict(zip(files, text))
119
+
120
+ for key, value in file_text.items():
121
+ file_text[key] = preprocess_new(value)'''
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Flask==2.2.5
2
+ nltk==3.7
3
+ pandas==1.2.4
static/file_text.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0dc6993bcb5b17cd78fce735fcdef160ea75ab403d0fef8abfc6e668f6aa22bd
3
+ size 1712635
static/style.css ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ body {
2
+ font-family: Arial, sans-serif;
3
+ margin: 0;
4
+ padding: 0;
5
+ background-color: #f0f0f0;
6
+ color: #333;
7
+ }
8
+
9
+ h1 {
10
+ color: #007bff;
11
+ text-align: center;
12
+ }
13
+
14
+ h2 {
15
+ color: #6c757d;
16
+ text-align: center;
17
+ }
templates/.DS_Store ADDED
Binary file (6.15 kB). View file
 
templates/bgimage.jpg ADDED
templates/index.html ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <title>Sequence Generator</title>
6
+ <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
7
+ <style>
8
+ body {
9
+ font-family: 'Times New Roman', serif;
10
+ background-image: url('bgimg.jpg');
11
+ background-size: cover;
12
+ background-attachment: fixed;
13
+ margin: 0;
14
+ padding: 0;
15
+ display: flex;
16
+ justify-content: center;
17
+ align-items: center;
18
+ height: 100vh;
19
+ flex-direction: column;
20
+ color: #3e3e3e;
21
+ }
22
+ h1, h2 {
23
+ text-align: center;
24
+ color: #614532; /* Dark brown color for a rustic look */
25
+ }
26
+ form {
27
+ background-color: rgba(255, 255, 255, 0.8); /* Slightly transparent white */
28
+ padding: 20px;
29
+ border-radius: 8px;
30
+ margin-bottom: 20px;
31
+ }
32
+ input, button {
33
+ display: block;
34
+ width: 100%;
35
+ padding: 10px;
36
+ margin-top: 10px;
37
+ border-radius: 5px;
38
+ border: 1px solid #ddd;
39
+ background: rgba(255, 255, 255, 0.5);
40
+ }
41
+ button {
42
+ background-color: #8a5a44; /* Earthy tone */
43
+ color: white;
44
+ border: none;
45
+ cursor: pointer;
46
+ }
47
+ button:hover {
48
+ background-color: #7d4e3b;
49
+ }
50
+ #result {
51
+ display: none;
52
+ transition: opacity 1s ease-in-out;
53
+ background-color: rgba(255, 255, 255, 0.8); /* Slightly transparent white */
54
+ padding: 20px;
55
+ border-radius: 8px;
56
+ }
57
+ #result.show {
58
+ display: block;
59
+ opacity: 1;
60
+ }
61
+ </style>
62
+ </head>
63
+ <body>
64
+ <h1>Generate a William Blake-like Poem</h1>
65
+ <h2>Rediscover the Beauty of Romanticism Poetry</h2>
66
+ <form id="sequenceForm">
67
+ <input type="text" name="initial_sequence" placeholder="few words">
68
+ <input type="number" name="n_grams" placeholder="n-grams">
69
+ <input type="number" name="sentence_length" placeholder="Poem Length (in words)">
70
+ <button type="submit">Generate Poem</button>
71
+ </form>
72
+ <div id="result">
73
+ <h2>Your Generated Poem:</h2>
74
+ <p id="sequenceOutput"></p>
75
+ </div>
76
+
77
+ <script>
78
// Submit the form via AJAX and show the generated poem (or an error
// message) in the #result panel without reloading the page.
$(document).ready(function() {
    $('#sequenceForm').on('submit', function(e) {
        e.preventDefault(); // Prevent the default form submission
        $.ajax({
            url: '/generate_sequence', // Your Flask endpoint
            type: 'POST',
            data: $(this).serialize(),
            success: function(response) {
                $('#sequenceOutput').text(response.sequence); // 'response.sequence' is the generated text
                $('#result').addClass('show');
            },
            error: function(xhr) {
                // Previously a failed request left the page unchanged, so the
                // user got no feedback. Prefer a JSON "error" message from the
                // server when one is present; otherwise show a generic one.
                var message = (xhr.responseJSON && xhr.responseJSON.error) ||
                    'Could not generate a poem. Please check your inputs and try again.';
                $('#sequenceOutput').text(message);
                $('#result').addClass('show');
            }
        });
    });
});
92
+ </script>
93
+ </body>
94
+ </html>
templates/index_old.html ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <title>William Blake Poem Generator</title>
5
+ <!--<link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='style.css') }}">-->
6
+ </head>
7
+ <body>
8
+ <h1>Using n-grams to generate a william blake-like poem</h1>
9
+ <h2>A Simple Yet Effective Demonstration</h2>
10
+ </body>
11
+ <body>
12
+ <form action="/generate" method="post">
13
+ <input type="text" name="initial_sequence" placeholder="initial_sequence">
14
+ <input type="number" name="n_grams" placeholder="n-grams">
15
+ <input type="number" name="sentence_length" placeholder="Poem Length (in words)">
16
+ <!-- Add more input fields as necessary -->
17
+ <button type="submit">Generate Sentence</button>
18
+ </form>
19
+ </body>
20
+ </html>
templates/result.html ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <title>Sequence Result</title>
5
+ </head>
6
+ <body>
7
+ <p>Generated Sequence: {{ sequence }}</p>
8
+ <a href="/">Try Again</a>
9
+ </body>
10
+ </html>