shujath000 committed on
Commit
f36dc0e
·
verified ·
1 Parent(s): 7c8b0c4

Upload 5 files

Browse files
Files changed (5) hide show
  1. c_d.csv +0 -0
  2. logistic_models.pkl +3 -0
  3. multilabels.pkl +3 -0
  4. streamlit_app.py +124 -0
  5. tfidf.pkl +3 -0
c_d.csv ADDED
The diff for this file is too large to render. See raw diff
 
logistic_models.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b721e454d2e9e4251456b86182971c0282017377c6d9dcfff79cbc8d5c4d9c85
3
+ size 163858041
multilabels.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f423ab92cb9f10e49ddb3562cff87cc01d5aabe61dfae0cd148e5929f3fe64b
3
+ size 148869
streamlit_app.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import joblib
5
+ import numpy as np
6
+ import string
7
+ import nltk
8
+ from nltk.corpus import stopwords as stp
9
+ from nltk import pos_tag, word_tokenize as w, sent_tokenize as s
10
+ from nltk.stem import WordNetLemmatizer as wl
11
# Directory where the NLTK resources used by the cleaning function are stored.
NLTK_DATA_PATH = "/app/nltk_data"
os.makedirs(NLTK_DATA_PATH, exist_ok=True)

# Kept for any child process / later import that reads the variable.
os.environ["NLTK_DATA"] = NLTK_DATA_PATH

# nltk has already been imported at this point, so its resource search path
# was built before NLTK_DATA was set. Register the directory explicitly,
# otherwise the resources downloaded below are never looked up there.
if NLTK_DATA_PATH not in nltk.data.path:
    nltk.data.path.insert(0, NLTK_DATA_PATH)

# Each resource is fetched once. Both the legacy and the newer resource names
# are requested for the tokenizer and the tagger, as the original script did.
_NLTK_PACKAGES = (
    "punkt_tab",
    "punkt",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
    "wordnet",
    "stopwords",
)
for _nltk_package in _NLTK_PACKAGES:
    nltk.download(_nltk_package, download_dir=NLTK_DATA_PATH, quiet=True)
29
+
30
+ # === Cleaning Function ===
31
def sahi_karneka_function(x):
    """Clean a piece of text down to its lemmatized nouns and verbs.

    Steps, in order:
      1. Split ``x`` into sentences, lower-case each and tokenize it.
      2. Drop tokens that are punctuation or English stop words.
      3. POS-tag the remaining tokens and keep only nouns (``NN*``) and
         verbs (``V*``).
      4. Re-tag the kept words and lemmatize each one as a noun when its tag
         starts with ``NN``, otherwise as a verb.

    Args:
        x: Raw input text.

    Returns:
        The lemmatized words joined by single spaces (an empty string when
        nothing survives the filtering).
    """
    # Build the stop-word set once; the corpus reader re-reads its word list
    # on every ``words()`` call, so doing it per token is very slow.
    stop_words = set(stp.words("english"))

    # Single-pass filter. The punctuation test is deliberately the same
    # substring test (``token in string.punctuation``) as before, so the
    # cleaned text stays identical to what earlier versions produced --
    # presumably the TF-IDF vocabulary was fitted on that exact output.
    tokens = [
        token
        for sentence in s(x)
        for token in w(sentence.lower())
        if token not in string.punctuation and token not in stop_words
    ]

    # Keep nouns and verbs only.
    content_words = [
        word for word, tag in pos_tag(tokens) if tag.startswith(("NN", "V"))
    ]

    # The reduced word list is tagged again (tags can differ once the
    # surrounding words are gone) and that second tag selects the lemma type.
    lemmatizer = wl()
    semi_final_words = [
        lemmatizer.lemmatize(word, pos="n" if tag.startswith("NN") else "v")
        for word, tag in pos_tag(content_words)
    ]
    return " ".join(semi_final_words)
60
+
61
+ # === Load Data and Models ===
62
# Streamlit re-executes this script on every widget interaction. Caching the
# loaders keeps the CSV and the pickled artifacts in memory instead of
# re-reading them from disk on each rerun.


@st.cache_data
def _load_questions():
    """Read the questions table (provides the 'questions_url' column)."""
    return pd.read_csv(r"src/c_d.csv")


@st.cache_resource
def _load_artifacts():
    """Load the classifier, the TF-IDF vectorizer and the label binarizer.

    NOTE(review): joblib.load unpickles arbitrary objects; only load artifact
    files that ship with the app and are trusted.
    """
    return (
        joblib.load("src/logistic_models.pkl"),
        joblib.load("src/tfidf.pkl"),
        joblib.load("src/multilabels.pkl"),
    )


df = _load_questions()
model, tfidf, ml = _load_artifacts()
66
+
67
+ # === Streamlit UI ===
68
st.title("🧠 Multi-Label Question Tag Predictor")

# --- Select a URL for context ---
selected_url = st.selectbox("Select a question URL (for context):", df['questions_url'])
st.markdown(f"🔗 [Open selected question]({selected_url})")

# --- Session State ---
if "user_input" not in st.session_state:
    st.session_state["user_input"] = ""
if "clear_input" not in st.session_state:
    st.session_state["clear_input"] = False

# --- Clear input if flagged ---
# This must run BEFORE the text area is created: Streamlit does not allow
# assigning to a widget's session-state key once the widget exists in the run.
if st.session_state.clear_input:
    st.session_state.user_input = ""
    st.session_state.clear_input = False

# --- Input box ---
st.text_area("✍️ Type your question here:", key="user_input", height=150)


def _request_clear():
    """Button callback: ask the next script run to empty the text area."""
    st.session_state["clear_input"] = True


# --- Predict button ---
if st.button("Predict Tags"):
    final_question = st.session_state.user_input.strip()

    if not final_question:
        st.warning("⚠️ Please enter a question.")
    else:
        with st.spinner("🔍 Predicting tags..."):
            # Step 1: Clean input
            cleaned = sahi_karneka_function(final_question)

            # Step 2: TF-IDF (the vectorizer expects an iterable of documents)
            x_tfidf = tfidf.transform([cleaned])

            # Step 3: Predict.
            # predict_proba is iterated per label here; each entry is indexed
            # as [:, 1], i.e. the probability of the positive class. Stacking
            # and transposing gives one row per input document.
            y_probs = model.predict_proba(x_tfidf)
            threshold = 0.55
            probs_column1 = np.array([label_probs[:, 1] for label_probs in y_probs]).T
            y_pred = (probs_column1 >= threshold).astype(int)

            # Step 4: Decode the thresholded predictions. Previously the
            # un-thresholded model.predict() output was decoded, which left
            # the threshold above without any effect.
            predicted_tags = ml.inverse_transform(y_pred)

        # Step 5: Display results
        st.success("✅ Predicted Tags:")
        if predicted_tags and predicted_tags[0]:
            for tag in predicted_tags[0]:
                st.markdown(f"🔹 **`{tag}`**")
        else:
            st.info("No tags matched the threshold.")

# Step 6: "Clear" button.
# Kept at the top level with a callback: a button nested under the "Predict"
# branch disappears on the rerun its own click triggers, and writing to
# st.session_state.user_input after the text area exists raises an error.
st.button("Clear Input", on_click=_request_clear)
tfidf.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ae3d0aac7f2381331f394013961dec02a2ca090c88ff16234d5225beed994a9
3
+ size 107299