from transformers import pipeline
# The Trainer imports below are unused in this script but kept from the notebook
from transformers import TrainingArguments, Trainer, AutoModelForSeq2SeqLM

# In[2]:
import re
import string
import pickle
import pandas as pd
import numpy as np
import streamlit as st
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')  # needed by WordNetLemmatizer on newer NLTK releases
# In[47]:
data3 = pd.read_csv('final2.csv')

# In[5]:
data3.info()

# In[6]:
data3.head()

# In[9]:
# Cast the text columns to pandas' string dtype.
# Note: 'discription' is kept as-is because it is the column's actual
# (misspelled) name in final2.csv.
data3['topic'] = data3.topic.astype("string")
data3['discription'] = data3.discription.astype("string")
data3['keyword'] = data3.keyword.astype("string")
data3['level'] = data3.level.astype("string")
data3.info()
# # Data Cleaning Process

# In[10]:
# Fill missing text fields first, so a missing keyword or level does not
# turn the whole combined tag into NaN (string concatenation with a
# missing value propagates the missing value).
data3['topic'] = data3['topic'].fillna('')
data3['discription'] = data3['discription'].fillna('')
data3['keyword'] = data3['keyword'].fillna('')
data3['level'] = data3['level'].fillna('')
data3['tag'] = data3['discription'] + " " + data3['keyword'] + " " + data3['level']
# In[11]:
def remove_symbols(text):
    # Match characters that are not alphanumeric or whitespace
    pattern = r'[^\w\s]'
    # Lowercase the text and substitute matched symbols with an empty string
    return re.sub(pattern, '', text.lower())
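# Quick illustrative check of remove_symbols (commented out so the script
# stays side-effect free on import):
# remove_symbols("Machine-Learning: Basics!")  # -> "machinelearning basics"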
# In[12]:
data3['tag'] = data3['tag'].fillna('')
data3['tag'] = data3['tag'].apply(remove_symbols)
data3['level'] = data3['level'].apply(lambda x: x.replace(" ", ""))
data3.head()
# # Convert tag columns into vector

# In[14]:
cv = CountVectorizer(max_features=5000, stop_words='english')
vector = cv.fit_transform(data3['tag']).toarray()
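# `vector` is a (num_rows x vocabulary) bag-of-words matrix. An optional,
# illustrative way to inspect it (get_feature_names_out needs scikit-learn >= 1.0):
# print(vector.shape)
# print(cv.get_feature_names_out()[:10])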
# In[18]:
ps = PorterStemmer()
# In[30]:
def preprocess_query(query):
    # Lowercase the query
    cleaned_query = query.lower()
    # Remove punctuation (adjust as needed)
    cleaned_query = ''.join([char for char in cleaned_query if char not in string.punctuation])
    # Remove stop words (optional, replace with your own stop word list)
    stop_words = ["the", "a", "is", "in", "of"]
    cleaned_query = ' '.join([word for word in cleaned_query.split() if word not in stop_words])
    # Stemming
    ps = PorterStemmer()
    cleaned_query = ' '.join([ps.stem(word) for word in cleaned_query.split()])
    # Lemmatization (applied to the already-stemmed tokens)
    wnl = WordNetLemmatizer()
    cleaned_query = ' '.join([wnl.lemmatize(word) for word in cleaned_query.split()])
    return cleaned_query
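# Illustrative example of the full preprocessing pipeline (commented out;
# the exact stems can vary slightly across NLTK versions):
# preprocess_query("The Basics of Python Programming!")  # -> "basic python program"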
# In[31]:

# # Find Similarity score for finding most related topic from dataset

# In[24]:
# Pairwise similarity over the count vectors; kept for exploration,
# since the recommender below uses the TF-IDF vectors instead.
similar = cosine_similarity(vector)

# In[27]:
# Example: top-5 rows most similar to row 1
# sorted(list(enumerate(similar[1])), reverse=True, key=lambda x: x[1])[0:5]
# In[29]:
# Note: facebook/bart-base is not fine-tuned for summarization; a
# summarization-tuned checkpoint such as facebook/bart-large-cnn usually
# produces much better summaries.
summarizer = pipeline("summarization", model="facebook/bart-base")
text_generator = pipeline("text-generation", model="gpt2")
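# Both pipelines return a list of dicts (illustrative, commented out):
# summarizer("some long text ...", max_length=60, truncation=True)
# # -> [{"summary_text": "..."}]
# text_generator("some prompt", max_length=30)
# # -> [{"generated_text": "..."}]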
# In[34]:
documents = []
for index, row in data3.iterrows():
    topic_description = preprocess_query(row["topic"])
    keywords = preprocess_query(row["keyword"])
    combined_text = f"{topic_description} {keywords}"  # Combine for TF-IDF
    documents.append(combined_text)
# In[35]:
# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer()
# Fit the vectorizer on the documents
document_vectors = vectorizer.fit_transform(documents)
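# document_vectors is a sparse (num_documents x vocabulary) matrix of
# TF-IDF weights; optional inspection (illustrative):
# print(document_vectors.shape)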
def recommend_from_dataset(query):
    cleaned_query = preprocess_query(query)
    query_vector = vectorizer.transform([cleaned_query])
    # Calculate cosine similarity between the query and every document
    cosine_similarities = cosine_similarity(query_vector, document_vectors)
    similarity_scores = cosine_similarities.flatten()
    # Sort documents by similarity score, highest first
    sorted_results = sorted(zip(similarity_scores, data3.index, range(len(documents))), reverse=True)
    # Keep the top N matches, then drop weak ones below the score threshold
    top_n_results = sorted_results[:5]
    recommendations = []
    for score, document_id, _ in top_n_results:
        topic_name = data3.loc[document_id, "topic"]
        link = data3.loc[document_id, "Links"] if "Links" in data3.columns else "No link available"
        if score >= 0.3:
            recommendations.append({"topic_name": topic_name, "link": link})
    return recommendations
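# Illustrative call (commented out; assumes final2.csv provides a "Links" column):
# recommend_from_dataset("python for beginners")
# # -> [{"topic_name": "...", "link": "..."}, ...] for matches scoring >= 0.3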
# In[45]:
def summarize_and_generate(user_query, recommendations):
    # Summarize the user query
    query_summary = summarizer(user_query, max_length=200, truncation=True)[0]["summary_text"]
    # Generate creative text related to the query; three sequences are
    # generated but only the first is used
    generated_text = text_generator(
        f"Exploring the concept of {user_query}",
        max_length=200,
        num_return_sequences=3,
    )[0]["generated_text"]
    # Collect the related links from the recommendations
    related_links = []
    for recommendation in recommendations:
        related_links.append({"topic": recommendation["topic_name"], "link": recommendation["link"]})
    return {
        "query_summary": query_summary.strip(),
        "generated_text": generated_text.strip(),
        "related_links": related_links
    }
# In[46]:
# user_query = "java"
# recommendations = recommend_from_dataset(user_query)

# # Get the summary, generated text, and related links
# results = summarize_and_generate(user_query, recommendations)
# print(f"Query Summary: {results['query_summary']}")
# print(f"Creative Text: {results['generated_text']}")
# print("Related Links:")
# for link in results["related_links"]:
#     print(f"- {link['topic']}: {link['link']}")
# In[ ]:
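# streamlit is imported above but no UI is wired up in this export; the
# sketch below is an assumption about how the pieces could be connected,
# not the original app's layout. Uncomment to use it as a Streamlit page.
# st.title("Topic Recommender")
# user_query = st.text_input("Enter a topic you want to learn about")
# if user_query:
#     recommendations = recommend_from_dataset(user_query)
#     results = summarize_and_generate(user_query, recommendations)
#     st.subheader("Query Summary")
#     st.write(results["query_summary"])
#     st.subheader("Related Topics")
#     for item in results["related_links"]:
#         st.markdown(f"- [{item['topic']}]({item['link']})")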