In [1]:
from transformers import pipeline
from transformers import TrainingArguments, Trainer, AutoModelForSeq2SeqLM

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer

In [47]:

data3 = pd.read_csv('final2.csv')

In [5]:
data3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3720 entries, 0 to 3719
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   3720 non-null   int64 
 1   topic        3720 non-null   object
 2   discription  1748 non-null   object
 3   keyword      3204 non-null   object
 4   Links        3720 non-null   object
 5   level        3720 non-null   object
dtypes: int64(1), object(5)
memory usage: 174.5+ KB


In [6]:
data3.head()

Unnamed: 0.1,Unnamed: 0,topic,discription,keyword,Links,level
0,0,Java,Java is a general-purpose computer programming...,"Java, James Gosling, website, wikipedia, docum...","website: https://oracle.com/java/, documentati...",beginner to advance
1,1,JavaScript,"JavaScript (), often abbreviated as JS, is a h...","JavaScript, Brendan Eich, reference, wikipedia...",reference: https://www.w3schools.com/js/js_res...,beginner to advance
2,2,C,"C (, as in the letter c) is a general-purpose,...","C, Dennis Ritchie, reference, wikipedia, docum...",reference: http://www.c4learn.com/c-programmin...,beginner to advance
3,3,Python,Python is a widely used high-level programming...,"Python, Guido van Rossum, website, reference, ...","website: https://www.python.org/, reference: h...",beginner to advance
4,4,SQL,SQL ( ( listen) ESS-kew-EL or ( listen) SEE-k...,"SQL, Donald D. Chamberlin and Raymond F. Boyce...",documentation: https://docs.data.world/documen...,beginner to advance


In [9]:
data3['topic'] = data3.topic.astype("string")
data3['discription'] = data3.discription.astype("string")
data3['keyword'] = data3.keyword.astype("string")
data3['level'] = data3.level.astype("string")
data3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3720 entries, 0 to 3719
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   3720 non-null   int64 
 1   topic        3720 non-null   string
 2   discription  1748 non-null   string
 3   keyword      3720 non-null   string
 4   Links        3720 non-null   object
 5   level        3720 non-null   string
dtypes: int64(1), object(1), string(4)
memory usage: 174.5+ KB


# Data Cleaning Process
'
'


In [10]:
data3['tag'] = data3['discription'] + " " + data3['keyword'] +" " + data3['level']

In [11]:
def remove_symbols(text):
  # Create a regular expression pattern to match unwanted symbols
    pattern = r'[^\w\s]'  # Matches characters that are not alphanumeric or whitespace
  # Substitute matched symbols with an empty string
    return re.sub(pattern, '', text.lower()) 

In [12]:
data3['tag'] = data3['tag'].fillna('')
data3['tag'] = data3['tag'].apply(remove_symbols)
data3['level'] = data3['level'].apply(lambda x: x.replace(" ",""))
data3['keyword'] = data3['keyword'].fillna('')
data3.head()

Unnamed: 0.1,Unnamed: 0,topic,discription,keyword,Links,level,tag
0,0,Java,Java is a general-purpose computer programming...,"Java, James Gosling, website, wikipedia, docum...","website: https://oracle.com/java/, documentati...",beginnertoadvance,java is a generalpurpose computer programming ...
1,1,JavaScript,"JavaScript (), often abbreviated as JS, is a h...","JavaScript, Brendan Eich, reference, wikipedia...",reference: https://www.w3schools.com/js/js_res...,beginnertoadvance,javascript often abbreviated as js is a highl...
2,2,C,"C (, as in the letter c) is a general-purpose,...","C, Dennis Ritchie, reference, wikipedia, docum...",reference: http://www.c4learn.com/c-programmin...,beginnertoadvance,c as in the letter c is a generalpurpose impe...
3,3,Python,Python is a widely used high-level programming...,"Python, Guido van Rossum, website, reference, ...","website: https://www.python.org/, reference: h...",beginnertoadvance,python is a widely used highlevel programming ...
4,4,SQL,SQL ( ( listen) ESS-kew-EL or ( listen) SEE-k...,"SQL, Donald D. Chamberlin and Raymond F. Boyce...",documentation: https://docs.data.world/documen...,beginnertoadvance,sql listen esskewel or listen seekwəl or ...


In [13]:
data3['tag'][0]

'java is a generalpurpose computer programming language that is concurrent classbased objectoriented and specifically designed to have as few implementation dependencies as possible it is intended to let application developers write once run anywhere wora meaning that compiled java code can run on all platforms that support java without the need for recompilation java applications are typically compiled to bytecode that can run on any java virtual machine jvm regardless of computer architecture as of 2016 java is one of the most popular programming languages in use particularly for clientserver web applications with a reported 9 million developers java was originally developed by james gosling at sun microsystems which has since been acquired by oracle corporation and released in 1995 as a core component of sun microsystems java platform the language derives much of its syntax from c and c but it has fewer lowlevel facilities than either of them the original and reference implementatio

# Convert tag columns into vector 

In [14]:
cv = CountVectorizer( max_features = 5000, stop_words = 'english')
vector = cv.fit_transform(data3['tag']).toarray()

In [15]:
vector[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [16]:
cv.get_feature_names_out()

array(['10', '100', '1000', ..., 'λprolog', 'λx', 'μc'], dtype=object)

# Stemming And Lemmitization Process

In [18]:
ps = PorterStemmer()

In [30]:
def preprocess_query(query):
    
    # Lowercase the query
    cleaned_query = query.lower()

    # Remove punctuation (adjust as needed)
    import string
    punctuation = string.punctuation
    cleaned_query = ''.join([char for char in cleaned_query if char not in punctuation])

    # Remove stop words (optional, replace with your stop word list)
    stop_words = ["the", "a", "is", "in", "of"]
    cleaned_query = ' '.join([word for word in cleaned_query.split() if word not in stop_words])

    # Stemming
    ps = PorterStemmer()
    cleaned_query = ' '.join([ps.stem(word) for word in cleaned_query.split()])

    # Lemmatization
    wnl = WordNetLemmatizer()
    cleaned_query = ' '.join([wnl.lemmatize(word) for word in cleaned_query.split()])

    return cleaned_query

In [32]:
preprocess_query('talked')

'talk'

In [31]:
preprocess_query('java james gosling website wikipedia document united states beginnertoadvance')

'java jame gosl websit wikipedia document unit state beginnertoadv'

In [23]:
data3['tag'].apply(stem)   # apply on tag columns 

0       java is a generalpurpos comput program languag...
1       javascript often abbrevi as js is a highlevel ...
2       c as in the letter c is a generalpurpos imper ...
3       python is a wide use highlevel program languag...
4       sql listen esskewel or listen seekwəl or skwee...
                              ...                        
3715    understandingtheprofessionaldataengineercertif...
3716    atourofgooglecloudhandsonlab machinelearningen...
3717    introductiontoaiandmachinelearningongoogleclou...
3718    introductiontoaiandmachinelearningongoogleclou...
3719    aifound machinelearningengineerlearningpathweb...
Name: tag, Length: 3720, dtype: object

# Find Similarity score for finding most related topic from dataset

In [24]:
similar = cosine_similarity(vector)

In [27]:
sorted(list(enumerate(similar[1])),reverse = True, key = lambda x: x[1])[0:5]

[(1, 0.9999999999999998),
 (40, 0.4543441112511213),
 (350, 0.445852828483904),
 (134, 0.4049985302736412),
 (1485, 0.3754717312648463)]

In [29]:
summarizer = pipeline("summarization", model="facebook/bart-base")
text_generator = pipeline("text-generation", model="gpt2")

In [34]:
documents = []
for index, row in data3.iterrows():
    topic_description = preprocess_query(row["topic"]) 
    keywords = preprocess_query(row["keyword"])  
    combined_text = f"{topic_description} {keywords}"  # Combine for TF-IDF
    documents.append(combined_text)


In [35]:
# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the documents
document_vectors = vectorizer.fit_transform(documents)

def recommend_from_dataset(query):
    
    cleaned_query = preprocess_query(query)
    query_vector = vectorizer.transform([cleaned_query])

    # Calculate cosine similarity between query and documents
    cosine_similarities = cosine_similarity(query_vector, document_vectors)
    similarity_scores = cosine_similarities.flatten()

    # Sort documents based on similarity scores
    sorted_results = sorted(zip(similarity_scores, data3.index, range(len(documents))), reverse=True)

    # Return top N recommendations with scores, topic names, and links (if available)
    top_n_results = sorted_results[:5]  
    recommendations = []
    for result in top_n_results:
        score = result[0]
        document_id = result[1]
        topic_name = data3.loc[document_id, "topic"]  
        link = data3.loc[document_id, "Links"] if "Links" in data3.columns else "No link available" 
        if score >= 0.3:
            recommendations.append({"topic_name": topic_name, "link": link, "score": score})
    return recommendations


In [36]:
def fine_tune_model(model_name, train_dataset, validation_dataset, epochs=3):
    # Load model and tokenizer
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Define training arguments (adjust parameters as needed)
    training_args = TrainingArguments(
        output_dir="./results",  # Adjust output directory
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=epochs,
        save_steps=10_000,
    )

    # Create a Trainer instance for fine-tuning
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
        tokenizer=tokenizer,
    )

    # Train the model
    trainer.train()

    return model

In [39]:
train_dataset =   # Prepare your training dataset
validation_dataset = ...  # Prepare your validation dataset

# Fine-tune the model (replace model name if needed)
fine_tuned_model = fine_tune_model("facebook/bart-base", train_dataset, validation_dataset)

# Update summarization pipeline with the fine-tuned model
summarizer1 = pipeline("text-generation", model=fine_tuned_model, tokenizer=fine_tuned_model.tokenizer)


In [45]:
def summarize_and_generate(user_query, recommendations):
    
    # Summarize the user query
    query_summary = summarizer(user_query, max_length=100, truncation=True)[0]["summary_text"]

    # Generate creative text related to the query
    generated_text = text_generator(f"Exploring the concept of {user_query}", max_length=100, num_return_sequences=1)[0]["generated_text"]

    # Extract related links with scores
    related_links = []
    for recommendation in recommendations:
        related_links.append({"topic": recommendation["topic_name"], "link": recommendation["link"], "score": recommendation["score"]})

    return {
        "query_summary": query_summary.strip(),
        "generated_text": generated_text.strip(),
        "related_links": related_links
      }

In [46]:
user_query = "java by james goslin"
recommendations = recommend_from_dataset(user_query)

# Get the summary, generated text, and related links
results = summarize_and_generate(user_query, recommendations)

print(f"Query Summary: {results['query_summary']}")
print(f"Creative Text: {results['generated_text']}")
print("Some Related Links for your query:")
for link in results["related_links"]:
    print(f"- {link['topic']}:\n {link['link']} : \n Score: {link['score']}") #(Score: {link['score']})

Your max_length is set to 100, but you input_length is only 9. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Query Summary: java by james goslinjames groslin
Creative Text: Exploring the concept of java by james goslin is an impressive effort at the best of times and I'm very impressed by how well this was done. The code looks quite simple for simple purposes — there are only two basic methods, call() and destroy(). These two methods are used by most of the java libraries, so any Java that relies on call() or destroy() should use a proper method of your choice as well. Also, the code uses a single method, so that
Some Related Links for your query:
- Java:
 website: https://oracle.com/java/, documentation: https://docs.oracle.com/en/java/, wikipedia: https://en.wikipedia.org/wiki/Java_(programming_language) : 
 Score: 0.625462748622542
- Java Properties:
 wikipedia: https://en.wikipedia.org/wiki/.properties : 
 Score: 0.3952596829701199
- Java Bytecode:
 documentation: https://docs.oracle.com/javase/specs/jvms/se7/html/, wikipedia: https://en.wikipedia.org/wiki/Java_bytecode : 
 Score: 0.38255