| from model import AIModel | |
| import random | |
| import os | |
| import requests | |
| from bs4 import BeautifulSoup | |
def fetch_wikipedia_text(title, timeout=10):
    """Download one English Wikipedia article and return its plain text.

    The article HTML is requested from the Wikipedia REST endpoint
    ``/api/rest_v1/page/html/{title}`` and reduced to text with
    BeautifulSoup's ``get_text()``.

    Args:
        title: Article title as it appears in the URL
            (e.g. ``"Machine_learning"``). It is interpolated into the URL
            as-is.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled connection cannot block forever.

    Returns:
        The text of the page, or ``""`` when the request fails (network
        error, timeout, or any status code other than 200).
    """
    url = f"https://en.wikipedia.org/api/rest_v1/page/html/{title}"
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Treat transport failures the same way as a bad HTTP status:
        # report and return an empty string so the caller can carry on.
        print(f"Failed to fetch '{title}': {exc}")
        return ""

    if response.status_code != 200:
        print(f"Failed to fetch '{title}'")
        return ""

    soup = BeautifulSoup(response.content, "html.parser")
    return soup.get_text()
def collect_wikipedia_data(titles, min_chars=25000):
    """Fetch articles in order until at least ``min_chars`` characters exist.

    Each article is emitted as a section of the form
    ``"\\n\\n=== <title with spaces> ===\\n\\n<text>"``. Fetching stops after
    the first section that brings the combined length to ``min_chars`` or
    more; if the titles run out first, everything gathered is returned.

    Args:
        titles: Iterable of article titles (underscores are shown as spaces
            in the section header).
        min_chars: Combined length at which fetching stops.

    Returns:
        All collected sections concatenated into one string.
    """
    sections = []
    total_chars = 0

    for title in titles:
        print(f"Fetching: {title}")
        body = fetch_wikipedia_text(title)
        section = f"\n\n=== {title.replace('_', ' ')} ===\n\n{body}"
        sections.append(section)
        total_chars += len(section)
        if total_chars >= min_chars:
            break

    return "".join(sections)
# Path of the cached text corpus, relative to the current working directory.
file_path = 'agailab/data.txt'

# Build the corpus from Wikipedia only when no cached copy exists yet.
if not os.path.exists(file_path):
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    article_titles = [
        "Artificial_intelligence", "Machine_learning", "Deep_learning", "Neural_network",
        "Natural_language_processing", "Computer_vision", "Reinforcement_learning",
        "Supervised_learning", "Unsupervised_learning", "Turing_test", "ChatGPT",
        "Large_language_model", "OpenAI", "Automation", "Data_science", "AI_ethics",
        "Robotics", "Cognitive_computing", "Computer_science", "Algorithm",
        "Big_data", "Pattern_recognition", "Knowledge_representation", "Expert_system", "Intelligent_agent"
    ]
    text_data = collect_wikipedia_data(article_titles)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(text_data)
    print(f"Saved {len(text_data)} characters of Wikipedia text to {file_path}")
else:
    print("File already exists.")

# Read the file a single time; the word list is derived from the same text
# instead of opening and reading the file again.
with open(file_path, 'r', encoding='utf-8') as f:
    corpus = f.read()
words = corpus.split()
class AgLab:
    """Convenience wrapper around ``AIModel`` plus corpus-based text helpers.

    Attributes:
        system_prompt: Text prepended to prompts sent by ``AskAgLabLLM`` and
            ``AskAgLabLLMWithContext``.
        corpus: Full contents of the corpus file at ``file_path``.
        words: ``corpus`` split on whitespace; used by the random-word and
            next-word helpers.
    """

    # Pool sampled by RandomEmojiSequence (faces, animals, plants, sky and
    # weather). Kept at class level so it is built once, not on every call.
    _EMOJIS = (
        "😀", "😁", "😂", "😃", "😅", "😆", "😉", "😊", "😋", "😎",
        "😍", "😘", "😜", "😝", "😏", "😒", "😈", "👿", "👻", "💀",
        "🤖", "👽", "👾", "🎃", "😺", "😸", "😹", "😻", "😼", "😽",
        "🙀", "😿", "😾", "🐶", "🐱", "🐭", "🐹", "🐰", "🦊", "🐻",
        "🐼", "🐨", "🐯", "🦁", "🐮", "🐷", "🐽", "🐸", "🐵", "🙈",
        "🙉", "🙊", "🐒", "🐔", "🐧", "🐦", "🐤", "🐣", "🐥", "🦆",
        "🦅", "🦉", "🦇", "🐺", "🐗", "🐴", "🦄", "🐝", "🐛", "🦋",
        "🐌", "🐞", "🐜", "🪲", "🪳", "🕷", "🕸", "🐢", "🐍", "🦎",
        "🦖", "🦕", "🦑", "🦐", "🦀", "🐙", "🪸", "🐠", "🐟", "🐡",
        "🦈", "🐬", "🐳", "🐋", "🦭", "🐊", "🦧", "🦍", "🦣", "🐘",
        "🦛", "🦏", "🐪", "🐫", "🦒", "🦘", "🦬", "🐃", "🐂", "🐄",
        "🐎", "🐖", "🐏", "🐑", "🦙", "🐐", "🦌", "🐕", "🐩", "🦮",
        "🐕‍🦺", "🐈", "🐈‍⬛", "🪶", "🐓", "🦃", "🦤", "🦚", "🦜",
        "🦢", "🦩", "🕊", "🐇", "🦝", "🦨", "🦡", "🦫", "🦦", "🦥",
        "🐁", "🐀", "🐿", "🦔", "🐾", "🐉", "🐲", "🌵", "🎄", "🌲",
        "🌳", "🌴", "🪵", "🌱", "🌿", "☘", "🍀", "🎍", "🪴", "🎋",
        "🍃", "🍂", "🍁", "🍄", "🐚", "🪨", "🌾", "💐", "🌷", "🌹",
        "🥀", "🌺", "🌸", "🌼", "🌻", "🌞", "🌝", "🌛", "🌜", "🌚",
        "🌕", "🌖", "🌗", "🌘", "🌑", "🌒", "🌓", "🌔", "🌙", "🌎",
        "🌍", "🌏", "🪐", "💫", "⭐", "🌟", "✨", "⚡", "☄", "💥",
        "🔥", "🌪", "🌈", "☀", "🌤", "⛅", "🌥", "☁", "🌦", "🌧",
        "⛈", "🌩", "🌨", "❄", "☃", "⛄", "🌬", "💨", "💧", "💦",
        "☔", "☂", "🌊", "🌫",
    )

    def __init__(self, system_prompt: str = ""):
        """Store the system prompt and load the corpus from ``file_path``."""
        self.system_prompt = system_prompt
        with open(file_path, 'r', encoding='utf-8') as f:
            self.corpus = f.read()
        self.words = self.corpus.split()

    def __raw_ask(self, prompt: str) -> str:
        """Internal: send ``prompt`` to a fresh ``AIModel`` and return its reply.

        The prompt is passed through unchanged to ``AIModel().AskAI``.
        """
        model = AIModel()
        response = model.AskAI(prompt)
        return response

    def AskAgLabLLM(self, prompt: str) -> str:
        """Ask the model a question, prefixed by the system prompt.

        Returns whatever ``AIModel.AskAI`` returns for
        ``system_prompt + " " + prompt``.
        """
        full_prompt = self.system_prompt + " " + prompt
        return self.__raw_ask(full_prompt)

    def AskAgLabLLMWithContext(self, prompt: str, context: str) -> str:
        """Ask the model a question together with prior chat context.

        Args:
            prompt: The user's latest message.
            context: Text describing the conversation so far; inserted
                between the system prompt and the user's message.
        """
        full_prompt = (
            self.system_prompt +
            " Context of the chat so far: " + context +
            " User said: " + prompt
        )
        return self.__raw_ask(full_prompt)

    def SummarizeText(self, text: str) -> str:
        """Ask the model for a summary of ``text``.

        The system prompt is not included in this request.
        """
        full_prompt = " Summarize the following text: " + text + " send only the summary of the previous text, do not reply, send only the summary."
        return self.__raw_ask(full_prompt)

    def TurnToBulletPoints(self, text: str) -> str:
        """Ask the model to rewrite ``text`` as bullet points.

        The system prompt is not included in this request.
        """
        full_prompt = " Turn the following text into bullet points: " + text + " send only the bullet points of the previous text, do not reply, send only the bullet points."
        return self.__raw_ask(full_prompt)

    def RandomEmojiSequence(self, length: int = 5) -> str:
        """Return ``length`` emojis drawn at random (with repetition).

        A ``length`` of zero or less yields an empty string.
        """
        return ''.join(random.choice(self._EMOJIS) for _ in range(length))

    def RandomTextSequence(self, length: int = 5) -> str:
        """Return ``length`` random corpus words joined by single spaces.

        Words are drawn with repetition from this instance's ``words``.
        An empty word list yields an empty string.
        """
        if not self.words:
            return ""
        return ' '.join(random.choice(self.words) for _ in range(length))

    def PredictNextWord(self, text_string: str) -> str:
        """Predict the word that follows ``text_string`` using the corpus.

        The input is split into words and the longest run of trailing words
        that occurs in ``self.words`` *and* is followed by another word is
        located; the word after its first occurrence is returned. Matching is
        done on whole words, so it never matches inside a longer word and is
        unaffected by newlines or repeated spaces in the corpus.

        Returns:
            ``""`` when ``text_string`` contains no words, the predicted word
            when a match exists, otherwise ``"<|endoftext|>"``.
        """
        text_words = text_string.split()
        if not text_words:
            return ""

        corpus_words = self.words
        # A match needs one more corpus word after it, so a suffix can be at
        # most len(corpus_words) - 1 words long.
        longest = min(len(text_words), len(corpus_words) - 1)

        for size in range(longest, 0, -1):
            suffix = text_words[-size:]
            first = suffix[0]
            for start in range(len(corpus_words) - size):
                # Cheap first-word check before comparing the whole slice.
                if corpus_words[start] == first and corpus_words[start:start + size] == suffix:
                    return corpus_words[start + size]

        return "<|endoftext|>"

    def GenerateLocalText(self, text_string: str, length: int = 10) -> str:
        """Extend ``text_string`` by up to ``length`` predicted words.

        ``PredictNextWord`` is called repeatedly on the growing text, each
        result being appended after a single space. Generation stops early
        when no prediction is available (``"<|endoftext|>"`` or an empty
        result, which happens for empty input).

        Returns:
            The original text followed by the generated words.
        """
        for _ in range(length):
            next_word = self.PredictNextWord(text_string)
            # An empty prediction would only pad the text with spaces.
            if not next_word or next_word == "<|endoftext|>":
                break
            text_string += " " + next_word
        return text_string
| if __name__ == "__main__": | |
| aglab = AgLab("You are a helpful assistant called ag lab llm.") | |
| print(aglab.AskAgLabLLM("What is the capital of France, also what is your name?")) | |
| print(aglab.RandomEmojiSequence(10)) | |
| print(aglab.SummarizeText("The quick brown fox jumps over the lazy dog.")) | |
| print(aglab.TurnToBulletPoints("The quick brown fox jumps over the lazy dog.")) | |
| print(aglab.RandomTextSequence(10)) | |
| print(aglab.PredictNextWord("Artificial")) | |
| print(aglab.GenerateLocalText("Artificial", 10)) | |