Spaces:
Runtime error
Runtime error
| import requests | |
| import re | |
| from html import unescape | |
| from sentence_transformers import SentenceTransformer | |
| import chromadb | |
| import yaml | |
# Load the project configuration once at import time. Other code in this
# file reads keys such as 'embedding_model' and 'collection_name' from
# config_data.
try:
    # YAML files are UTF-8 by convention; do not depend on the platform default.
    with open("./config.yaml", "r", encoding="utf-8") as file:
        config_data = yaml.safe_load(file)
except FileNotFoundError as e:
    # Same exception type and message as before, but keep the original cause
    # attached so the traceback shows what actually went wrong.
    raise Exception("Not able to find the file ./config.yaml") from e
except Exception as e:
    # The file exists but could not be read or parsed (permissions, invalid
    # YAML, bad encoding, ...). Report that instead of claiming it is missing.
    raise Exception(f"Not able to load the file ./config.yaml: {e}") from e
def fetch_wordpress_data(site_url, timeout=10):
    """
    Fetches posts from a WordPress site using its REST API.

    Sends a GET request to ``<site_url>/wp-json/wp/v2/posts`` and returns the
    decoded JSON body. Only this single request is made; no further pages are
    requested.

    Args:
        site_url (str): The base URL of the WordPress site. A trailing slash
            is tolerated.
        timeout (float | tuple): Seconds to wait for the server before giving
            up, passed straight to ``requests.get``. Defaults to 10.

    Returns:
        The decoded JSON response (the caller in this file iterates it as a
        list of post dicts), or None if the request failed or the body was
        not valid JSON.
    """
    # Avoid building ".../​/wp-json/..." when the caller passes "https://site/".
    base_url = site_url.rstrip("/")
    api_url = f"{base_url}/wp-json/wp/v2/posts"
    try:
        # Without a timeout, requests waits indefinitely on an unresponsive host.
        response = requests.get(api_url, timeout=timeout)
        response.raise_for_status()  # Raise exception for unsuccessful responses
        # Extract and return JSON data from response
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions whose
        # JSON error is not a RequestException subclass.
        print("Error fetching WordPress data:", e)
        return None
# Matches one HTML tag. "[^>]" (unlike ".") also matches newlines, so tags
# whose attributes wrap onto several lines are removed too.
_HTML_TAG_RE = re.compile(r"<[^>]*>")
# Matches a run of one or more newline characters.
_NEWLINE_RUN_RE = re.compile(r"\n+")


def preprocess_text(text):
    """
    Preprocesses text by removing HTML tags, decoding HTML entities, and
    collapsing consecutive newlines.

    Args:
        text (str): The text to be preprocessed.

    Returns:
        str: The text with tags removed, entities decoded, runs of newlines
        reduced to a single newline, and leading/trailing whitespace stripped.
    """
    # Remove HTML tags first, so that escaped markup such as "&lt;b&gt;"
    # survives as literal text once it is decoded below.
    clean_text = _HTML_TAG_RE.sub("", text)
    # Decode HTML entities (e.g. "&amp;" -> "&")
    clean_text = unescape(clean_text)
    # Collapse consecutive newline characters into one
    clean_text = _NEWLINE_RUN_RE.sub("\n", clean_text)
    # Trim leading and trailing whitespace (inner spacing is left untouched)
    return clean_text.strip()
# Loaded models keyed by model name, so each model is constructed only once
# per process instead of once per generate_embeddings() call.
_EMBEDDING_MODELS = {}


def _get_embedding_model(model_name):
    """Return the SentenceTransformer for model_name, loading it on first use."""
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _EMBEDDING_MODELS[model_name] = model
    return model


def generate_embeddings(text):
    """
    Generates embeddings using the pre-trained model named in the config.

    The model name is read from ``config_data['embedding_model']``. The model
    object is cached after the first call, because constructing it is costly
    and this function is called once per post.

    Args:
        text (str): The input text.

    Returns:
        list: The result of ``model.encode(text)`` converted with ``.tolist()``.
    """
    # Reuse the already-loaded model when available
    model = _get_embedding_model(config_data['embedding_model'])
    # Generate embeddings for input text
    embeddings = model.encode(text)
    return embeddings.tolist()
def extract_text(post):
    """
    Returns the cleaned-up body text of a WordPress post.

    Args:
        post (dict): The WordPress post data; its rendered body is read from
            ``post['content']['rendered']``.

    Returns:
        str: The rendered content after being run through preprocess_text.
    """
    # Pull out the rendered body, then hand it to the shared text cleaner.
    rendered_body = post['content']['rendered']
    cleaned_body = preprocess_text(rendered_body)
    return cleaned_body
def create_vector_store_and_add_posts(wordpress_data):
    """
    Creates a vector store in Chroma database and adds WordPress posts to it.

    Opens (or creates) a persistent Chroma database at ``./posts_db``, gets or
    creates the collection named by ``config_data['collection_name']`` with
    cosine distance, and upserts one entry per post: the post id as a string,
    the cleaned content, its embedding, and title/date/modified metadata.

    Args:
        wordpress_data (list | None): List of WordPress post data. None or an
            empty list is accepted; in that case nothing is upserted.

    Returns:
        tuple: A tuple containing the Chroma client and collection objects.
    """
    client = chromadb.PersistentClient("./posts_db")
    collection = client.get_or_create_collection(
        name=config_data['collection_name'],
        metadata={"hnsw:space": "cosine"},
    )

    ids = []
    documents = []
    metadatas = []
    embeddings = []
    # fetch_wordpress_data() returns None on failure; treat that as "no posts"
    # instead of crashing with a TypeError while iterating.
    for post in wordpress_data or []:
        cleaned_content = extract_text(post)
        ids.append(str(post['id']))
        documents.append(cleaned_content)
        embeddings.append(generate_embeddings(cleaned_content))
        metadatas.append({
            'title': post['title']['rendered'],
            'date': post['date'],
            'modified': post['modified'],
        })

    # Skip the write when there is nothing to store, rather than sending an
    # empty batch. NOTE(review): Chroma is expected to reject empty id lists;
    # confirm against the installed version.
    if ids:
        collection.upsert(
            ids=ids,
            documents=documents,
            metadatas=metadatas,
            embeddings=embeddings,
        )
    return client, collection