Spaces:
Runtime error
Runtime error
| import re | |
def parse_topics_to_dict(text):
    """Parse a numbered outline into a mapping of topic -> sub-topics.

    Lines shaped like ``"<digits>. <title>"`` open a new topic and lines
    shaped like ``"* <item>"`` are appended to the most recently opened
    topic. Each line is stripped before matching. Any other line is ignored,
    and bullet lines that appear before the first topic are dropped.

    :param text: Outline text with one entry per line.
    :return: Dict mapping each topic title to the list of its sub-topic
        strings. A title that appears again starts over with an empty list.
    """
    heading_re = re.compile(r"^\d+\.\s+(.*)$")
    bullet_re = re.compile(r"^\*\s+(.*)$")

    topics = {}
    active_title = None
    for raw_line in text.strip().split("\n"):
        entry = raw_line.strip()

        heading = heading_re.match(entry)
        if heading:
            active_title = heading.group(1)
            topics[active_title] = []
            continue

        bullet = bullet_re.match(entry)
        if bullet and active_title:
            topics[active_title].append(bullet.group(1))

    # The parsed mapping is echoed to stdout before it is returned.
    print(topics)
    return topics
def remove_all_sources(text):
    """Remove every ``Source <n>: ...`` section from *text*.

    A section starts at a ``Source <n>:`` label and runs up to the next such
    label or the end of the text. Labels are matched case-insensitively, so
    lowercase ``source 1:`` labels (the casing produced by
    ``renumber_sources``) are removed as well.

    :param text: Text that may contain source sections.
    :return: The text with all source sections removed and surrounding
        whitespace stripped.
    """
    # The look-behind keeps case-insensitive matching from firing inside a
    # longer word such as "Resource 3:".
    label = r"(?<![A-Za-z])Source \d+:"
    pattern = rf"{label}(.*?)(?={label}|$)"
    # re.DOTALL lets '.' span newlines so a section may cover several lines.
    updated_text = re.sub(pattern, "", text, flags=re.DOTALL | re.IGNORECASE)
    return updated_text.strip()
def clean_text(text):
    """Normalise whitespace and light punctuation in *text*.

    The substitutions below run in order. Because the last one collapses
    every remaining whitespace run (newlines included) into a single space,
    the returned string is always a single line.

    :param text: Raw text to tidy up.
    :return: The cleaned, stripped, single-line text.
    """
    passes = (
        # Collapse runs of two or more whitespace characters.
        (r"\s{2,}", " "),
        # Turn newlines into spaces unless a digit follows (numbered points).
        (r"\n(?!\s*\d)", " "),
        # Drop semicolons that are glued to the following character.
        (r";(?=\S)", ""),
        # Put exactly one space after a comma/semicolon and none before it.
        (r"\s*([,;])\s*", r"\1 "),
        # Final normalisation: any whitespace run becomes one space.
        (r"\s+", " "),
    )
    for pattern, replacement in passes:
        text = re.sub(pattern, replacement, text)
    return text.strip()
def update_response(text):
    """Renumber bracketed references so they run 1..k without gaps.

    Every ``[n]`` reference in *text* is collected, the distinct numbers are
    sorted, and each is mapped to its 1-based rank. For example ``[3]`` and
    ``[5]`` become ``[1]`` and ``[2]``.

    All references are rewritten in one pass, so a freshly written number is
    never picked up by a later replacement (``"[0] [1]"`` becomes
    ``"[1] [2]"``). Zero-padded references such as ``[03]`` are renumbered
    as well.

    :param text: Text containing references like ``[1]``, ``[3]``, ``[5]``.
    :return: The text with references renumbered; unchanged if it has none.
    """
    reference_re = re.compile(r"\[(\d+)\]")

    # Distinct reference numbers in ascending order.
    ref_numbers = sorted({int(digits) for digits in reference_re.findall(text)})
    # Old number -> new sequential number, starting at 1.
    ref_mapping = {old: new for new, old in enumerate(ref_numbers, start=1)}

    def renumber(match):
        return f"[{ref_mapping[int(match.group(1))]}]"

    return reference_re.sub(renumber, text)
def renumber_sources(source_list):
    """Relabel sources sequentially as ``source 1: ...``, ``source 2: ...``.

    Everything up to and including the first ``": "`` of an entry is treated
    as its old label and replaced. An entry that contains no ``": "`` is kept
    whole as the content instead of raising ``IndexError``.

    :param source_list: List of source strings, typically ``"<label>: <content>"``.
    :return: New list with sequential labels in the original order.
    """
    # NOTE(review): the emitted label is lowercase "source", while other
    # helpers in this file match capitalised "Source <n>" - confirm which
    # casing the consumers expect before changing it.
    new_sources = []
    for position, source in enumerate(source_list, start=1):
        _, separator, content = source.partition(": ")
        if not separator:
            # No label separator present: keep the entire entry as content.
            content = source
        new_sources.append(f"source {position}: {content}")
    return new_sources
def sort_and_renumber_sources(source_list):
    """
    Sort sources by their ``Source <n>`` number and renumber them from 1.

    Entries are ordered by the first ``Source <n>`` found in each string;
    entries without such a label sort last. Only that first occurrence is
    rewritten, so mentions of other sources inside an entry's content
    (e.g. "see Source 7") are left untouched.

    :param source_list: List of strings containing source information.
    :return: New sorted and renumbered list of sources; the input list is
        not modified.
    """
    label_re = re.compile(r"Source (\d+)")

    def extract_source_number(source):
        # Unlabelled entries get +inf so they sort after all numbered ones.
        match = label_re.search(source)
        return int(match.group(1)) if match else float("inf")

    ordered = sorted(source_list, key=extract_source_number)
    # count=1 limits the rewrite to the label that was used as the sort key.
    return [
        label_re.sub(f"Source {idx}", source, count=1)
        for idx, source in enumerate(ordered, 1)
    ]
def seperate_to_list(text):
    """Break *text* into a flat list of trimmed, non-empty fragments.

    The text is processed line by line. ``Source <n>:`` labels are removed
    from each line, then the line is split around every run of uppercase
    letters and whitespace that ends in ``.``, ``!`` or ``?``. Because the
    split pattern is a capturing group, those runs are kept as fragments of
    their own rather than discarded.

    :param text: Multi-line text, possibly containing source labels.
    :return: List of stripped fragments in reading order; empty fragments
        are omitted.
    """
    final_output = []
    for raw_line in text.split("\n"):
        without_label = re.sub(r"Source \d+\:", "", raw_line)
        for fragment in re.split(r"([A-Z\s]+[.!?])", without_label):
            trimmed = fragment.strip()
            if trimmed:
                final_output.append(trimmed)
    return final_output
def join_list(items):
    """Join *items* into a phrase such as ``"a, b and c"``.

    :param items: Sequence of items to join.
    :return: ``""`` for an empty sequence, the sole item itself for one
        element, ``"a and b"`` for two, and otherwise a comma-separated list
        with ``" and "`` before the final item.
    """
    if not items:
        return ""

    count = len(items)
    if count == 1:
        return items[0]
    if count == 2:
        first, second = items[0], items[1]
        return f"{first} and {second}"

    leading = ", ".join(items[:-1])
    return leading + " and " + items[-1]
def redesign_structure_message(message, metadata):
    """
    Swap numeric citations in *message* for italicised titles.

    Each ``[n]`` is replaced by ``[*<title>*]`` where the title is read from
    ``metadata[n - 1]["title"]``. Citations whose number falls outside
    ``1..len(metadata)`` are left exactly as written, and the message is
    returned untouched when *metadata* is empty or missing.
    """
    if not metadata:
        return message

    entry_count = len(metadata)

    def replace_citation(match):
        index = int(match.group(1)) - 1
        if index < 0 or index >= entry_count:
            # Out-of-range citation: keep the original "[n]" text.
            return match.group(0)
        title = metadata[index]["title"]
        return f"[*{title}*]"

    return re.sub(r'\[(\d+)\]', replace_citation, message)
def extract_sorted_page_numbers(content):
    """Collect the page numbers referenced in *content*.

    Page references look like ``[p-166]``. The numbers are de-duplicated and
    returned in ascending order.

    :param content: Text that may contain ``[p-<number>]`` markers.
    :return: Sorted list of distinct page numbers as integers.
    """
    unique_pages = {
        int(digits) for digits in re.findall(r'\[p-(\d+)\]', content)
    }
    return sorted(unique_pages)
def filter_metadata_by_pages(metadata, pages):
    """Build a single combined metadata record for the given pages.

    The bibliographic fields are copied from the first metadata entry and
    *pages* (e.g. ``[163, 165, 166]``) is stored under ``"page_number"``.

    :param metadata: List of metadata dicts; only ``metadata[0]`` is read.
        Presumably all entries describe the same book - verify against the
        caller.
    :param pages: Page numbers to attach to the combined record.
    :return: A one-element list holding the combined record, or ``[]`` when
        either argument is empty.
    """
    if not (pages and metadata):
        return []

    first_entry = metadata[0]
    shared_fields = ("title", "author", "category", "year", "publisher", "reference")

    combined = {"page_number": pages}
    for field in shared_fields:
        combined[field] = first_entry[field]
    return [combined]