Spaces:

Multimedika
/

Bot_Development

Runtime error

App Files Files Community

Bot_Development / core /parser.py

dsmultimedika

fix : update code

0767396 about 1 year ago

raw

history blame contribute delete

6.45 kB

	import re


	def parse_topics_to_dict(text):
	    """Parse a numbered outline into a mapping of topic -> sub-topics.

	    A line of the form ``"<number>. <title>"`` opens a new topic. A line of
	    the form ``"* <item>"`` is appended to the most recently opened topic.
	    Every line is stripped before matching, so the indentation of bullets
	    does not matter. Any other line is ignored, and bullets that appear
	    before the first topic are dropped.

	    :param text: Outline text, one entry per line.
	    :return: Dict mapping each topic title to the list of its sub-topic
	        strings, in order of appearance.
	    """
	    topic_pattern = re.compile(r"^\d+\.\s+(.*)$")
	    # A bullet is a literal asterisk, whitespace, then the item text.
	    sub_topic_pattern = re.compile(r"^\*\s+(.*)$")

	    topics = {}
	    current_topic = None

	    for raw_line in text.strip().split("\n"):
	        line = raw_line.strip()

	        topic_match = topic_pattern.match(line)
	        if topic_match:
	            current_topic = topic_match.group(1)
	            topics[current_topic] = []
	            continue

	        sub_topic_match = sub_topic_pattern.match(line)
	        if sub_topic_match and current_topic is not None:
	            topics[current_topic].append(sub_topic_match.group(1))

	    return topics


	def remove_all_sources(text):
	    """Strip every ``Source <n>: ...`` segment from *text*.

	    A segment starts at a ``Source <n>:`` marker and runs up to the next
	    marker or the end of the text, whichever comes first. Matching is
	    case-sensitive.

	    :param text: Text that may contain ``Source <n>:`` segments.
	    :return: The text with all segments removed and leading/trailing
	        whitespace stripped.
	    """
	    # The lookahead stops a segment at the next marker OR at end of input.
	    # The "|" must stay unescaped: an escaped pipe would demand a literal
	    # "|" character and the pattern would never match real text.
	    pattern = r"Source \d+:.*?(?=Source \d+:|$)"

	    # re.DOTALL lets '.' cross newlines so multi-line segments go as a whole.
	    updated_text = re.sub(pattern, "", text, flags=re.DOTALL)

	    return updated_text.strip()


	def clean_text(text):
	    """Flatten *text* into a single whitespace-normalized line.

	    The substitutions run in a fixed order because each one operates on the
	    output of the previous one. Note that the final pass collapses every
	    remaining whitespace run, newlines included, into one space.

	    :param text: Raw text to tidy up.
	    :return: The cleaned text without leading or trailing whitespace.
	    """
	    substitutions = (
	        # Collapse runs of two or more whitespace characters.
	        (r"\s{2,}", " "),
	        # Turn a newline into a space unless a digit follows it
	        # (optionally after more whitespace).
	        (r"\n(?!\s*\d)", " "),
	        # Drop a semicolon that is directly followed by a non-space character.
	        (r";(?=\S)", ""),
	        # Remove the whitespace before a comma/semicolon, keep one space after.
	        (r"\s([,;])\s", r"\1 "),
	        # Squash whatever whitespace is left into single spaces.
	        (r"\s+", " "),
	    )

	    cleaned = text
	    for pattern, replacement in substitutions:
	        cleaned = re.sub(pattern, replacement, cleaned)

	    return cleaned.strip()


	def update_response(text):
	    """Renumber bracketed references so they run from 1 without gaps.

	    The distinct reference numbers found in *text* are ranked in ascending
	    order and each ``[n]`` is rewritten to its rank, e.g. a text that only
	    cites ``[3]`` and ``[5]`` ends up citing ``[1]`` and ``[2]``.

	    :param text: Text containing references such as ``[1]``, ``[3]``.
	    :return: The text with the references renumbered.
	    """
	    citation = re.compile(r"\[\d+\]")

	    # Distinct cited numbers, smallest first.
	    distinct_numbers = sorted({int(token[1:-1]) for token in citation.findall(text)})

	    # Keyed by the literal "[n]" text: only references spelled exactly like
	    # the formatted number are rewritten, anything else is left as found.
	    renumbered = {
	        f"[{old}]": f"[{rank}]"
	        for rank, old in enumerate(distinct_numbers, start=1)
	    }

	    return citation.sub(
	        lambda found: renumbered.get(found.group(0), found.group(0)),
	        text,
	    )

	def renumber_sources(source_list):
	    """Relabel sources sequentially as ``source 1: ...``, ``source 2: ...``.

	    The text after the first ``": "`` of each entry is kept as its content.
	    An entry without a ``": "`` separator is kept whole as the content
	    rather than raising ``IndexError``.

	    :param source_list: List of source strings, typically of the form
	        ``"<label>: <content>"``.
	    :return: New list with one relabelled string per input entry, in the
	        same order.
	    """
	    new_sources = []
	    for number, source in enumerate(source_list, start=1):
	        # partition() never fails; an empty separator means ": " was absent.
	        _, separator, remainder = source.partition(": ")
	        content = remainder if separator else source
	        new_sources.append(f"source {number}: {content}")
	    return new_sources

	def sort_and_renumber_sources(source_list):
	    """
	    Sort sources by their ``Source <n>`` number and renumber them from 1.

	    Only the first ``Source <n>`` label of each entry is rewritten, so
	    mentions of other sources inside the content keep their original text.
	    Entries without any label sort last (keeping their relative order) and
	    are returned unchanged.

	    :param source_list: List of strings containing source information.
	    :return: New list of sources, sorted and renumbered.
	    """
	    label = re.compile(r"Source (\d+)")

	    def extract_source_number(source):
	        # Unlabelled entries get +inf so they end up after all numbered ones.
	        match = label.search(source)
	        return int(match.group(1)) if match else float("inf")

	    ordered = sorted(source_list, key=extract_source_number)

	    # count=1 restricts the rewrite to the leading label; without it every
	    # "Source <n>" occurring in the content would be renumbered as well.
	    return [
	        label.sub(f"Source {position}", source, count=1)
	        for position, source in enumerate(ordered, start=1)
	    ]


	def seperate_to_list(text):
	    """Break *text* into a flat list of trimmed, non-empty fragments.

	    The text is processed line by line (split on ``"\\n"``). In each line
	    every ``Source <n>:`` marker is removed, then the line is split around
	    runs of uppercase letters/whitespace that end in ``.``, ``!`` or ``?``.
	    Because the split pattern is a capturing group, those runs are kept as
	    fragments of their own.

	    :param text: Text to split.
	    :return: List of fragments in order of appearance, each stripped of
	        surrounding whitespace; empty fragments are omitted.
	    """
	    source_marker = re.compile(r"Source \d+\:")
	    capital_run = re.compile(r"([A-Z\s]+[.!?])")

	    fragments = []
	    for raw_line in text.split("\n"):
	        without_marker = source_marker.sub("", raw_line)
	        for piece in capital_run.split(without_marker):
	            trimmed = piece.strip()
	            if trimmed:
	                fragments.append(trimmed)

	    return fragments

	def join_list(items):
	    """Join *items* into a phrase of the form ``"a, b and c"``.

	    :param items: Sequence of strings to join.
	    :return: ``""`` for an empty/falsy input, the sole item for a single
	        element, ``"a and b"`` for two, and a comma-separated list with a
	        final ``" and "`` for three or more.
	    """
	    if not items:
	        return ""

	    count = len(items)
	    if count == 1:
	        return items[0]
	    if count == 2:
	        return f"{items[0]} and {items[1]}"

	    # Three or more: commas between all but the last, then " and ".
	    leading = ", ".join(items[:-1])
	    return leading + " and " + items[-1]

	def redesign_structure_message(message, metadata):
	    """
	    Swap numeric citations in *message* for bracketed titles.

	    Each ``[n]`` is replaced by ``[<title>]`` where the title is read from
	    ``metadata[n - 1]["title"]``. A citation whose number has no matching
	    metadata entry is left exactly as written.

	    :param message: Text containing citations such as ``[1]``.
	    :param metadata: Sequence of dicts, each providing a ``"title"`` key.
	    :return: The message with resolvable citations replaced, or the
	        original message when *metadata* is empty or falsy.
	    """
	    if not metadata:
	        return message

	    entry_count = len(metadata)

	    def title_for(found):
	        # Citations are 1-based; convert to a 0-based index into metadata.
	        position = int(found.group(1)) - 1
	        if position < 0 or position >= entry_count:
	            return found.group(0)
	        entry = metadata[position]
	        return f"[{entry['title']}]"

	    return re.sub(r'\[(\d+)\]', title_for, message)

	def extract_sorted_page_numbers(content):
	    """Collect the page numbers referenced in *content*.

	    Page references have the form ``[p-<number>]``, e.g. ``[p-166]``.

	    :param content: Text to scan for page references.
	    :return: Ascending list of the distinct page numbers as integers.
	    """
	    distinct_pages = {
	        int(reference.group(1))
	        for reference in re.finditer(r'\[p-(\d+)\]', content)
	    }
	    return sorted(distinct_pages)

	def filter_metadata_by_pages(metadata, pages):
	    """Build one combined metadata record carrying the given pages.

	    The bibliographic fields are copied from ``metadata[0]`` only
	    (presumably every entry describes the same book; verify against the
	    caller). *pages* is stored unchanged under ``"page_number"``.

	    :param metadata: Sequence of metadata dicts; the first one must provide
	        ``title``, ``author``, ``category``, ``year``, ``publisher`` and
	        ``reference``.
	    :param pages: Page numbers to attach, e.g. ``[163, 165, 166]``.
	    :return: A one-element list holding the combined record, or ``[]`` when
	        *pages* or *metadata* is empty or falsy.
	    """
	    if not (pages and metadata):
	        return []

	    first_entry = metadata[0]
	    shared_fields = ("title", "author", "category", "year", "publisher", "reference")

	    record = {"page_number": pages}
	    for field in shared_fields:
	        record[field] = first_entry[field]

	    return [record]