Spaces:

tokyotechlab
/

report

Sleeping

App Files Files Community

report / app /services /analysis /normalizeForm.py

3v324v23

[D] update prompt

a128e26 about 2 months ago

raw

history blame contribute delete

4.42 kB

	from app.services.llm.llm_connector import LLMConnector

	class NormalizeForm:
		"""Normalize citation markers in a report by delegating to an LLM.

		The class builds a system prompt, a user prompt containing the report and
		the citation rules, and a JSON schema with a single ``normalized_text``
		string field, then awaits ``llm.call_with_structured_output`` with them.
		Whenever the reply cannot be used, the caller's original report is
		returned unchanged.
		"""

		# The annotation is quoted so it is not evaluated when the class is defined.
		def __init__(self, llm: "LLMConnector") -> None:
			"""Store the connector that ``normalize`` awaits for the LLM call."""
			self.llm = llm

		def _system_prompt(self) -> str:
			"""Return the fixed system message describing the model's role."""
			return """You are a text processing expert specializing in citation formatting. Your sole purpose is to receive a piece of text and correct its citation style according to a strict set of rules, without altering any other part of the content. You must return only the corrected text."""

		def _prompt(self, report: str) -> str:
			"""Return the user prompt: citation rules, examples and the report.

			``report`` is interpolated verbatim between the ``---`` delimiters.
			"""
			return f"""Please correct the citation formatting in the following text.

	==================================================
	CRITICAL CITATION RULES (MUST FOLLOW):

	1. Single Citation:
	- CORRECT: [Source 1]
	- CORRECT: [Image 1]
	- INCORRECT: [1], (Source 1), Source[1], (1), ...

	2. Multiple Citations (SAME TYPE) -> COMBINE:
	- CORRECT: [Source 1, 2]
	- CORRECT: [Image 1, 3]
	- INCORRECT: [Source 1][Source 2]

	3. Multiple Citations (DIFFERENT TYPES) -> SEPARATE:
	- CORRECT: [Source 1], [Image 2]
	- INCORRECT (NEVER DO THIS): [Source 1, Image 2]
	- INCORRECT: [Source 1; Image 2]

	4. Forbidden Characters:
	- Do NOT use parentheses like (Source 1).
	- Do NOT use standalone numbers like [1].

	5. Enforcement Example:
	- "According to [Source 1], the tank is visible in [Image 2, 3]."
	- "This is confirmed by [Source 1, 2] and shown in [Image 1]."

	6. No Citation Case (STRICT)

	- Never invent, assume, or auto-append citations.

	- Citations must appear only if they already exist in the original text.

	- Adding fake or guessed citations is considered a critical error.

	Example (CORRECT):

	"The system validates citation formatting."

	Example (INCORRECT):

	"The system validates citation formatting. [Source 1]"
	==================================================

	Original Text to correct:
	---
	{report}
	---

	Example:

	"The data was collected from an official report Source(1)"
	"This information is supported by multiple references Source[1, 2, 3]"

	CORRECT:

	"The data was collected from an official report [Source 1]"
	"This information is supported by multiple references [Source 1, 2, 3]"

	INCORRECT:
	"The data was collected from an official report [1]"
	"The data was collected from an official report (Source 1)"
	"The data was collected from an official report Source[1]"
	"This information is supported by multiple references [Source 1][Source 2]"
	"This information is supported by multiple references [Source 1, Source 2]"

	Return only the corrected text, with no preamble or explanation.
	If no citation exists in the text, do not add any citation.

	"""

		def _schema(self) -> dict:
			"""Return the JSON schema requested from the LLM.

			The schema describes an object with one required string property,
			``normalized_text``.
			"""
			return {
				"type": "object",
				"properties": {
					"normalized_text": {
						"type": "string",
						"description": "The text with all citation formats corrected according to the provided rules. If no citations are present, this will be the original text."
					}
				},
				"required": ["normalized_text"]
			}

		@staticmethod
		def _extract_normalized_text(structured_response: object, fallback: str) -> str:
			"""Return the usable ``normalized_text`` from a reply, else ``fallback``.

			Args:
				structured_response: Whatever the LLM connector returned.
				fallback: Text to return when the reply is unusable.

			Returns:
				``structured_response["normalized_text"]`` when that lookup
				succeeds and yields a non-blank string; ``fallback`` otherwise.
			"""
			# EAFP lookup: a missing key, a None reply, or a non-mapping reply
			# (for example a raw JSON string) all fall back instead of raising.
			try:
				candidate = structured_response["normalized_text"]
			except (KeyError, IndexError, TypeError):
				return fallback

			# normalize() only reaches the LLM with a non-empty report, so a
			# blank or non-string result is never a valid normalization.
			if isinstance(candidate, str) and candidate.strip():
				return candidate
			return fallback

		async def normalize(self, report: str) -> str:
			"""Return ``report`` with its citations reformatted by the LLM.

			Args:
				report: The text whose citation style should be normalized.

			Returns:
				The LLM's ``normalized_text`` when the reply contains a non-blank
				string under that key. ``report`` is returned unchanged when it
				is falsy or not a ``str`` (no LLM call is made in that case), or
				when the reply is unusable.
			"""
			if not report or not isinstance(report, str):
				return report

			structured_response = await self.llm.call_with_structured_output(
				prompt=self._prompt(report),
				system_prompt=self._system_prompt(),
				schema=self._schema(),
			)

			return self._extract_normalized_text(structured_response, report)