Spaces:

jehyn923
/

record-ocr

Runtime error

App Files Files Community

record-ocr / watch.py

jehyn923

Update watch.py

9906f58 verified over 1 year ago

raw

history blame contribute delete

17.7 kB

	import json
	import os
	from bs4 import BeautifulSoup
	import re
	from typing import List, Dict
	from pydantic import BaseModel

	from langchain_openai import ChatOpenAI
	from langchain_anthropic import ChatAnthropic
	from langchain_core.prompts import ChatPromptTemplate
	from langchain_upstage import UpstageDocumentParseLoader

	# Read eagerly: a missing OPENAI_API_KEY raises KeyError when this module
	# is imported.
	openai_api_key = os.environ["OPENAI_API_KEY"]

	# Module-level chat clients, both configured with temperature 0.
	gpt_4o_mini = ChatOpenAI(
	    model="gpt-4o-mini",
	    temperature=0,
	    openai_api_key=openai_api_key,
	)
	gpt_4o = ChatOpenAI(
	    model="gpt-4o",
	    temperature=0,
	    openai_api_key=openai_api_key,
	)

	# Prompt template (Korean) that asks the model to map the HTML of the
	# creative experiential activities chapter onto the JSON layout shown in
	# <creative-experiential-activities-json>, without omissions, fixing only
	# typos and spacing, and answering with JSON only.
	# `{creative-experiential-activities-html}` is the single placeholder; the
	# doubled braces are literal braces once the template is formatted.
	# Every "grade" value in the example objects is written the same way
	# (quoted) so the layout shown to the model is consistent.
	creative_experiential_activities_prompt = """
	<instructions>
	아래 <creative-experiential-activities-html>을 <creative-experiential-activities-json>형식으로 매핑해서 응답으로 전달해줘.
	원문의 누락이 있어선 안되고, 가끔 오타가 있거나 문맥에 맞지 않은 띄어쓰기가 있다면 너의 판단에 수정해서 전달해줘.
	다시한번 말하지만 절대로 응답에 각색이나 누락이 있으면 안돼.
	areaOfInterest는 희망분야를 의미하며 진로활동에만 들어가.
	areaOfInterest는 그리고 희망분야라는 단어를 뺀 내용들만 들어가면 돼
	areaOfInterest가 없는 경우엔 빈 문자열로 처리해줘.

	output은 json만 전달해줘.
	</instructions>

	<creative-experiential-activities-html>
	{creative-experiential-activities-html}
	</creative-experiential-activities-html>

	<creative-experiential-activities-json>
	{{
	//창의적 체험활동사항
	"creativeExperientialActivities": [
	{{
	//영역
	"activityType": "AUTONOMOUS", // AUTONOMOUS, CLUB, CAREER
	//학년
	"grade": "1", (integer)
	//시간
	"hours": "", (integer)
	//특기사항
	"specialNotes": "", (string)
	//희망분야
	"areaOfInterest": "" (string)
	}},
	{{
	"activityType": "CLUB", // AUTONOMOUS, CLUB, CAREER
	"grade": "1", //학년
	"hours": "",
	"specialNotes": "",
	"areaOfInterest": ""
	}},
	{{
	"activityType": "CAREER", // AUTONOMOUS, CLUB, CAREER
	"grade": "1", //학년
	"hours": "",
	"specialNotes": "",
	"areaOfInterest": ""
	}},
	{{
	"activityType": "AUTONOMOUS", // AUTONOMOUS, CLUB, CAREER
	"grade": "2", //학년
	"hours": "",
	"specialNotes": "",
	"areaOfInterest": ""
	}},
	{{
	"activityType": "CLUB", // AUTONOMOUS, CLUB, CAREER
	"grade": "2", //학년
	"hours": "",
	"specialNotes": "",
	"areaOfInterest": ""
	}},
	{{
	"activityType": "CAREER", // AUTONOMOUS, CLUB, CAREER
	"grade": "2", //학년
	"hours": "",
	"specialNotes": "",
	"areaOfInterest": ""
	}},
	{{
	"activityType": "AUTONOMOUS", // AUTONOMOUS, CLUB, CAREER
	"grade": "3", //학년
	"hours": "",
	"specialNotes": "",
	"areaOfInterest": ""
	}},
	{{
	"activityType": "CLUB", // AUTONOMOUS, CLUB, CAREER
	"grade": "3", //학년
	"hours": "",
	"specialNotes": "",
	"areaOfInterest": ""
	}},
	{{
	"activityType": "CAREER", // AUTONOMOUS, CLUB, CAREER
	"grade": "3", //학년
	"hours": "",
	"specialNotes": "",
	"areaOfInterest": ""
	}},
	],
	}}
	</creative-experiential-activities-json>
	"""

	# Prompt template (Korean) that asks the model to map one slice of the
	# academic achievement chapter onto the JSON layout shown in
	# <academic-achievement-json>, assigning each "detailed abilities" note to
	# its course and semester, and answering with JSON only.
	# Placeholders: `{grade-course-type}` and `{academic-achievement-html}`;
	# the doubled braces are literal braces once the template is formatted.
	# NOTE(review): the last example object pairs subject "국어" with course
	# "수학"; this looks like a copy-paste slip but is left unchanged here.
	academic_achievement_prompt = """
	<instructions>
	아래 <academic-achievement-html>을 <academic-achievement-json>형식으로 매핑해서 응답으로 전달해줘.
	학년과 course type은 <grade-course-type>에 나와있을 예정이야.

	academic-achievement-html의 세부 능력 및 특기사항엔 교과, 과목에 대한 학생의 상세 성취 및 특기사항이 들어가고 있어.
	세부능력 및 특기사항 아래의 내용은 아래처럼 구성되어 있으니까 잘 참고해서 교과목별로 잘 할당해서 매핑해줘
	세부능력 및 특기사항을 아주 꼼꼼히 확인해서 응답해줘.
	- 교과목 : 특기사항 및 성취도
	- 교과목에 학기가 나눠져서 작성되어 있는 경우엔 그에 맞게 매핑해주면 돼.
	- 만약 교과목에 학기가 명확히 나눠져있거나 명시되어 있지 않고, 1학기 2학기 모두에 해당 교과목이 있는 경우엔 1학기에만 매핑해주고 2학기는 빈칸으로 전달해줘.

	원문의 누락이 있어선 안되고, 가끔 오타가 있거나 문맥에 맞지 않은 띄어쓰기가 있다면 너의 판단에 수정해서 전달해줘.
	다시한번 말하지만 절대로 응답에 각색이나 누락이 있으면 안돼.

	확인해보고 전달할 응답값이 없다면 빈 응답값을 줘도 괜찮아.

	output은 json만 전달해줘.
	</instructions>

	<grade-course-type>
	{grade-course-type}
	</grade-course-type>

	<academic-achievement-html>
	{academic-achievement-html}
	</academic-achievement-html>

	<academic-achievement-json>
	{{
	//교과학습발달상황
	"academicAchievement": [
	{{
	// 기본, 진로선택과목, 체육, 예술 등 교과의 type
	"courseType": "BASIC", //BASIC(기본), CAREER(진로 선택 과목), PE_ARTS(체육, 예술)
	"grade": 1, //학년
	"semester": 1, //학기 either 1 or 2
	"subject": "국어", // 교과
	"course": "국어", // 과목
	//세부능력 및 특기사항 표아래에 각 과목 아래 내용
	"detailedAbilities": "" //세부능력및특기사항 과목별 분리
	}},
	{{
	"courseType": "BASIC", //BASIC(기본), CAREER(진로 선택 과목), PE_ARTS(체육, 예술)
	"grade": 1, //학년
	"semester": 1, //학기
	"subject": "수학", // 교과
	"course": "수학", // 과목
	"detailedAbilities": "" //세부능력및특기사항 과목별 분리
	}}, ...
	{{
	"courseType": "BASIC", //BASIC(기본), CAREER(진로 선택 과목), PE_ARTS(체육, 예술)
	"grade": 1, //학년
	"semester": 2, //학기
	"subject": "국어", // 교과
	"course": "수학", // 과목
	"detailedAbilities": "" //세부능력및특기사항 과목별 분리
	}},
	],
	}}
	</academic-achievement-json>
	"""


	class HTMLParser:
	    """Thin wrapper that turns an HTML string into a BeautifulSoup tree."""

	    @staticmethod
	    def parse(html_content: str) -> BeautifulSoup:
	        """Parse *html_content* with the ``html.parser`` backend.

	        Args:
	            html_content: Markup to parse.

	        Returns:
	            The ``BeautifulSoup`` document built from the markup.
	        """
	        parser_name = 'html.parser'
	        document = BeautifulSoup(html_content, parser_name)
	        return document

	class ChapterExtractor:
	    """Split a parsed document into chapters keyed by their heading text.

	    A chapter heading is a ``<p data-category="paragraph">`` element whose
	    stripped inner markup equals one of ``desired_keys``.
	    """

	    def __init__(self, soup: BeautifulSoup, desired_keys: List[str]):
	        """Store the parsed document and the heading texts to look for.

	        Args:
	            soup: Parsed document to scan.
	            desired_keys: Exact heading texts that start a chapter.
	        """
	        self.soup = soup
	        self.desired_keys = desired_keys

	    def extract(self) -> Dict[str, str]:
	        """Return a mapping of heading text to that chapter's HTML.

	        Headings that never occur are absent from the result. When a
	        heading occurs more than once, the last occurrence wins. An empty
	        ``desired_keys`` list yields an empty mapping.

	        Returns:
	            ``{heading: html}`` where ``html`` is the stripped
	            concatenation of the elements following the heading.
	        """
	        result_dict = {}
	        paragraph_tags = self.soup.find_all('p', {'data-category': 'paragraph'})

	        for tag in paragraph_tags:
	            key = tag.decode_contents().strip()
	            if key in self.desired_keys:
	                result_dict[key] = self._extract_value(tag).strip()

	        return result_dict

	    def _is_heading(self, tag) -> bool:
	        """Tell whether *tag* is a paragraph holding one of the wanted headings."""
	        return (
	            tag.name == 'p'
	            and tag.get('data-category') == 'paragraph'
	            and tag.decode_contents().strip() in self.desired_keys
	        )

	    def _extract_value(self, tag) -> str:
	        """Concatenate the siblings after *tag* up to the next chapter heading.

	        Args:
	            tag: The heading element of the chapter being collected.

	        Returns:
	            The serialized following siblings, joined without separators.
	        """
	        parts = []
	        next_tag = tag.find_next_sibling()
	        while next_tag and not self._is_heading(next_tag):
	            parts.append(str(next_tag))
	            next_tag = next_tag.find_next_sibling()
	        return "".join(parts)

	class SubjectAchievementExtractor:
	    """Split one chapter's HTML into sections keyed by grade heading.

	    A grade heading is a ``<p data-category="paragraph">`` element whose
	    stripped inner markup matches ``[<digits>학년]``.
	    """

	    def __init__(self, chapter_content: str):
	        """Parse *chapter_content* so its paragraphs can be scanned."""
	        self.soup = HTMLParser.parse(chapter_content)

	    def extract(self) -> Dict[str, str]:
	        """Return ``{grade heading: html}`` for every grade heading found.

	        Each value starts with the heading element itself and runs up to,
	        but not including, the next grade heading.
	        """
	        pattern = r'^\[(\d+)학년\]$'
	        sections = {}

	        for paragraph in self.soup.find_all('p', {'data-category': 'paragraph'}):
	            heading = paragraph.decode_contents().strip()
	            if not re.match(pattern, heading):
	                continue
	            section_html = self._extract_value(paragraph, pattern)
	            sections[heading] = section_html.strip()

	        return sections

	    def _extract_value(self, tag, pattern) -> str:
	        """Serialize *tag* and its following siblings up to the next grade heading.

	        Args:
	            tag: The grade heading element that opens the section.
	            pattern: Regular expression identifying a grade heading.

	        Returns:
	            The serialized elements joined without separators.
	        """
	        pieces = [str(tag)]
	        sibling = tag.find_next_sibling()
	        while sibling:
	            opens_next_grade = (
	                sibling.name == 'p'
	                and sibling.get('data-category') == 'paragraph'
	                and re.match(pattern, sibling.decode_contents().strip())
	            )
	            if opens_next_grade:
	                break
	            pieces.append(str(sibling))
	            sibling = sibling.find_next_sibling()
	        return "".join(pieces)

	# Splits the HTML of one grade section into up to three pieces stored under
	# the keys "BASIC", "CAREER" and "PE_ARTS".
	# NOTE(review): this view of the file has lost its indentation; confirm the
	# nesting of the conditionals in extract() against the original layout.
	class CourseTypeExtractor:
	# grade_content: HTML of a single grade section; parsed once and kept on
	# the instance.
	def __init__(self, grade_content: str):
	self.grade_soup = HTMLParser.parse(grade_content)

	# Returns a dict mapping a course-type key to the stripped HTML collected
	# for it; returns an empty dict when the parsed content has no tags.
	def extract(self) -> Dict[str, str]:
	course_types = {}
	# find_all() is called without a filter, so every tag is visited.
	grade_paragraph_tags = self.grade_soup.find_all()
	course_type = "BASIC"
	is_first = True

	for tag in grade_paragraph_tags:
	key = tag.decode_contents().strip()
	# The first visited tag seeds the "BASIC" entry from its following
	# siblings.
	if is_first:
	value = self._extract_basic_course(tag)
	course_types[f"{course_type}"] = value.strip()
	is_first = False

	# A section header is recognised by angle brackets plus keywords in the
	# tag's inner markup.
	# NOTE(review): decode_contents() returns serialized markup; confirm that
	# literal angle brackets in header text are not entity-escaped there,
	# otherwise this test only passes for tags that contain child elements.
	if "<" in key and ">" in key:
	if "진로" in key and "선택" in key and "과목" in key:
	course_type = "CAREER"
	value = self._extract_career_course(tag)
	course_types[f"{course_type}"] = value.strip()

	# NOTE(review): presumably nested under the angle-bracket test like the
	# check above - confirm.
	if "체육" in key and "예술" in key:
	course_type = "PE_ARTS"
	value = self._extract_pe_arts_course(tag)
	course_types[f"{course_type}"] = value.strip()

	return course_types

	# Concatenates str() of the siblings following `tag`, stopping before the
	# first sibling whose inner markup contains "<", ">", "진로", "선택" and
	# "과목".
	def _extract_basic_course(self, tag) -> str:
	value = ""
	next_tag = tag.find_next_sibling()
	while next_tag and not (("<" in next_tag.decode_contents().strip() and
	">" in next_tag.decode_contents().strip()) and
	("진로" in next_tag.decode_contents().strip() and
	"선택" in next_tag.decode_contents().strip() and
	"과목" in next_tag.decode_contents().strip())):
	value += str(next_tag)
	next_tag = next_tag.find_next_sibling()
	return value

	# Concatenates str() of the siblings following `tag`, stopping before the
	# first sibling whose inner markup contains "<", ">", "체육" and "예술".
	def _extract_career_course(self, tag) -> str:
	value = ""
	next_tag = tag.find_next_sibling()
	while next_tag and not (("<" in next_tag.decode_contents().strip() and
	">" in next_tag.decode_contents().strip()) and
	("체육" in next_tag.decode_contents().strip() and
	"예술" in next_tag.decode_contents().strip())):
	value += str(next_tag)
	next_tag = next_tag.find_next_sibling()
	return value

	# Concatenates str() of every sibling following `tag`; there is no stop
	# condition other than running out of siblings.
	def _extract_pe_arts_course(self, tag) -> str:
	value = ""
	next_tag = tag.find_next_sibling()
	while next_tag:
	value += str(next_tag)
	next_tag = next_tag.find_next_sibling()
	return value


	class DocumentProcessor:
	    """Load a document as HTML and break it into chapters, grades and course types."""

	    def __init__(self, file_path: str):
	        """Create the loader for *file_path* and fix the chapter headings to extract.

	        Args:
	            file_path: Path of the document handed to the loader.
	        """
	        self.file_path = file_path
	        self.loader = UpstageDocumentParseLoader(
	            file_path,
	            ocr="force",
	            output_format="html",
	        )
	        # Heading texts that mark the start of each chapter of interest.
	        self.desired_keys = [
	            '4. 자격증 및 인증 취득상황',
	            '5. 창의적 체험활동상황',
	            '6. 교과학습발달상황',
	            '7. 독서활동상황',
	            '8. 행동특성 및 종합의견',
	        ]

	    def process(self):
	        """Run the whole extraction pipeline.

	        Returns:
	            A ``(chapters, subject_achievement, course_types)`` tuple:
	            chapter HTML by heading, academic chapter HTML by grade
	            heading, and per-grade HTML by course type.
	        """
	        document = HTMLParser.parse(self._load_document())
	        chapters = ChapterExtractor(document, self.desired_keys).extract()

	        subject_achievement = self._process_subject_achievement(chapters)
	        course_types = self._process_course_types(subject_achievement)

	        return chapters, subject_achievement, course_types

	    def _load_document(self) -> str:
	        """Load every page and return their contents joined into one string."""
	        page_contents = [page.page_content for page in self.loader.load()]
	        return "".join(page_contents)

	    def _process_subject_achievement(self, chapters: Dict[str, str]) -> Dict[str, str]:
	        """Split the academic achievement chapter by grade.

	        Returns an empty dict when that chapter was not found.
	        """
	        academic_heading = '6. 교과학습발달상황'
	        if academic_heading not in chapters:
	            return {}
	        return SubjectAchievementExtractor(chapters[academic_heading]).extract()

	    def _process_course_types(self, subject_achievement: Dict[str, str]) -> Dict[str, Dict[str, str]]:
	        """Split each grade's HTML by course type, keeping the grade keys."""
	        return {
	            grade: CourseTypeExtractor(content).extract()
	            for grade, content in subject_achievement.items()
	        }

	# One entry of the creative experiential activities section, in the shape
	# the creative activities prompt asks the model to return.
	class CreativeExperientialActivity(BaseModel):
	# Activity area; the prompt lists AUTONOMOUS, CLUB and CAREER.
	activityType: str
	# School grade.
	grade: int
	# Hours recorded for the activity.
	hours: int
	# Free-text special notes.
	specialNotes: str
	# Desired field of interest; the prompt asks for an empty string when absent.
	areaOfInterest: str

	# One course entry of the academic achievement section, in the shape the
	# academic achievement prompt asks the model to return.
	class AcademicAchievement(BaseModel):
	# Course category; the prompt lists BASIC, CAREER and PE_ARTS.
	courseType: str
	# School grade.
	grade: int
	# Semester; the prompt describes it as either 1 or 2.
	semester: int
	# Subject area.
	subject: str
	# Course name within the subject area.
	course: str
	# Detailed abilities and special notes for this course.
	detailedAbilities: str


	# Top-level shape of the model's JSON reply for the academic achievement
	# prompt; ocr_process validates each reply against it.
	class AcademicAchievementResult(BaseModel):
	academicAchievement: List[AcademicAchievement]

	# Top-level shape of the model's JSON reply for the creative activities
	# prompt; ocr_process validates the reply against it.
	class CreativeExperientialActivityResult(BaseModel):
	creativeExperientialActivities: List[CreativeExperientialActivity]

	# Combined result returned by ocr_process: the creative activities plus the
	# academic achievements gathered across all grade / course-type slices.
	class OcrResult(BaseModel):
	creativeExperientialActivities: List[CreativeExperientialActivity]
	academicAchievement: List[AcademicAchievement]

	# Suffix appended to the original prompt by get_complete_response when a
	# reply was cut off. `{latest-response}` is filled with the content of the
	# previous reply so the model can continue from where it stopped.
	continuation_prompt = """
	아래 latest-response는 이전 응답의 마지막 부분입니다.
	respond from where you left off.
	your response would be appended to the latest-response, so there is no need to include the latest-response in your response.
	your response appended to latest-response should be a full json output format.
	if there is new line space remove them.

	<latest-response>
	{latest-response}
	</latest-response>
	"""

	def clean_json_output(output):
	    """Strip a surrounding Markdown code fence from a model reply.

	    Removes a leading triple-backtick fence together with an optional
	    ``json`` language tag (any letter case), removes a trailing
	    triple-backtick fence, and trims surrounding whitespace. Text without
	    fences is only trimmed.

	    Args:
	        output: Raw reply text.

	    Returns:
	        The reply with fences and surrounding whitespace removed.
	    """
	    text = output.strip()
	    if text.startswith("```"):
	        text = text[3:]
	        # The opening fence may or may not carry a language tag.
	        if text[:4].lower() == "json":
	            text = text[4:]
	    if text.endswith("```"):
	        text = text[:-3]
	    return text.strip()

	def get_complete_response(client, prompt, input: dict):
	    """Invoke *client* with *prompt*, requesting continuations while the reply is truncated.

	    The first reply is printed. As long as a reply reports that it was cut
	    off by the output-token limit, the prompt is re-sent with
	    ``continuation_prompt`` appended and the previous reply supplied as
	    ``latest-response``; every reply is cleaned with ``clean_json_output``
	    and the pieces are concatenated.

	    Args:
	        client: Chat model that the prompt template is piped into.
	        prompt: Human-message template text.
	        input: Template variables. The dict is copied, so the caller's
	            object is left unchanged.

	    Returns:
	        The concatenated, cleaned reply text.
	    """
	    system_message = ("system", "You are a tutor for a high school student")

	    def _build_chain(human_prompt):
	        # Same system message for the first call and for continuations.
	        template = ChatPromptTemplate.from_messages(
	            messages=[system_message, ("human", human_prompt)]
	        )
	        return template | client

	    def _was_truncated(reply):
	        # The file's own code uses two spellings for the token-limit stop:
	        # "finish_reason" == "length" and "stop_reason" == "max_tokens".
	        # A missing key is treated as "not truncated" instead of raising.
	        metadata = reply.response_metadata or {}
	        return (
	            metadata.get("finish_reason") == "length"
	            or metadata.get("stop_reason") == "max_tokens"
	        )

	    # Work on a copy so the caller's dict never gains "latest-response".
	    variables = dict(input)

	    response = _build_chain(prompt).invoke(variables)
	    print(response)
	    full_response = clean_json_output(response.content)

	    continuation_chain = None
	    while _was_truncated(response):
	        if continuation_chain is None:
	            # Built once; the continuation template never changes.
	            continuation_chain = _build_chain(prompt + continuation_prompt)
	        variables["latest-response"] = response.content
	        response = continuation_chain.invoke(variables)
	        full_response += clean_json_output(response.content)

	    return full_response


	def ocr_process(pdfFile: str):
	    """Extract creative activities and academic achievements from a document.

	    The document is split into chapters by ``DocumentProcessor``. The
	    creative activities chapter is sent to ``gpt_4o`` once; the academic
	    chapter is sent once per (grade heading, course type) slice. Every
	    reply is parsed as JSON and validated against the pydantic models.
	    Intermediate values and the final JSON are printed.

	    Args:
	        pdfFile: Path of the document to process.

	    Returns:
	        An ``OcrResult`` holding all validated entries.

	    Raises:
	        KeyError: If the '5. 창의적 체험활동상황' chapter was not found.
	        Exceptions raised by ``json.loads`` or ``model_validate`` when a
	        reply is not valid JSON or does not match the expected shape.
	    """
	    processor = DocumentProcessor(pdfFile)
	    chapters, subject_achievement, course_types = processor.process()

	    print("Chapters:", chapters)
	    print("Subject Achievement:", subject_achievement)
	    print("Course Types:", course_types)

	    creative_response = get_complete_response(
	        gpt_4o,
	        creative_experiential_activities_prompt,
	        {"creative-experiential-activities-html": chapters['5. 창의적 체험활동상황']},
	    )

	    print("Response: " + creative_response)
	    creative_result = CreativeExperientialActivityResult.model_validate(json.loads(creative_response))
	    creative_activities = creative_result.creativeExperientialActivities

	    print("-----학년별 항목별 교과학습발달상황----")
	    print(course_types)

	    academic_results = []
	    # course_types maps grade heading -> {course type -> HTML}.
	    for grade_key, html_by_course_type in course_types.items():
	        for course_type, course_html in html_by_course_type.items():
	            academic_response = get_complete_response(
	                gpt_4o,
	                academic_achievement_prompt,
	                {
	                    # Emitted as "<course type>_<grade heading>", the same
	                    # text the prompt has always received.
	                    "grade-course-type": f"{course_type}_{grade_key}",
	                    "academic-achievement-html": course_html,
	                },
	            )
	            result = AcademicAchievementResult.model_validate(json.loads(academic_response))
	            print("----full response of academic_response----")
	            print(result)

	            academic_results.extend(result.academicAchievement)
	    print(academic_results)
	    print("----결과 in json----")
	    print(json.dumps({
	        "creative-experiential-activity": [activity.model_dump() for activity in creative_activities],
	        "academic-achievements": [achievement.model_dump() for achievement in academic_results]
	    }, ensure_ascii=False, indent=4))

	    return OcrResult(
	        creativeExperientialActivities=creative_activities,
	        academicAchievement=academic_results,
	    )