seawolf2357 committed on
Commit
cb1f35e
·
verified ·
1 Parent(s): 608dd69

Update file_api.py

Browse files
Files changed (1) hide show
  1. file_api.py +67 -11
file_api.py CHANGED
@@ -441,9 +441,20 @@ def call_groq_api_stream(messages: List[Dict]) -> Generator[str, None, None]:
441
  yield f"❌ API 오류: {str(e)}"
442
 
443
 
444
- def fetch_announcement_detail(url: str) -> Tuple[str, List[Dict]]:
445
- """공고 상세 페이지에서 본문과 첨부파일 정보 추출"""
 
 
 
 
 
 
 
446
  try:
 
 
 
 
447
  headers = {
448
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
449
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
@@ -453,6 +464,8 @@ def fetch_announcement_detail(url: str) -> Tuple[str, List[Dict]]:
453
  response.raise_for_status()
454
  html_text = response.text
455
  soup = BeautifulSoup(html_text, 'html.parser')
 
 
456
  content_text = ""
457
  tables = soup.find_all('table')
458
  for table in tables:
@@ -462,12 +475,19 @@ def fetch_announcement_detail(url: str) -> Tuple[str, List[Dict]]:
462
  main_content = soup.find('div', {'id': 'container'}) or soup.find('main') or soup.find('article')
463
  if main_content and not content_text:
464
  content_text = main_content.get_text(separator='\n', strip=True)
465
- attachments = []
 
 
 
 
466
  for a_tag in soup.find_all('a', href=True):
467
  href = a_tag.get('href', '')
468
  href_clean = re.sub(r';jsessionid=[^?]*', '', href)
 
469
  if 'getImageFile.do' in href_clean or 'fileDown' in href_clean or 'atchFileId' in href_clean:
470
  filename = a_tag.get_text(strip=True)
 
 
471
  if filename in ['다운로드', '바로보기', '내려받기', '']:
472
  parent = a_tag.parent
473
  if parent:
@@ -482,24 +502,60 @@ def fetch_announcement_detail(url: str) -> Tuple[str, List[Dict]]:
482
  match = re.search(r'첨부파일\s+(.+?)\s+다운로드', title)
483
  if match:
484
  filename = match.group(1)
 
485
  if not filename or filename in ['다운로드', '바로보기', '내려받기']:
486
  filename = f"첨부파일_{len(attachments)+1}"
 
 
487
  if href_clean.startswith('/'):
488
  full_url = f"https://www.bizinfo.go.kr{href_clean}"
489
  elif href_clean.startswith('http'):
490
  full_url = href_clean
491
  else:
492
  continue
 
493
  ext = Path(filename).suffix.lower()
494
  if not ext:
495
  ext = '.unknown'
496
- if not any(att['url'] == full_url for att in attachments):
497
- attachments.append({
498
- "filename": filename,
499
- "url": full_url,
500
- "type": ext[1:] if ext.startswith('.') else ext
501
- })
502
- return content_text, attachments
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
503
  except Exception as e:
504
  import traceback
505
- return f"상세 정보 조회 실패: {str(e)}\n{traceback.format_exc()}", []
 
441
  yield f"❌ API 오류: {str(e)}"
442
 
443
 
444
+ def fetch_announcement_detail(url: str) -> Tuple[str, List[Dict], Optional[Dict]]:
445
+ """공고 상세 페이지에서 본문, 첨부파일, 본문출력파일 정보 추출
446
+
447
+ Returns:
448
+ (content_text, attachments, print_file)
449
+ - content_text: 공고 본문 텍스트
450
+ - attachments: 일반 첨부파일 리스트 (서식, 양식 등)
451
+ - print_file: 본문출력파일 (공고문 PDF/HWP) - AI 분석용
452
+ """
453
  try:
454
+ # URL 정규화
455
+ if url.startswith('/'):
456
+ url = f"https://www.bizinfo.go.kr{url}"
457
+
458
  headers = {
459
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
460
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
 
464
  response.raise_for_status()
465
  html_text = response.text
466
  soup = BeautifulSoup(html_text, 'html.parser')
467
+
468
+ # 본문 텍스트 추출
469
  content_text = ""
470
  tables = soup.find_all('table')
471
  for table in tables:
 
475
  main_content = soup.find('div', {'id': 'container'}) or soup.find('main') or soup.find('article')
476
  if main_content and not content_text:
477
  content_text = main_content.get_text(separator='\n', strip=True)
478
+
479
+ attachments = [] # 일반 첨부파일
480
+ print_file = None # 본문출력파일
481
+
482
+ # 파일 링크 추출
483
  for a_tag in soup.find_all('a', href=True):
484
  href = a_tag.get('href', '')
485
  href_clean = re.sub(r';jsessionid=[^?]*', '', href)
486
+
487
  if 'getImageFile.do' in href_clean or 'fileDown' in href_clean or 'atchFileId' in href_clean:
488
  filename = a_tag.get_text(strip=True)
489
+
490
+ # 파일명 추출 개선
491
  if filename in ['다운로드', '바로보기', '내려받기', '']:
492
  parent = a_tag.parent
493
  if parent:
 
502
  match = re.search(r'첨부파일\s+(.+?)\s+다운로드', title)
503
  if match:
504
  filename = match.group(1)
505
+
506
  if not filename or filename in ['다운로드', '바로보기', '내려받기']:
507
  filename = f"첨부파일_{len(attachments)+1}"
508
+
509
+ # URL 정규화
510
  if href_clean.startswith('/'):
511
  full_url = f"https://www.bizinfo.go.kr{href_clean}"
512
  elif href_clean.startswith('http'):
513
  full_url = href_clean
514
  else:
515
  continue
516
+
517
  ext = Path(filename).suffix.lower()
518
  if not ext:
519
  ext = '.unknown'
520
+
521
+ file_info = {
522
+ "filename": filename,
523
+ "url": full_url,
524
+ "type": ext[1:] if ext.startswith('.') else ext
525
+ }
526
+
527
+ # ⭐ 본문출력파일 판별 (공고문 PDF/HWP)
528
+ # 패턴: "(제XXXX-XX호)", "공고", "공고문", "본문", 날짜 패턴
529
+ is_print_file = False
530
+ filename_lower = filename.lower()
531
+
532
+ # 1. "(제XXXX-XX호)" 패턴 - 공고번호 포함
533
+ if re.search(r'\(제\d+[-_]?\d*호\)', filename):
534
+ is_print_file = True
535
+ # 2. "공고", "공고문", "모집공고" 등 키워드
536
+ elif any(kw in filename for kw in ['공고문', '모집공고', '공고(안)', '공고 안', '_공고_', '_공고.']):
537
+ is_print_file = True
538
+ # 3. "본문출력" 키워드
539
+ elif '본문출력' in filename or '본문 출력' in filename:
540
+ is_print_file = True
541
+ # 4. 부모 요소에 "본문출력파일" 텍스트가 있는지 확인
542
+ parent = a_tag.parent
543
+ grandparent = parent.parent if parent else None
544
+ for ancestor in [parent, grandparent]:
545
+ if ancestor:
546
+ ancestor_text = ancestor.get_text(strip=True)
547
+ if '본문출력파일' in ancestor_text or '본문출력 파일' in ancestor_text:
548
+ is_print_file = True
549
+ break
550
+
551
+ if is_print_file and not print_file:
552
+ print_file = file_info
553
+ elif not any(att['url'] == full_url for att in attachments):
554
+ # "(첨부" 로 시작하는 파일은 일반 첨부파일
555
+ attachments.append(file_info)
556
+
557
+ return content_text, attachments, print_file
558
+
559
  except Exception as e:
560
  import traceback
561
+ return f"상세 정보 조회 실패: {str(e)}\n{traceback.format_exc()}", [], None