Haneen211 committed on
Commit
2666071
·
verified ·
1 Parent(s): 0563899

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -2
app.py CHANGED
@@ -47,6 +47,49 @@ prompt = ChatPromptTemplate.from_messages([
47
  question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
48
  rag_chain = create_retrieval_chain(retriever, question_answer_chain)
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  def format_latex_for_gradio(text):
51
  """Convert LaTeX delimiters to Gradio-friendly format while preserving markdown"""
52
  # Convert display math \[ ... \] to $$ ... $$
@@ -68,10 +111,15 @@ def chat_function(message, history):
68
  # Extract sources from retrieved documents
69
  source_documents = response.get("context", [])
70
  unique_sources = []
 
 
71
  for doc in source_documents:
72
  source = doc.metadata.get('source', None)
73
- if source and source not in unique_sources:
74
- unique_sources.append(source)
 
 
 
75
 
76
  # Remove ANY existing "Sources:" section that the LLM generated
77
  if "Sources:" in final_answer:
 
47
  question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
48
  rag_chain = create_retrieval_chain(retriever, question_answer_chain)
49
 
50
def format_citation(filename):
    """Convert a source filename into a human-readable citation string.

    Recognised layouts (a trailing ``.pdf``/``.txt`` extension is removed first):

    * ``Author_Year - Title``   -> ``Author (Year) – Title``
    * ``Author (Year) - Title`` -> ``Author (Year) – Title``
    * anything else             -> underscores become spaces and a hyphen that
      is surrounded by whitespace becomes an en dash.

    Author names containing ``et al.``, ``&`` or extra underscores are kept
    intact (underscores are turned into spaces).

    Examples:
        'Ng et al._2023 - Problems and Solutions' -> 'Ng et al. (2023) – Problems and Solutions'
        'Godley & Xia_2016 - Physics Guide'       -> 'Godley & Xia (2016) – Physics Guide'
        'Khan (2003) - Therapy'                   -> 'Khan (2003) – Therapy'
        'Image-Guided_Therapy - Notes'            -> 'Image-Guided Therapy – Notes'

    Args:
        filename: The source name to format. ``None`` or an empty string
            yields an empty string.

    Returns:
        The formatted citation.
    """
    if not filename:
        return ""

    # Remove the file extension if present (case-insensitive).
    stem = re.sub(r'\.(pdf|txt)$', '', filename.strip(), flags=re.IGNORECASE)

    # Both structured layouts capture (author, year, title); they differ only
    # in how the year is delimited, so they share one formatting path.
    citation_patterns = (
        r'^(.+?)_(\d{4})\s*-\s*(.+)$',          # Author_YEAR - Title
        r'^(.+?)\s*\((\d{4})\)\s*-\s*(.+)$',    # Author (YEAR) - Title
    )
    for pattern in citation_patterns:
        match = re.match(pattern, stem)
        if match:
            author, year, title = match.groups()
            # Underscores inside the author part stand in for spaces.
            author = author.replace('_', ' ').strip()
            return f"{author} ({year}) – {title.strip()}"

    # Fallback: no recognisable year. Only a hyphen with whitespace on both
    # sides is treated as a separator, so hyphenated words such as "X-ray"
    # are left untouched.
    cleaned = stem.replace('_', ' ')
    return re.sub(r'\s+-\s+', ' – ', cleaned)
92
+
93
  def format_latex_for_gradio(text):
94
  """Convert LaTeX delimiters to Gradio-friendly format while preserving markdown"""
95
  # Convert display math \[ ... \] to $$ ... $$
 
111
  # Extract sources from retrieved documents
112
  source_documents = response.get("context", [])
113
  unique_sources = []
114
+ seen_sources = set()
115
+
116
  for doc in source_documents:
117
  source = doc.metadata.get('source', None)
118
+ if source and source not in seen_sources:
119
+ # Format the citation properly
120
+ formatted_source = format_citation(source)
121
+ unique_sources.append(formatted_source)
122
+ seen_sources.add(source)
123
 
124
  # Remove ANY existing "Sources:" section that the LLM generated
125
  if "Sources:" in final_answer: