File size: 39,967 Bytes
158410c
 
 
0527a96
 
 
 
 
 
158410c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13a6c4d
 
 
9d1f0fc
13a6c4d
 
 
 
9d1f0fc
13a6c4d
 
0527a96
158410c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13a6c4d
 
158410c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13a6c4d
158410c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21842f3
158410c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
from typing import Dict, Any, Optional
from pydantic import BaseModel, Field
from uuid import uuid4
import os
from typing import Dict, Any, List, Optional, Tuple # Ensure Tuple is imported
from github import Github, PullRequest
from github.GithubException import GithubException, UnknownObjectException
import requests # Make sure requests is imported for patch_url
from dotenv import load_dotenv

class PRReviewState(BaseModel):
    # GitHub PR Information (Mandatory)
    pr_id: int
    repo_name: str # e.g., "owner/repo"

    # GitHub PR Information (Now Optional)
    diff_url: Optional[str] = None
    pr_title: Optional[str] = None
    pr_author: Optional[str] = None
    # review_run_id: str = Field(default_factory=lambda: str(uuid.uuid4())) # Optional: Unique ID for this specific review run

    # Code Content (Now Optional)
    code_diff: Optional[str] = None # The fetched raw diff content
    file_contents: Dict[str, str] = {} # Map of filename to full content for context (already has default)

    # LLM Review Outputs (Already Optional)
    llm_markdown_review: Optional[str] = None # The raw Markdown output from the LLM (e.g., from generate_code_review_markdown)
    parsed_llm_review_data: Optional[Dict[str, Any]] = None # Structured dict from parsing the Markdown (e.g., from parse_llm_review_markdown)

    # Human-in-the-Loop (Simplified for Phase 1) (Already Optional or has default)
    require_human_approval: bool = False # Config flag, set at graph initialization
    human_approval_status: Optional[bool] = None # True if approved, False if rejected
    human_feedback_message: Optional[str] = None # Any message from human rejection

    # System Status (Now Optional or has default)
    review_status: str = "initiated" # e.g., "initiated", "fetching_code", "code_retrieved", "generating_llm_review", "llm_review_generated", "parsing_llm_review", "review_parsed", "awaiting_human_approval", "posting_review", "posted", "rejected", "failed"
    last_error: Optional[str] = None # Stores the last encountered error message
    # error_traceback: Optional[str] = None # Optional: For more detailed error debugging
    review_id: Optional[int] = None # this is the pull-request-review-id (pending -> approved/Discarded based on HIL) (Now Optional)
    review_comment_url: Optional[str] = None # URL of the main posted GitHub review comment

'''
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
git_hub_token = user_secrets.get_secret("GITHUB_token_ID")
google_api_key = user_secrets.get_secret("GOOGLE_API_KEY")
'''

google_api_key = os.getenv("GOOGLE_API_KEY")
if not google_api_key:
    print("Google API key not found in environment variables.")
print(f"Using Google API key: {google_api_key[:4]}... (truncated for security)")

git_hub_token = os.getenv("GITHUB_token_ID")
if not git_hub_token:
    print("git_hub_token not found in environment variables.")
print(f"git_hub_token : {git_hub_token[:4]}... (truncated for security)")




# For local testing, you might need to load dotenv if your environment variables
# are managed via a .env file. In a deployed environment, they would likely be
# set directly.
# Only load dotenv if it's not already loaded (e.g., in __main__ or a test setup)
# This prevents redundant loading in production or if your main script handles it.
if not os.getenv("GITHUB_TOKEN"): # Only load if token not already set
    load_dotenv() # Load environment variables from .env file

# Assuming 'git_hub_token' is defined globally or passed in a larger context
# If git_hub_token is expected to be a global variable, ensure it's imported or declared.
# For better practice, pass it as an argument or rely solely on os.getenv.
# Let's adjust to purely rely on os.getenv for this function.
# github_token = os.getenv("GITHUB_TOKEN") # Moved inside function for safety


def fetch_pr_code_changes(repo_name: str, pr_id: int) -> Tuple[Optional[str], Optional[Dict[str, str]], Optional[str], Optional[str]]:
    """
    Fetches the raw diff content, the full contents of changed files,
    and the head commit SHA for a given PR.

    Args:
        repo_name (str): The full name of the repository (e.g., "octocat/Spoon-Knife").
        pr_id (int): The ID of the Pull Request.

    Returns:
        Tuple[Optional[str], Optional[Dict[str, str]], Optional[str], Optional[str]]:
            - raw_diff_content (str or None): The raw diff content of the PR.
            - file_contents (Dict[str, str] or None): Dictionary mapping filename to its full content (after changes).
            - head_commit_sha (str or None): The SHA of the head commit of the PR.
            - error_message (str or None): An error message if something went wrong.
    """
    github_token = os.getenv("GITHUB_token_ID")
    #github_token = git_hub_token

    if not github_token:
        print("Error: GITHUB_TOKEN environment variable not set.")
        return None, None, None, "GitHub token not found in environment variables."

    try:
        g = Github(github_token)
        repo = g.get_repo(repo_name)
        pull_request = repo.get_pull(pr_id)

        # --- NEW: Get the head commit SHA ---
        head_commit_sha = pull_request.head.sha
        print(f"Fetched PR {pr_id} head commit SHA: {head_commit_sha}")


        # 1. Fetch raw diff content (patch)
        # Using requests directly for patch_url is good as PyGithub's get_patch() can sometimes be rate-limited differently
        patch_url = pull_request.patch_url
        headers = {"Authorization": f"token {github_token}"}
        raw_diff_content = requests.get(patch_url, headers=headers).text

        # 2. Fetch full content of changed files
        file_contents: Dict[str, str] = {}
        for file in pull_request.get_files():
            # Skip files that were deleted, as their content cannot be retrieved from the current head.
            if file.status == 'deleted':
                file_contents[file.filename] = "[FILE DELETED]"
                continue

            try:
                # We want the content *after* the change, which is from the PR's head branch.
                # PyGithub's get_contents should be called with `ref` set to `pull_request.head.ref`
                # or `pull_request.head.sha` for explicit content at the PR's head.
                # Using pull_request.head.sha is more robust as ref might change.
                file_content_obj = repo.get_contents(file.filename, ref=pull_request.head.sha)

                if isinstance(file_content_obj, list):
                    print(f"Warning: '{file.filename}' is a directory or multiple files, skipping content retrieval for now.")
                    file_contents[file.filename] = "[DIRECTORY OR MULTIPLE FILES]"
                    continue

                file_contents[file.filename] = file_content_obj.decoded_content.decode('utf-8')

            except GithubException as e:
                print(f"Warning: GitHub API error fetching content for {file.filename} (PR {pr_id}, Repo {repo_name}): {e.status} - {e.data.get('message', 'No message')}")
                file_contents[file.filename] = f"[ERROR: Could not fetch content. Status: {e.status}, Message: {e.data.get('message', 'No message')}]"
            except Exception as e:
                print(f"Unexpected error fetching content for {file.filename} (PR {pr_id}, Repo {repo_name}): {e}")
                file_contents[file.filename] = f"[ERROR: Unexpected error fetching content: {e}]"

        # Return the new head_commit_sha along with existing returns
        return raw_diff_content, file_contents, head_commit_sha, None # No error message if successful

    except UnknownObjectException as e:
        error_msg = f"GitHub object not found (repo or PR): {e.data.get('message', 'No message')}"
        print(f"Error in fetch_pr_code_changes: {error_msg}")
        return None, None, None, error_msg
    except GithubException as e:
        error_msg = f"GitHub API error for PR {pr_id} from {repo_name}: {e.status} - {e.data.get('message', 'No message')}"
        print(f"Error in fetch_pr_code_changes: {error_msg}")
        return None, None, None, error_msg
    except Exception as e:
        error_msg = f"An unexpected error occurred while fetching PR {pr_id} from {repo_name}: {e}"
        print(f"Error in fetch_pr_code_changes: {error_msg}")
        return None, None, None, error_msg

def code_retriever_node(state:PRReviewState):
    repo_name = state.repo_name
    pull_req_id = state.pr_id

    print(f"repo_name :{repo_name}-------- pull_req_id:{pull_req_id}")

    diff, contents,head_commit_sha, error = fetch_pr_code_changes(repo_name, pull_req_id)

    # Don't forget to return an updated state, as nodes in LangGraph should always do
    # For this simple example, we'll just return a copy with an updated status
    updated_state = state.model_copy(update={
        "review_status": "code_retrieved", # Update status after retrieval logic
        "code_diff": diff,
        "file_contents": contents
    })
    return updated_state

import os
from typing import Dict, Any
from langchain_core.prompts import ChatPromptTemplate
# Ensure you have your LLM provider installed, e.g., pip install langchain-google-genai
from langchain_google_genai import ChatGoogleGenerativeAI # Using Gemini as per your preference

# Initialize your LLM. Make sure your GOOGLE_API_KEY is set in environment variables.
# You can also configure other models like "gemini-1.5-flash" or "gemini-1.5-pro"
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.0, api_key=google_api_key) # Lower temperature for more deterministic output

def generate_code_review_markdown(code_diff: str, file_contents: Dict[str, str]) -> str:
    """
    Generates a detailed, human-readable code review in Markdown format from the LLM.

    The prompt is designed to elicit structured Markdown output that can then be
    parsed for GitHub PR comments, grouped by file and function.

    Args:
        code_diff (str): The string representation of the code diff.
        file_contents (Dict[str, str]): A dictionary where keys are file paths
                                        and values are their full content.

    Returns:
        str: A Markdown string representing the code review.
    """

    # Prepare full contents context
    full_contents_str = ""
    if file_contents:
        for filename, content in file_contents.items():
            # Add a clear separator and Markdown code block for each file
            full_contents_str += f"--- Full Content of {filename} ---\n```python\n{content}\n```\n\n"
    else:
        full_contents_str = "No full file contents provided for additional context."

    # Construct the Prompt Template
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system",
             "You are an expert Senior Software Engineer and a meticulous code reviewer.\n"
             "Your task is to review the provided code changes in a Pull Request.\n"
             "Analyze the `code_diff` for potential bugs, performance issues, security vulnerabilities, code style violations, maintainability concerns, and missing tests or documentation.\n"
             "Refer to the `full_file_contents` for additional context if the diff alone is insufficient to understand the changes or their implications.\n"
             "Provide a comprehensive, actionable, and constructive review.\n"
             "Format your review clearly using Markdown. Structure it with the following top-level sections:\n"
             "1. **Overall Impression:** A brief summary of the PR's purpose and overall quality.\n"
             "2. **Specific Observations and Suggestions:** Detailed feedback, grouped by file.\n"
             "   - Within each file's section, group related comments, ideally by function or logical block.\n"
             "   - For each observation/suggestion, include relevant line numbers from the *new* file for context (e.g., 'Line X-Y:').\n"
             "3. **Potential Issues and Edge Cases:** Discuss any missed scenarios or potential problems.\n"
             "4. **Security Implications:** Highlight any security concerns.\n"
             "5. **Adherence to Best Practices (PEP 8):** Comment on style and best practice compliance.\n"
             "6. **Performance Considerations:** Discuss performance aspects.\n"
             "7. **Unit Testing Suggestions:** Recommend additional tests.\n"
             "8. **Docstring/Comment Improvements:** Suggest documentation enhancements.\n"
             "9. **Clarity and Conciseness:** Feedback on code readability.\n"
             "10. **Summary:** A concise conclusion and recommended action (e.g., 'Approve', 'Request Changes', 'Comment').\n\n"
             "For code suggestions, use GitHub's Markdown code block with 'suggestion' annotation, like this:\n"
             "```suggestion\n"
             "your_suggested_code_here\n"
             "```\n"
             "Ensure file paths are correctly formatted (e.g., `src/utils/data_processor.py`)."
            ),
            ("human",
             "Here are the code changes (diff):\n"
             "```diff\n"
             "{code_diff}\n"
             "```\n\n"
             "Here are the full contents of the changed files (for additional context, use only if necessary to understand the diff):\n"
             "{full_contents_context}\n\n"
             "Please provide your structured code review in Markdown."
            ),
        ]
    )

    # Create the Chain
    review_chain = prompt | llm

    # Invoke the Chain
    try:
        review_markdown = review_chain.invoke({
            "code_diff": code_diff,
            "full_contents_context": full_contents_str
        }).content # Access the content attribute for Chat model output
        return review_markdown
    except Exception as e:
        print(f"Error generating code review: {e}")
        return f"Error: Could not generate code review. {e}\n\n" \
               f"Please check the LLM API call or token limits."

def code_reviewer_node(state:PRReviewState):
    code_diff = state.code_diff
    file_contents = state.file_contents

    review_markdown = generate_code_review_markdown(code_diff, file_contents)

    # Don't forget to return an updated state, as nodes in LangGraph should always do
    # For this simple example, we'll just return a copy with an updated status
    updated_state = state.model_copy(update={
        "review_status": "code_reviewed", # Update status after retrieval logic
        "llm_markdown_review":review_markdown,
    })
    return updated_state


import re
from typing import List, Dict, Tuple, Optional, Any

class ParsedComment:
    """
    Represents a single parsed comment from the LLM's review,
    intended for grouping by file/function.
    """
    def __init__(self, message: str, suggestion: Optional[str] = None):
        self.message = message
        self.suggestion = suggestion

    def __repr__(self):
        return f"ParsedComment(msg='{self.message[:50]}...', has_suggestion={self.suggestion is not None})"

class ParsedReviewSection:
    """
    Represents a categorized section of the review, e.g., 'Potential Issues'.
    """
    def __init__(self, title: str, content: str):
        self.title = title
        self.content = content

    def __repr__(self):
        return f"ParsedReviewSection(title='{self.title}', content='{self.content[:50]}...')"


# Helper to extract suggestion block and clean message
def _extract_suggestion(text: str) -> Tuple[Optional[str], str]:
    """Helper to extract suggestion block and clean message."""
    suggestion_match = re.search(r"```suggestion\n([\s\S]*?)\n```", text, re.MULTILINE)
    suggestion_code = suggestion_match.group(1).strip() if suggestion_match else None
    
    # Remove suggestion from the main message
    cleaned_message = re.sub(r"```suggestion[\s\S]*?```", "", text).strip()
    return suggestion_code, cleaned_message

def _parse_bullet_comments(text_block: str) -> List[ParsedComment]:
    """Helper to parse bullet-point comments from a given text block."""
    comments = []
    # FIX: Updated regex for bullet comments
    # Now matches from a bullet point until the start of the next bullet point or end of the text block.
    # This handles multi-line comments and embedded suggestion blocks more robustly.
    comment_matches = re.finditer(r"(^ *[-*]\s*[\s\S]*?)(?=\n *[-*]\s*|\Z)", text_block, re.MULTILINE | re.DOTALL)
    for cm in comment_matches:
        full_comment_text = cm.group(1).strip()
        if full_comment_text:
            suggestion_code, cleaned_message = _extract_suggestion(full_comment_text)
            comments.append(ParsedComment(message=cleaned_message, suggestion=suggestion_code))
    return comments


def parse_llm_review_markdown(markdown_review: str) -> Dict[str, Any]:
    """
    Parses the LLM-generated Markdown review into a structured dictionary.
    It extracts the overall summary, file-specific/function-specific comments,
    and other general review sections.

    Args:
        markdown_review (str): The full Markdown string generated by the LLM.

    Returns:
        Dict[str, Any]: A dictionary containing structured review data:
            - 'overall_impression': str
            - 'file_comments': Dict[str, Dict[str, List[ParsedComment]]]
              (file_path -> function_name -> List[ParsedComment])
            - 'general_sections': List[ParsedReviewSection]
            - 'summary': str
            - 'approval_status': str (extracted from summary, if present)
    """
    structured_review: Dict[str, Any] = {
        'overall_impression': '',
        'file_comments': {},
        'general_sections': [],
        'summary': '',
        'approval_status': 'Comment' # Default status
    }

    # Helper to extract content between two headers.
    # Now more flexible: allows optional numbering and variable header level for top sections
    def extract_section_content(text: str, start_header_text: str, end_header_text: str) -> Optional[str]:
        # Pattern to match headers with optional numbering and flexible spacing
        start_pattern = r"^(?:##|###)\s*\d*\.?\s*" + re.escape(start_header_text) + r":\s*$"
        end_pattern = r"^(?:##|###)\s*\d*\.?\s*" + re.escape(end_header_text) + r":\s*$"
        
        # Use re.DOTALL to allow . to match newlines
        match = re.search(f"{start_pattern}([\\s\\S]*?)(?={end_pattern}|\\Z)", text, re.MULTILINE | re.DOTALL)
        if match:
            return match.group(1).strip()
        return None

    # --- 1. Extract Overall Impression ---
    overall_impression_content = extract_section_content(markdown_review, "Overall Impression", "Specific Observations and Suggestions")
    if overall_impression_content:
        structured_review['overall_impression'] = overall_impression_content

    # --- 2. Extract Specific Observations and Suggestions (File/Function Comments) ---
    specific_obs_section_content = extract_section_content(markdown_review, "Specific Observations and Suggestions", "Potential Issues and Edge Cases")

    # Debug prints for specific_obs_section_content (kept for verification)
    print(f"\n--- DEBUG: specific_obs_section_content (extracted from markdown_review) ---")
    if specific_obs_section_content is None:
        print("specific_obs_section_content is None")
    elif not specific_obs_section_content.strip():
        print("specific_obs_section_content is empty or only whitespace")
    else:
        print(specific_obs_section_content[:500] + "..." if len(specific_obs_section_content) > 500 else specific_obs_section_content)
    print(f"--- END DEBUG: specific_obs_section_content ---\n")

    print(f"\n--- DEBUG: Raw specific_obs_section_content (using repr()):")
    if specific_obs_section_content is not None:
        print(repr(specific_obs_section_content))
        print(f"Length of specific_obs_section_content: {len(specific_obs_section_content)}")
        print(f"Does it start with '### `data_processor.py`'? {specific_obs_section_content.startswith('### `data_processor.py`')}")
        starts_as_file_header = False
        if specific_obs_section_content.startswith('### `') or specific_obs_section_content.startswith('**File:'):
            starts_as_file_header = True
        print(f"Does it start with a common file header pattern? {starts_as_file_header}")
    else:
        print("specific_obs_section_content is None.")
    print(f"--- END DEBUG: Raw specific_obs_section_content ---\n")


    if specific_obs_section_content:
        # NEW STRATEGY FOR FILE BLOCK PARSING:
        # Step 1: Find all file header line matches first
        file_header_line_pattern = re.compile(
            r"^(?:\*\*File:\s*`?([\w\/\.\-_]+\.\w+)`?\*\*|###\s*`?([\w\/\.\-_]+\.\w+)`?)\s*$",
            re.MULTILINE
        )

        header_matches = list(file_header_line_pattern.finditer(specific_obs_section_content))
        
        print(f"--- DEBUG: Number of file_header_line_pattern matches found (New Strategy): {len(header_matches)} ---")
        if not header_matches:
            print("No file headers were found. Cannot parse file blocks.")
            pass 
        else:
            # Step 2: Iterate through header matches and extract content blocks
            for i, header_match in enumerate(header_matches):
                file_name = (header_match.group(1) or header_match.group(2)).strip().replace('`', '')
                
                # Determine the start of the content block (after the header line)
                content_start_index = header_match.end() 
                
                # Determine the end of the content block (start of next header or end of section content)
                content_end_index = len(specific_obs_section_content)
                if i + 1 < len(header_matches):
                    content_end_index = header_matches[i+1].start()
                
                file_content_block = specific_obs_section_content[content_start_index:content_end_index].strip()

                print(f"\n--- DEBUG: Processing file (new strategy): {file_name} ---")
                print(f"File content block (first 200 chars):\n{file_content_block[:200]}..." if len(file_content_block) > 200 else file_content_block)
                
                if not file_name: continue

                structured_review['file_comments'][file_name] = {}
                general_comments_for_file: List[ParsedComment] = [] 

                # Refined split to capture general file comments and specific function/section comments
                # Matches '#### Function: `func_name`' OR '#### Any other section title'
                sub_section_header_pattern = re.compile(
                    r"^(####\s*(?:Function:\s*`?([\w_]+)`?|[\s\S]+?))\s*$",
                    re.MULTILINE
                )
                
                sub_section_matches_list = list(sub_section_header_pattern.finditer(file_content_block))
                print(f"--- DEBUG: Number of sub-section (####) matches for {file_name}: {len(sub_section_matches_list)} ---")
                if not sub_section_matches_list:
                    print(f"No '####' sub-sections were found in the block for {file_name}. All content will be general comments or missed.")
                    if file_content_block.strip():
                        parsed_general_comments = _parse_bullet_comments(file_content_block.strip())
                        structured_review['file_comments'][file_name]["General_File_Comments"] = parsed_general_comments
                        print(f"  - DEBUG: Parsed {len(parsed_general_comments)} general comments for {file_name}.")
                    continue 
                
                # If sub-sections (#### headers) ARE found, process comments before the first sub-section header (these are file-level comments)
                first_match_start_index = sub_section_matches_list[0].start() 
                pre_section_comments_content = file_content_block[:first_match_start_index].strip()
                if pre_section_comments_content:
                    general_comments_for_file.extend(_parse_bullet_comments(pre_section_comments_content))
                    print(f"  - DEBUG: Added {len(general_comments_for_file)} general comments (before first sub-section) for {file_name}.")


                # Process each sub-section
                for k, current_match in enumerate(sub_section_matches_list): 
                    section_header_raw = current_match.group(1).strip()
                    func_name_from_group = current_match.group(2) 
                    
                    section_title_key = ""
                    if func_name_from_group: 
                        section_title_key = func_name_from_group.replace('`', '')
                    else: 
                        section_title_key = section_header_raw[section_header_raw.find('####') + 4:].strip().replace('`', '')

                    content_start_index = current_match.end()
                    content_end_index = (sub_section_matches_list[k+1].start() 
                                         if k + 1 < len(sub_section_matches_list)
                                         else len(file_content_block))
                    
                    sub_section_content = file_content_block[content_start_index:content_end_index].strip()
                    
                    print(f"    - DEBUG: Sub-section '{section_title_key}' content (first 100 chars): {sub_section_content[:100]}..." if len(sub_section_content) > 100 else sub_section_content)
                    if sub_section_content:
                        parsed_comments_for_section = _parse_bullet_comments(sub_section_content)
                        structured_review['file_comments'][file_name][section_title_key] = parsed_comments_for_section
                        print(f"      - DEBUG: Parsed {len(parsed_comments_for_section)} comments for '{section_title_key}'.")
                    else:
                        structured_review['file_comments'][file_name][section_title_key] = []
                        print(f"      - DEBUG: No content for sub-section '{section_title_key}'.")

                if general_comments_for_file:
                    structured_review['file_comments'][file_name]["General_File_Comments"] = general_comments_for_file


    # --- 3. Extract General Sections ---
    general_section_headers = [
        ("Potential Issues and Edge Cases", "Potential Issues and Edge Cases"),
        ("Security Implications", "Security Implications"),
        ("Adherence to Best Practices (PEP 8)", "Adherence to Best Practices (PEP 8)"),
        ("Performance Considerations", "Performance Considerations"),
        ("Unit Testing Suggestions", "Unit Testing Suggestions"),
        ("Docstring/Comment Improvements", "Docstring/Comment Improvements"),
        ("Clarity and Conciseness", "Clarity and Conciseness"),
        ("Summary", "Summary"), 
    ]

    current_markdown_to_parse = markdown_review
    
    start_parsing_from_match = re.search(r"^##\s*\d*\.?\s*Potential Issues and Edge Cases:\s*$", current_markdown_to_parse, re.MULTILINE)
    if not start_parsing_from_match:
        specific_obs_end_idx = 0 
        specific_obs_match = re.search(r"^##\s*\d*\.?\s*Specific Observations and Suggestions:\s*([\s\S]*?)(?=^##\s*\d*\.?\s*[\w\s\(\)\/]+:|\Z)", current_markdown_to_parse, re.MULTILINE | re.DOTALL)
        if specific_obs_match:
            current_markdown_to_parse = current_markdown_to_parse[specific_obs_match.end():].strip()
        else: 
            pass 
    else:
        current_markdown_to_parse = current_markdown_to_parse[start_parsing_from_match.start():].strip()


    for i, (title, header_text) in enumerate(general_section_headers):
        current_header_pattern = r"^##\s*\d*\.?\s*" + re.escape(header_text) + r":\s*$"
        
        start_match = re.search(current_header_pattern, current_markdown_to_parse, re.MULTILINE)
        if not start_match:
            continue 

        section_start_idx = start_match.end() 
        
        section_end_idx = len(current_markdown_to_parse) 
        
        if i + 1 < len(general_section_headers):
            next_header_text = general_section_headers[i+1][1]
            next_header_pattern = r"^##\s*\d*\.?\s*" + re.escape(next_header_text) + r":\s*$"
            next_match = re.search(next_header_pattern, current_markdown_to_parse[section_start_idx:], re.MULTILINE)
            if next_match:
                section_end_idx = section_start_idx + next_match.start()

        content_raw = current_markdown_to_parse[section_start_idx:section_end_idx].strip()
        
        if title == "Summary":
            structured_review['summary'] = content_raw
            structured_review['summary'] = re.sub(r'(`{3,})\s*$', '', structured_review['summary']).strip()

            approval_match = re.search(r"^\s*\*\*(?:Action|Recommended Action|Status):\*\*\s*(Approve|Request Changes|Comment|No action required)", structured_review['summary'], re.IGNORECASE | re.MULTILINE)
            if approval_match:
                structured_review['approval_status'] = approval_match.group(1).strip().replace(' ', '').capitalize()
            else: 
                structured_review['approval_status'] = 'Comment'
        else:
            structured_review['general_sections'].append(ParsedReviewSection(title=title, content=content_raw))
        
        current_markdown_to_parse = current_markdown_to_parse[section_end_idx:].strip()

    if not structured_review['summary']:
        summary_match = re.search(r"^##\s*\d*\.?\s*Summary:\s*([\s\S]*)$", markdown_review, re.MULTILINE | re.DOTALL)
        if summary_match:
            structured_review['summary'] = summary_match.group(1).strip()
            structured_review['summary'] = re.sub(r'(`{3,})\s*$', '', structured_review['summary']).strip()
            
            approval_match = re.search(r"^\s*\*\*(?:Action|Recommended Action|Status):\*\*\s*(Approve|Request Changes|Comment|No action required)", structured_review['summary'], re.IGNORECASE | re.MULTILINE)
            if approval_match:
                structured_review['approval_status'] = approval_match.group(1).strip().replace(' ', '').capitalize()
            else:
                structured_review['approval_status'] = 'Comment'
        else:
            structured_review['summary'] = "Automated review completed." 

    return structured_review

def feedback_formatter_node(state: PRReviewState):
    llm_markdown_review = state.llm_markdown_review

    parsed_llm_review_data = parse_llm_review_markdown(llm_markdown_review)

    # Don't forget to return an updated state, as nodes in LangGraph should always do
    # For this simple example, we'll just return a copy with an updated status
    updated_state = state.model_copy(update={
        "review_status": "review_parsed" ,# Update status after retrieval logic
        "parsed_llm_review_data":parsed_llm_review_data,
    })
    return updated_state


from github import Github, PullRequest
from github.GithubException import GithubException, UnknownObjectException
from github.Commit import Commit # Import Commit type for clarity and correctness
from typing import Dict, Any, List, Optional
import os
import re
import logging

# IMPORTANT: These classes should be imported from src.utils.markdown_parser
# For standalone execution or if import paths are complex, ensure they are correctly defined or imported.
class ParsedComment:
    def __init__(self, message: str, suggestion: Optional[str] = None):
        self.message = message
        self.suggestion = suggestion
    def __repr__(self):
        return f"ParsedComment(msg='{self.message[:50]}...', has_suggestion={self.suggestion is not None})"

class ParsedReviewSection:
    def __init__(self, title: str, content: str):
        self.title = title
        self.content = content
    def __repr__(self):
        return f"ParsedReviewSection(title='{self.title}', content='{self.content[:50]}...')"


# Configure logging (optional, but good practice)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def post_review_comments_on_github(
    repo_name: str,
    pr_id: int,
    parsed_review_data: Dict[str, Any],
    github_token: Optional[str] = None
) -> Dict[str, Any]:
    """
    Posts a structured code review to a GitHub Pull Request.

    Args:
        repo_name (str): The full name of the repository (e.g., "owner/repo").
        pr_id (int): The Pull Request number.
        parsed_review_data (Dict[str, Any]): The structured review data
                                             as returned by parse_llm_review_markdown.
        github_token (str, optional): GitHub Personal Access Token.
                                       If None, tries to read from GITHUB_TOKEN env var.

    Returns:
        Dict[str, Any]: A dictionary containing details of the posted review,
                        e.g., {'status': 'success', 'review_url': '...', 'main_comment_id': ...}.
                        Raises an exception on failure.
    """
    if github_token is None:
        github_token = os.getenv("GITHUB_token_ID")
        if github_token is None:
            logging.error("GitHub token not provided and GITHUB_TOKEN environment variable not set.")
            raise ValueError("GitHub token not provided and GITHUB_TOKEN environment variable not set.")

    try:
        g = Github(github_token)
        repo = g.get_repo(repo_name)
        pr = repo.get_pull(pr_id)
        logging.info(f"Connected to GitHub repo '{repo_name}', PR #{pr_id}.")

        # --- 1. Prepare the Main Review Body ---
        overall_impression = parsed_review_data.get('overall_impression', '')
        general_sections = parsed_review_data.get('general_sections', [])
        summary = parsed_review_data.get('summary', '')
        approval_status = parsed_review_data.get('approval_status', 'COMMENT').upper()

        main_review_body = f"### 🤖 Automated Code Review\n\n"
        
        if overall_impression.strip():
            main_review_body += f"**Overall Impression:**\n{overall_impression}\n\n---\n\n"

        for section in general_sections:
            title_to_add = section.title
            content_to_add = section.content
            
            if content_to_add.strip():
                main_review_body += f"### {title_to_add}\n{content_to_add}\n\n---\n\n"

        if summary.strip():
            main_review_body += f"### Summary\n{summary}\n\n"
            
        main_review_body += f"**Recommended Action:** {approval_status}\n"

        github_event = "COMMENT"
        if approval_status == "APPROVE":
            github_event = "APPROVE"
        elif approval_status == "REQUEST CHANGES":
            github_event = "REQUEST_CHANGES"
            
        logging.info(f"Calculated GitHub review event: {github_event}")

        # --- 2. Prepare Line/File Comments ---
        github_comments = []
        file_comments_data = parsed_review_data.get('file_comments', {})

        head_commit_sha = pr.head.sha
        # FIX: Get the Commit object from the SHA
        pr_commit_obj = repo.get_commit(head_commit_sha) # <--- ADDED THIS LINE
        logging.info(f"Using head commit SHA: {head_commit_sha} (as Commit object)")

        if file_comments_data:
            logging.info(f"Preparing {len(file_comments_data)} file-specific comments.")
            for file_path, functions_data in file_comments_data.items():
                consolidated_file_comment_body = f"### Review for `{file_path}`\n\n"
                
                sorted_func_names = sorted(functions_data.keys(), key=lambda x: (0 if x == "General_File_Comments" else 1, x))

                for func_name in sorted_func_names:
                    comments_for_func = functions_data[func_name]
                    
                    if not comments_for_func:
                        continue

                    if func_name != "General_File_Comments":
                        consolidated_file_comment_body += f"#### ⚙️ Function: `{func_name}`\n\n"
                    else:
                        if len(sorted_func_names) > 1 or (len(sorted_func_names) == 1 and func_name == "General_File_Comments"):
                             consolidated_file_comment_body += f"#### 📄 General File Comments\n\n"

                    for comment in comments_for_func:
                        consolidated_file_comment_body += f"{comment.message}\n"
                        if comment.suggestion:
                            consolidated_file_comment_body += f"\n```suggestion\n{comment.suggestion}\n```\n\n"
                        consolidated_file_comment_body += "\n---\n\n"
                
                if consolidated_file_comment_body.strip() != f"### Review for `{file_path}`":
                    github_comments.append({
                        "path": file_path,
                        "position": 1,
                        "body": consolidated_file_comment_body.strip(),
                    })

        # --- 3. Submit the Review ---
        # Pass the Commit object to the 'commit' parameter
        review = pr.create_review(
            commit=pr_commit_obj, # <--- CHANGED THIS LINE
            body=main_review_body,
            event=github_event,
            comments=github_comments
        )
        
        logging.info(f"Successfully posted GitHub review. URL: {review.html_url}")
        return {
            'status': 'success',
            'review_url': review.html_url,
            'review_id': review.id,
            'main_comment_body': main_review_body
        }

    except UnknownObjectException as e:
        logging.error(f"GitHub object not found (repo or PR): {e}")
        raise ValueError(f"GitHub object not found (repo or PR): {e}")
    except GithubException as e:
        logging.error(f"GitHub API error: {e}")
        raise RuntimeError(f"GitHub API error: {e}")
    except Exception as e:
        logging.critical(f"An unexpected error occurred while posting review: {e}", exc_info=True)
        raise RuntimeError(f"An unexpected error occurred while posting review: {e}")


def post_review_coments_on_github_node(state:PRReviewState):
    repo_name = state.repo_name
    pr_id = state.pr_id
    parsed_llm_review_data = state.parsed_llm_review_data

    result = post_review_comments_on_github(repo_name,pr_id,parsed_llm_review_data, git_hub_token)

    # Don't forget to return an updated state, as nodes in LangGraph should always do
    # For this simple example, we'll just return a copy with an updated status
    updated_state = state.model_copy(update={
        "review_status": "posted", # Update status after retrieval logic
        "review_comment_url":result['review_url'],
        "review_id":result['review_id'],
        "last_error":result['status'] # change this field later
        
    })
    return updated_state


#from IPython.display import Image, display
from langgraph.graph import StateGraph, START, END

# Build graph
builder = StateGraph(PRReviewState)
builder.add_node("code_retriever_node", code_retriever_node)
builder.add_node("code_reviewer_node", code_reviewer_node)
builder.add_node("feedback_formatter_node", feedback_formatter_node)
builder.add_node("post_review_coments_on_github_node", post_review_coments_on_github_node)

# Logic
builder.add_edge(START, "code_retriever_node")
builder.add_edge("code_retriever_node", "code_reviewer_node")
builder.add_edge("code_reviewer_node", "feedback_formatter_node")
builder.add_edge("feedback_formatter_node", "post_review_coments_on_github_node")
builder.add_edge("post_review_coments_on_github_node", END)

# need to fix ParsedComment serializable error
#graph = builder.compile(checkpointer=memory)
graph = builder.compile()


# View
#display(Image(graph.get_graph().draw_mermaid_png()))