Spaces:
Sleeping
Sleeping
Upload 2 files
Browse files- grammar_checker.py +161 -25
- requirements.txt +1 -1
grammar_checker.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
from loguru import logger
|
| 3 |
import json
|
| 4 |
-
import io
|
| 5 |
import tempfile
|
| 6 |
from typing import List, Dict, Any, Annotated, Optional
|
| 7 |
from langchain_openai import AzureChatOpenAI
|
|
@@ -13,23 +12,35 @@ from rich.table import Table
|
|
| 13 |
from rich.box import ROUNDED
|
| 14 |
import re
|
| 15 |
import pandas as pd
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
temperature=0,
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
# Constants for text splitting
|
| 31 |
CHUNK_SIZE = 1000 # Approximate characters per page
|
| 32 |
-
CHUNK_OVERLAP =
|
| 33 |
|
| 34 |
# Common tech terms and proper nouns that should not be flagged as errors
|
| 35 |
DEFAULT_PROPER_NOUNS = """
|
|
@@ -63,11 +74,13 @@ def check_grammar_question(data: Dict[str, Any]) -> Dict[str, str]:
|
|
| 63 |
[("system", system_message), ("user", input_message)]
|
| 64 |
)
|
| 65 |
|
| 66 |
-
class GrammarResult(
|
| 67 |
-
output:
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
wrong_locations:
|
|
|
|
|
|
|
| 71 |
|
| 72 |
chain = prompt | llm.with_structured_output(GrammarResult)
|
| 73 |
result = chain.invoke({"data": data})
|
|
@@ -88,13 +101,18 @@ def check_grammar_qa(
|
|
| 88 |
Dictionary with corrected text for each field
|
| 89 |
"""
|
| 90 |
corrected_dict = {}
|
| 91 |
-
|
| 92 |
# Only process the Question and Answer Options A-D
|
| 93 |
if "Question" in qa_dict and not pd.isna(qa_dict["Question"]):
|
| 94 |
corrected_dict["Question"] = qa_dict["Question"]
|
| 95 |
-
|
| 96 |
# Process answer options
|
| 97 |
-
for option in [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
if option in qa_dict and not pd.isna(qa_dict[option]):
|
| 99 |
corrected_dict[option] = qa_dict[option]
|
| 100 |
|
|
@@ -374,7 +392,9 @@ def check_grammar(text: str, proper_nouns: str = DEFAULT_PROPER_NOUNS) -> Gramma
|
|
| 374 |
chunks = split_text(text)
|
| 375 |
|
| 376 |
# Initialize LangChain with Azure OpenAI
|
| 377 |
-
logger.debug(
|
|
|
|
|
|
|
| 378 |
|
| 379 |
# Create system message for JSON format
|
| 380 |
system_message = """You are a spellchecker database that outputs grammar errors and corrected text in JSON.
|
|
@@ -521,11 +541,127 @@ def display_results(response: Grammar, path: str = "", repo_link: str = "") -> i
|
|
| 521 |
total_errors += 1
|
| 522 |
|
| 523 |
if errors:
|
| 524 |
-
|
| 525 |
else:
|
| 526 |
no_errors_msg = f"No {category} errors found."
|
| 527 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 528 |
|
|
|
|
|
|
|
|
|
|
| 529 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 530 |
|
| 531 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
from loguru import logger
|
| 3 |
import json
|
|
|
|
| 4 |
import tempfile
|
| 5 |
from typing import List, Dict, Any, Annotated, Optional
|
| 6 |
from langchain_openai import AzureChatOpenAI
|
|
|
|
| 12 |
from rich.box import ROUNDED
import re
import pandas as pd
import asyncio
from concurrent.futures import ThreadPoolExecutor
from pydantic import BaseModel, Field
# NOTE: ChatGoogleGenerativeAI was previously imported twice (once via the
# private `.chat_models` path and once via the package root); keep only the
# public package-root import.
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 20 |
+
|
| 21 |
+
# Gemini chat model shared by every grammar-checking chain in this module.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash-001",
    temperature=0,  # deterministic output so corrections are reproducible
    max_tokens=None,  # no cap on response length
    timeout=None,
    max_retries=2,
    # other params...
)
# Previous Azure OpenAI configuration, kept for reference after the
# migration to Google Gemini above.
# AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
# AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
# AZURE_OPENAI_DEPLOYMENT_NAME = os.environ.get("AZURE_OPENAI_DEPLOYMENT")
# AZURE_OPENAI_API_VERSION = os.environ.get("API_VERSION")
# llm = AzureChatOpenAI(
#     temperature=0,
#     api_key=AZURE_OPENAI_API_KEY,
#     azure_endpoint=AZURE_OPENAI_ENDPOINT,
#     azure_deployment=AZURE_OPENAI_DEPLOYMENT_NAME,
#     api_version=AZURE_OPENAI_API_VERSION,
# )
# Constants for text splitting
CHUNK_SIZE = 1000  # Approximate characters per page
CHUNK_OVERLAP = 0  # No overlap between chunks (each chunk is independent)
|
| 44 |
|
| 45 |
# Common tech terms and proper nouns that should not be flagged as errors
|
| 46 |
DEFAULT_PROPER_NOUNS = """
|
|
|
|
| 74 |
[("system", system_message), ("user", input_message)]
|
| 75 |
)
|
| 76 |
|
| 77 |
+
class GrammarResult(BaseModel):
    """Structured output schema for a single grammar-check LLM call."""

    # Corrected text, keyed exactly like the input mapping.
    output: Dict[str, str] = Field(..., description="A dictionary with same keys as the input dictionary.")
    # Short human-readable error summary; None when the text is clean.
    wrong_locations: Optional[str] = Field(None, description="point out errors briefly. Leave blank if there are no errors.")
|
| 84 |
|
| 85 |
chain = prompt | llm.with_structured_output(GrammarResult)
|
| 86 |
result = chain.invoke({"data": data})
|
|
|
|
| 101 |
Dictionary with corrected text for each field
|
| 102 |
"""
|
| 103 |
corrected_dict = {}
|
| 104 |
+
|
| 105 |
# Only process the Question and Answer Options A-D
|
| 106 |
if "Question" in qa_dict and not pd.isna(qa_dict["Question"]):
|
| 107 |
corrected_dict["Question"] = qa_dict["Question"]
|
| 108 |
+
|
| 109 |
# Process answer options
|
| 110 |
+
for option in [
|
| 111 |
+
"Answer Option A",
|
| 112 |
+
"Answer Option B",
|
| 113 |
+
"Answer Option C",
|
| 114 |
+
"Answer Option D",
|
| 115 |
+
]:
|
| 116 |
if option in qa_dict and not pd.isna(qa_dict[option]):
|
| 117 |
corrected_dict[option] = qa_dict[option]
|
| 118 |
|
|
|
|
| 392 |
chunks = split_text(text)
|
| 393 |
|
| 394 |
# Initialize LangChain with Azure OpenAI
|
| 395 |
+
# logger.debug(
|
| 396 |
+
# f"Using Azure OpenAI with deployment: {AZURE_OPENAI_DEPLOYMENT_NAME}"
|
| 397 |
+
# )
|
| 398 |
|
| 399 |
# Create system message for JSON format
|
| 400 |
system_message = """You are a spellchecker database that outputs grammar errors and corrected text in JSON.
|
|
|
|
| 541 |
total_errors += 1
|
| 542 |
|
| 543 |
if errors:
|
| 544 |
+
print(table)
|
| 545 |
else:
|
| 546 |
no_errors_msg = f"No {category} errors found."
|
| 547 |
|
| 548 |
+
return total_errors
|
| 549 |
+
|
| 550 |
+
|
| 551 |
+
def check_grammar_questions_batch(questions: List[Dict[str, Any]], batch_size: int = 5) -> List[Dict[str, Any]]:
    """
    Process multiple questions in batches for grammar checking.

    Args:
        questions: List of question dictionaries to process
        batch_size: Number of questions to process in each batch

    Returns:
        List of processed question dictionaries with grammar corrections,
        each augmented with a "wrong_locations" error summary field.
    """
    # Each element of a batch is sent to the model as its OWN prompt via
    # chain.batch(), so the instructions describe a single question/answer
    # pair and a single output dictionary (the previous wording asked for a
    # list of dictionaries, contradicting the structured-output schema).
    system_message = """
    You are a spellchecker for questions and answers related to IT and programming.
    You will be given a question and answer pair.
    Check the grammar of the question and each answer option.
    Return a dictionary with the same structure as the input, but with corrected text.
    If any fields have no errors, return the original value.
    """

    def process_batch(batch: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        # Run one structured-output LLM call per question, in parallel.
        input_message = """
        Here is a question to check:
        {data}
        """
        prompt = ChatPromptTemplate.from_messages([
            ("system", system_message),
            ("user", input_message)
        ])

        class BatchGrammarResult(BaseModel):
            # Corrected text, keyed like the input question dict.
            output: Dict[str, Any] = Field(
                ..., description="Dictionary with corrected text"
            )
            # Optional for consistency with GrammarResult: the model may
            # legitimately report no errors.
            wrong_locations: Optional[str] = Field(
                None, description="Error descriptions for the question"
            )

        chain = prompt | llm.with_structured_output(BatchGrammarResult)

        # Create prompts for each question in the batch
        prompts = [{"data": question} for question in batch]
        # Full prompt payloads are verbose — keep them at debug level.
        logger.debug(f"prompt {prompts}")
        # Process all questions in parallel using batch
        results = chain.batch(prompts)

        # Extract and combine results
        processed_results = []
        for result in results:
            # pydantic v2 renamed .dict() to .model_dump(); keep a v1 fallback.
            payload = result.model_dump() if hasattr(result, "model_dump") else result.dict()
            processed_results.append({
                **payload["output"],
                "wrong_locations": payload["wrong_locations"]
            })

        return processed_results

    # Preprocess questions to include only relevant fields
    preprocessed_questions = []
    for qa_dict in questions:
        processed_dict = {}

        if "Question" in qa_dict and not pd.isna(qa_dict["Question"]):
            processed_dict["Question"] = qa_dict["Question"]

        for option in ["Answer Option A", "Answer Option B", "Answer Option C", "Answer Option D"]:
            if option in qa_dict and not pd.isna(qa_dict[option]):
                processed_dict[option] = qa_dict[option]

        # Keep original metadata (passed through to the output untouched)
        processed_dict["No."] = qa_dict.get("No.")
        processed_dict["Training content"] = qa_dict.get("Training content")
        processed_dict["Answer"] = qa_dict.get("Answer")

        preprocessed_questions.append(processed_dict)

    # Process questions in batches
    results = []
    total_batches = (len(preprocessed_questions) + batch_size - 1) // batch_size
    logger.info(f"Processing {len(preprocessed_questions)} questions in {total_batches} batches")

    for i in range(0, len(preprocessed_questions), batch_size):
        batch = preprocessed_questions[i:i + batch_size]
        batch_num = (i // batch_size) + 1
        logger.info(f"Processing batch {batch_num}/{total_batches} with {len(batch)} questions")
        batch_results = process_batch(batch)
        results.extend(batch_results)

    return results
|
| 639 |
+
|
| 640 |
|
| 641 |
+
def process_grammar_check(
    input_file: str,
    output_file: str,
    limit: Optional[int] = None,
    batch_size: int = 30,
) -> str:
    """
    Process an Excel file with questions and answers, check grammar, and save the corrected data.

    Args:
        input_file (str): Path to the input Excel file (expects a "Sheet1" sheet)
        output_file (str): Path to save the output Excel file
        limit (int, optional): Limit the number of records to process. If None, process all records.
        batch_size (int): Number of questions sent to the LLM per batch
            (previously hard-coded to 30; same default, now configurable).

    Returns:
        str: Path to the output file
    """
    # Read the input file
    df = pd.read_excel(input_file, sheet_name="Sheet1")
    records = df.to_dict(orient="records")

    if limit is not None:
        records = records[:limit]

    # Process the records in batches
    processed_records = check_grammar_questions_batch(records, batch_size=batch_size)

    # Create a DataFrame from the processed data and write to Excel
    output_df = pd.DataFrame(processed_records)
    output_df.to_excel(output_file, index=False)

    return output_file
|
requirements.txt
CHANGED
|
@@ -10,4 +10,4 @@ langchain_text_splitters
|
|
| 10 |
rich
|
| 11 |
python-docx
|
| 12 |
python-multipart
|
| 13 |
-
|
|
|
|
| 10 |
rich
|
| 11 |
python-docx
|
| 12 |
python-multipart
|
| 13 |
+
langchain-google-genai
|