Spaces:

mohdelgaar
/

LingConv

Running

App Files Files Community

mohdelgaar committed on Apr 15, 2025

Commit

251ecbb

1 Parent(s): 26ab567

lng indices

Browse files

Files changed (2) hide show

const.py +1 -0
lng/L2SCA/analyzeText.py +87 -35

const.py CHANGED Viewed

@@ -1030,6 +1030,7 @@ used_indices = [
         63, 64, 65, 66, 67, 68, 73, 121, 124, 129, 134, 136, 254,
         257, 258, 261, 263, 272, 274
         ]
 eval_indices = [4,5,6,18,257,272]
 eval_indices = [used_indices.index(idx) for idx in eval_indices]

         63, 64, 65, 66, 67, 68, 73, 121, 124, 129, 134, 136, 254,
         257, 258, 261, 263, 272, 274
         ]
+lftk_used_indices = [1, 7, 8, 9, 10, 11, 12, 17, 65, 68, 73, 78, 80, 198, 201, 202, 205, 207, 216, 218]
 eval_indices = [4,5,6,18,257,272]
 eval_indices = [used_indices.index(idx) for idx in eval_indices]

lng/L2SCA/analyzeText.py CHANGED Viewed

@@ -62,40 +62,94 @@ patternlist=[s,vp,c,t,dc,ct,cp,cn1,cn2,cn3,fc,ft,vp_q]
 pre_path = 'lng/L2SCA'
 #location of the Stanford parser
-parserPath= os.path.join(pre_path, "stanford-parser-full-2014-01-04/lexparser.sh")
 def sca(input_text):
-    inputFile = '/tmp/%s.txt'%next(tempfile._get_candidate_names())
-    with open(inputFile, 'w') as f:
-        f.write(input_text + '\n')
-#extract the name of the file being processed
     output = []
-#name a temporary file to hold the parse trees of the input file
-    parsedFile=inputFile+".parsed"
-#parse the input file
-    command=[parserPath, inputFile]
-    with open(parsedFile, 'w') as f:
-        subprocess.run(command, stdout = f,
-                stderr = subprocess.DEVNULL
-                )
-#list of counts of the patterns
-    patterncount=[]
-#query the parse trees using the tregex patterns
     for pattern in patternlist:
-        command = [os.path.join(pre_path, "tregex.sh"), pattern, parsedFile, "-C", "-o"]
-        out = subprocess.run(command, check = True, stdout = subprocess.PIPE, stderr = subprocess.DEVNULL)
-        if len(out.stdout) > 0:
-            count = int(out.stdout)
-        else:
             count = 0
         patterncount.append(count)
 #update frequencies of complex nominals, clauses, and T-units
     patterncount[7]=patterncount[-4]+patterncount[-5]+patterncount[-6]
     patterncount[2]=patterncount[2]+patterncount[-3]
@@ -103,10 +157,12 @@ def sca(input_text):
     patterncount[1]=patterncount[1]+patterncount[-1]
 #word count
-    infile=open(parsedFile,"r")
-    content=infile.read()
-    w=len(re.findall("\([A-Z]+\$? [^\)\(]+\)",content))
-    infile.close()
 #add frequencies of words and other structures to output string
     output.append(int(w))
@@ -139,8 +195,4 @@ def sca(input_text):
 #list of 24 comma-delimited fields
     # fields="Filename,W,S,VP,C,T,DC,CT,CP,CN,MLS,MLT,MLC,C/S,VP/T,C/T,DC/C,DC/T,T/S,CT/T,CP/T,CP/C,CN/T,CN/C"
-#delete the temporary file holding the parse trees
-    os.remove(inputFile)
-    os.remove(parsedFile)
     return output

 pre_path = 'lng/L2SCA'
 #location of the Stanford parser
+parserPath= os.path.join(pre_path, "stanford-parser-full-2014-01-04")
 def sca(input_text):
     output = []
+#     inputFile = '/tmp/%s.txt'%next(tempfile._get_candidate_names())
+#     with open(inputFile, 'w') as f:
+#         f.write(input_text + '\n')
+# #extract the name of the file being processed
+#     output = []
+# #name a temporary file to hold the parse trees of the input file
+#     parsedFile=inputFile+".parsed"
+# #parse the input file
+#     command=[parserPath, inputFile]
+#     with open(parsedFile, 'w') as f:
+#         subprocess.run(command, stdout = f,
+#                 stderr = subprocess.DEVNULL
+#                 )
+# #list of counts of the patterns
+#     patterncount=[]
+# #query the parse trees using the tregex patterns
+#     for pattern in patternlist:
+#         command = [os.path.join(pre_path, "tregex.sh"), pattern, parsedFile, "-C", "-o"]
+#         out = subprocess.run(command, check = True, stdout = subprocess.PIPE, stderr = subprocess.DEVNULL)
+#         if len(out.stdout) > 0:
+#             count = int(out.stdout)
+#         else:
+#             count = 0
+#         patterncount.append(count)
+    # Parse directly into memory
+    stanford_parser_jar = os.path.join(parserPath, "stanford-parser.jar")
+    stanford_models_jar = os.path.join(parserPath, "stanford-parser-3.3.1-models.jar") #Correct the version
+    command = [
+        "java",
+        "-mx1500m",
+        "-cp",
+        f"{stanford_parser_jar}:{stanford_models_jar}:",
+        "edu.stanford.nlp.parser.lexparser.LexicalizedParser",
+        "-outputFormat", "penn",  # Changed output format to penn
+        "-sentences", "newline",
+        "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
+        "-",  # Read from standard input
+    ]
+    # Parse the input text.
+    process = subprocess.run(command, input=input_text.encode(), stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
+    parsed_output = process.stdout.decode()
+    # Word count
+    w = len(re.findall(r"\([A-Z]+\$? [^\)\(]+\)", parsed_output))
+    # For each tregex pattern in the list, run tregex in filter mode
+    # so that it reads the parsed trees from standard input.
+    patterncount = []
     for pattern in patternlist:
+        tregex_command = [
+            "java",
+            "-mx100m",
+            "-cp",
+            f"{pre_path}/stanford-tregex.jar",
+            "edu.stanford.nlp.trees.tregex.TregexPattern",
+            "-filter",   # Use filter mode: read trees from stdin
+            "-C",        # Suppress printing matches; only outputs the count
+            "-o",        # Report each tree node only once as the root of a match
+            pattern      # Supply the pattern as a command-line argument
+        ]
+        tregex_process = subprocess.run(
+            tregex_command,
+            input=parsed_output.encode(),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.DEVNULL,
+        )
+        tregex_output = tregex_process.stdout.decode().strip()
+        try:
+            count = int(tregex_output)
+        except ValueError:
             count = 0
         patterncount.append(count)
 #update frequencies of complex nominals, clauses, and T-units
     patterncount[7]=patterncount[-4]+patterncount[-5]+patterncount[-6]
     patterncount[2]=patterncount[2]+patterncount[-3]
     patterncount[1]=patterncount[1]+patterncount[-1]
 #word count
+    # infile=open(parsedFile,"r")
+    # content=infile.read()
+    # w=len(re.findall("\([A-Z]+\$? [^\)\(]+\)",content))
+    # infile.close()
+    w = len(re.findall(r"\([A-Z]+\$? [^\)\(]+\)", parsed_output))
 #add frequencies of words and other structures to output string
     output.append(int(w))
 #list of 24 comma-delimited fields
     # fields="Filename,W,S,VP,C,T,DC,CT,CP,CN,MLS,MLT,MLC,C/S,VP/T,C/T,DC/C,DC/T,T/S,CT/T,CP/T,CP/C,CN/T,CN/C"
     return output