{"instruction": "Can you tell what is the following Python 3 function doing\ndef train(train_dir, model_save_path=None, n_neighbors=None, knn_algo='ball_tree', verbose=False):\n X = []\n y = []\n\n # Loop through each person in the training set\n for class_dir in os.listdir(train_dir):\n if not os.path.isdir(os.path.join(train_dir, class_dir)):\n continue\n\n # Loop through each training image for the current person\n for img_path in image_files_in_folder(os.path.join(train_dir, class_dir)):\n image = face_recognition.load_image_file(img_path)\n face_bounding_boxes = face_recognition.face_locations(image)\n\n if len(face_bounding_boxes) != 1:\n # If there are no people (or too many people) in a training image, skip the image.\n if verbose:\n print(\"Image {} not suitable for training: {}\".format(img_path, \"Didn't find a face\" if len(face_bounding_boxes) < 1 else \"Found more than one face\"))\n else:\n # Add face encoding for current image to the training set\n X.append(face_recognition.face_encodings(image, known_face_locations=face_bounding_boxes)[0])\n y.append(class_dir)\n\n # Determine how many neighbors to use for weighting in the KNN classifier\n if n_neighbors is None:\n n_neighbors = int(round(math.sqrt(len(X))))\n if verbose:\n print(\"Chose n_neighbors automatically:\", n_neighbors)\n\n # Create and train the KNN classifier\n knn_clf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, algorithm=knn_algo, weights='distance')\n knn_clf.fit(X, y)\n\n # Save the trained KNN classifier\n if model_save_path is not None:\n with open(model_save_path, 'wb') as f:\n pickle.dump(knn_clf, f)\n\n return knn_clf", "response": "Train a k - nearest neighbors classifier for face recognition."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef predict(X_img_path, knn_clf=None, model_path=None, distance_threshold=0.6):\n if not os.path.isfile(X_img_path) or os.path.splitext(X_img_path)[1][1:] not in ALLOWED_EXTENSIONS:\n 
raise Exception(\"Invalid image path: {}\".format(X_img_path))\n\n if knn_clf is None and model_path is None:\n raise Exception(\"Must supply knn classifier either thourgh knn_clf or model_path\")\n\n # Load a trained KNN model (if one was passed in)\n if knn_clf is None:\n with open(model_path, 'rb') as f:\n knn_clf = pickle.load(f)\n\n # Load image file and find face locations\n X_img = face_recognition.load_image_file(X_img_path)\n X_face_locations = face_recognition.face_locations(X_img)\n\n # If no faces are found in the image, return an empty result.\n if len(X_face_locations) == 0:\n return []\n\n # Find encodings for faces in the test iamge\n faces_encodings = face_recognition.face_encodings(X_img, known_face_locations=X_face_locations)\n\n # Use the KNN model to find the best matches for the test face\n closest_distances = knn_clf.kneighbors(faces_encodings, n_neighbors=1)\n are_matches = [closest_distances[0][i][0] <= distance_threshold for i in range(len(X_face_locations))]\n\n # Predict classes and remove classifications that aren't within the threshold\n return [(pred, loc) if rec else (\"unknown\", loc) for pred, loc, rec in zip(knn_clf.predict(faces_encodings), X_face_locations, are_matches)]", "response": "Predicts faces in a given image using a KNN classifier."} {"instruction": "How would you implement a function in Python 3 that\nshows the face recognition results visually.", "response": "def show_prediction_labels_on_image(img_path, predictions):\n \"\"\"\n Shows the face recognition results visually.\n\n :param img_path: path to image to be recognized\n :param predictions: results of the predict function\n :return:\n \"\"\"\n pil_image = Image.open(img_path).convert(\"RGB\")\n draw = ImageDraw.Draw(pil_image)\n\n for name, (top, right, bottom, left) in predictions:\n # Draw a box around the face using the Pillow module\n draw.rectangle(((left, top), (right, bottom)), outline=(0, 0, 255))\n\n # There's a bug in Pillow where it blows up with 
non-UTF-8 text\n # when using the default bitmap font\n name = name.encode(\"UTF-8\")\n\n # Draw a label with a name below the face\n text_width, text_height = draw.textsize(name)\n draw.rectangle(((left, bottom - text_height - 10), (right, bottom)), fill=(0, 0, 255), outline=(0, 0, 255))\n draw.text((left + 6, bottom - text_height - 5), name, fill=(255, 255, 255, 255))\n\n # Remove the drawing library from memory as per the Pillow docs\n del draw\n\n # Display the resulting image\n pil_image.show()"} {"instruction": "Implement a function in Python 3 to\nconvert a dlib rect object to a plain tuple in ( top right bottom left", "response": "def _rect_to_css(rect):\n \"\"\"\n Convert a dlib 'rect' object to a plain tuple in (top, right, bottom, left) order\n\n :param rect: a dlib 'rect' object\n :return: a plain tuple representation of the rect in (top, right, bottom, left) order\n \"\"\"\n return rect.top(), rect.right(), rect.bottom(), rect.left()"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _trim_css_to_bounds(css, image_shape):\n return max(css[0], 0), min(css[1], image_shape[1]), min(css[2], image_shape[0]), max(css[3], 0)", "response": "Trim the given CSS tuple to the bounds of the image."} {"instruction": "Write a Python 3 function for\ngiving a list of face encodings compare them to a known face encoding and get a euclidean distance for each comparison face.", "response": "def face_distance(face_encodings, face_to_compare):\n \"\"\"\n Given a list of face encodings, compare them to a known face encoding and get a euclidean distance\n for each comparison face. 
The distance tells you how similar the faces are.\n\n :param faces: List of face encodings to compare\n :param face_to_compare: A face encoding to compare against\n :return: A numpy ndarray with the distance for each face in the same order as the 'faces' array\n \"\"\"\n if len(face_encodings) == 0:\n return np.empty((0))\n\n return np.linalg.norm(face_encodings - face_to_compare, axis=1)"} {"instruction": "Write a Python 3 script for\nloading an image file into a numpy array.", "response": "def load_image_file(file, mode='RGB'):\n \"\"\"\n Loads an image file (.jpg, .png, etc) into a numpy array\n\n :param file: image file name or file object to load\n :param mode: format to convert the image to. Only 'RGB' (8-bit RGB, 3 channels) and 'L' (black and white) are supported.\n :return: image contents as numpy array\n \"\"\"\n im = PIL.Image.open(file)\n if mode:\n im = im.convert(mode)\n return np.array(im)"} {"instruction": "How would you implement a function in Python 3 that\nreturns an array of bounding boxes of human faces in a image.", "response": "def _raw_face_locations(img, number_of_times_to_upsample=1, model=\"hog\"):\n \"\"\"\n Returns an array of bounding boxes of human faces in a image\n\n :param img: An image (as a numpy array)\n :param number_of_times_to_upsample: How many times to upsample the image looking for faces. Higher numbers find smaller faces.\n :param model: Which face detection model to use. \"hog\" is less accurate but faster on CPUs. \"cnn\" is a more accurate\n deep-learning model which is GPU/CUDA accelerated (if available). 
The default is \"hog\".\n :return: A list of dlib 'rect' objects of found face locations\n \"\"\"\n if model == \"cnn\":\n return cnn_face_detector(img, number_of_times_to_upsample)\n else:\n return face_detector(img, number_of_times_to_upsample)"} {"instruction": "Implement a function in Python 3 to\nreturn an array of bounding boxes of human faces in a image.", "response": "def face_locations(img, number_of_times_to_upsample=1, model=\"hog\"):\n \"\"\"\n Returns an array of bounding boxes of human faces in a image\n\n :param img: An image (as a numpy array)\n :param number_of_times_to_upsample: How many times to upsample the image looking for faces. Higher numbers find smaller faces.\n :param model: Which face detection model to use. \"hog\" is less accurate but faster on CPUs. \"cnn\" is a more accurate\n deep-learning model which is GPU/CUDA accelerated (if available). The default is \"hog\".\n :return: A list of tuples of found face locations in css (top, right, bottom, left) order\n \"\"\"\n if model == \"cnn\":\n return [_trim_css_to_bounds(_rect_to_css(face.rect), img.shape) for face in _raw_face_locations(img, number_of_times_to_upsample, \"cnn\")]\n else:\n return [_trim_css_to_bounds(_rect_to_css(face), img.shape) for face in _raw_face_locations(img, number_of_times_to_upsample, model)]"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef batch_face_locations(images, number_of_times_to_upsample=1, batch_size=128):\n def convert_cnn_detections_to_css(detections):\n return [_trim_css_to_bounds(_rect_to_css(face.rect), images[0].shape) for face in detections]\n\n raw_detections_batched = _raw_face_locations_batched(images, number_of_times_to_upsample, batch_size)\n\n return list(map(convert_cnn_detections_to_css, raw_detections_batched))", "response": "Returns a 2d array of bounding boxes of human faces in a given image using the cnn face detectors."} {"instruction": "How would you code a function in Python 3 
to\ngive an image returns a dict of face feature locations", "response": "def face_landmarks(face_image, face_locations=None, model=\"large\"):\n \"\"\"\n Given an image, returns a dict of face feature locations (eyes, nose, etc) for each face in the image\n\n :param face_image: image to search\n :param face_locations: Optionally provide a list of face locations to check.\n :param model: Optional - which model to use. \"large\" (default) or \"small\" which only returns 5 points but is faster.\n :return: A list of dicts of face feature locations (eyes, nose, etc)\n \"\"\"\n landmarks = _raw_face_landmarks(face_image, face_locations, model)\n landmarks_as_tuples = [[(p.x, p.y) for p in landmark.parts()] for landmark in landmarks]\n\n # For a definition of each point index, see https://cdn-images-1.medium.com/max/1600/1*AbEg31EgkbXSQehuNJBlWg.png\n if model == 'large':\n return [{\n \"chin\": points[0:17],\n \"left_eyebrow\": points[17:22],\n \"right_eyebrow\": points[22:27],\n \"nose_bridge\": points[27:31],\n \"nose_tip\": points[31:36],\n \"left_eye\": points[36:42],\n \"right_eye\": points[42:48],\n \"top_lip\": points[48:55] + [points[64]] + [points[63]] + [points[62]] + [points[61]] + [points[60]],\n \"bottom_lip\": points[54:60] + [points[48]] + [points[60]] + [points[67]] + [points[66]] + [points[65]] + [points[64]]\n } for points in landmarks_as_tuples]\n elif model == 'small':\n return [{\n \"nose_tip\": [points[4]],\n \"left_eye\": points[2:4],\n \"right_eye\": points[0:2],\n } for points in landmarks_as_tuples]\n else:\n raise ValueError(\"Invalid landmarks model type. 
Supported models are ['small', 'large'].\")"} {"instruction": "Make a summary of the following Python 3 code\ndef face_encodings(face_image, known_face_locations=None, num_jitters=1):\n raw_landmarks = _raw_face_landmarks(face_image, known_face_locations, model=\"small\")\n return [np.array(face_encoder.compute_face_descriptor(face_image, raw_landmark_set, num_jitters)) for raw_landmark_set in raw_landmarks]", "response": "Given an image returns the 128 - dimensional face encoding for each face in the image."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _parse_datatype_string(s):\n sc = SparkContext._active_spark_context\n\n def from_ddl_schema(type_str):\n return _parse_datatype_json_string(\n sc._jvm.org.apache.spark.sql.types.StructType.fromDDL(type_str).json())\n\n def from_ddl_datatype(type_str):\n return _parse_datatype_json_string(\n sc._jvm.org.apache.spark.sql.api.python.PythonSQLUtils.parseDataType(type_str).json())\n\n try:\n # DDL format, \"fieldname datatype, fieldname datatype\".\n return from_ddl_schema(s)\n except Exception as e:\n try:\n # For backwards compatibility, \"integer\", \"struct\" and etc.\n return from_ddl_datatype(s)\n except:\n try:\n # For backwards compatibility, \"fieldname: datatype, fieldname: datatype\" case.\n return from_ddl_datatype(\"struct<%s>\" % s.strip())\n except:\n raise e", "response": "Parses a string into a base - level structure type."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _int_size_to_type(size):\n if size <= 8:\n return ByteType\n if size <= 16:\n return ShortType\n if size <= 32:\n return IntegerType\n if size <= 64:\n return LongType", "response": "Return the Catalyst datatype from the size of integers."} {"instruction": "Create a Python 3 function for\ninferring the DataType from obj", "response": "def _infer_type(obj):\n \"\"\"Infer the DataType from obj\n \"\"\"\n if obj is None:\n return NullType()\n\n if 
hasattr(obj, '__UDT__'):\n return obj.__UDT__\n\n dataType = _type_mappings.get(type(obj))\n if dataType is DecimalType:\n # the precision and scale of `obj` may be different from row to row.\n return DecimalType(38, 18)\n elif dataType is not None:\n return dataType()\n\n if isinstance(obj, dict):\n for key, value in obj.items():\n if key is not None and value is not None:\n return MapType(_infer_type(key), _infer_type(value), True)\n return MapType(NullType(), NullType(), True)\n elif isinstance(obj, list):\n for v in obj:\n if v is not None:\n return ArrayType(_infer_type(obj[0]), True)\n return ArrayType(NullType(), True)\n elif isinstance(obj, array):\n if obj.typecode in _array_type_mappings:\n return ArrayType(_array_type_mappings[obj.typecode](), False)\n else:\n raise TypeError(\"not supported type: array(%s)\" % obj.typecode)\n else:\n try:\n return _infer_schema(obj)\n except TypeError:\n raise TypeError(\"not supported type: %s\" % type(obj))"} {"instruction": "Write a Python 3 function for\ninferring the schema from dict namedtuple or object", "response": "def _infer_schema(row, names=None):\n \"\"\"Infer the schema from dict/namedtuple/object\"\"\"\n if isinstance(row, dict):\n items = sorted(row.items())\n\n elif isinstance(row, (tuple, list)):\n if hasattr(row, \"__fields__\"): # Row\n items = zip(row.__fields__, tuple(row))\n elif hasattr(row, \"_fields\"): # namedtuple\n items = zip(row._fields, tuple(row))\n else:\n if names is None:\n names = ['_%d' % i for i in range(1, len(row) + 1)]\n elif len(names) < len(row):\n names.extend('_%d' % i for i in range(len(names) + 1, len(row) + 1))\n items = zip(names, row)\n\n elif hasattr(row, \"__dict__\"): # object\n items = sorted(row.__dict__.items())\n\n else:\n raise TypeError(\"Can not infer schema for type: %s\" % type(row))\n\n fields = [StructField(k, _infer_type(v), True) for k, v in items]\n return StructType(fields)"} {"instruction": "Implement a Python 3 function for\nreturning whether there 
is NullType in dt", "response": "def _has_nulltype(dt):\n \"\"\" Return whether there is NullType in `dt` or not \"\"\"\n if isinstance(dt, StructType):\n return any(_has_nulltype(f.dataType) for f in dt.fields)\n elif isinstance(dt, ArrayType):\n return _has_nulltype((dt.elementType))\n elif isinstance(dt, MapType):\n return _has_nulltype(dt.keyType) or _has_nulltype(dt.valueType)\n else:\n return isinstance(dt, NullType)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _create_converter(dataType):\n if not _need_converter(dataType):\n return lambda x: x\n\n if isinstance(dataType, ArrayType):\n conv = _create_converter(dataType.elementType)\n return lambda row: [conv(v) for v in row]\n\n elif isinstance(dataType, MapType):\n kconv = _create_converter(dataType.keyType)\n vconv = _create_converter(dataType.valueType)\n return lambda row: dict((kconv(k), vconv(v)) for k, v in row.items())\n\n elif isinstance(dataType, NullType):\n return lambda x: None\n\n elif not isinstance(dataType, StructType):\n return lambda x: x\n\n # dataType must be StructType\n names = [f.name for f in dataType.fields]\n converters = [_create_converter(f.dataType) for f in dataType.fields]\n convert_fields = any(_need_converter(f.dataType) for f in dataType.fields)\n\n def convert_struct(obj):\n if obj is None:\n return\n\n if isinstance(obj, (tuple, list)):\n if convert_fields:\n return tuple(conv(v) for v, conv in zip(obj, converters))\n else:\n return tuple(obj)\n\n if isinstance(obj, dict):\n d = obj\n elif hasattr(obj, \"__dict__\"): # object\n d = obj.__dict__\n else:\n raise TypeError(\"Unexpected obj type: %s\" % type(obj))\n\n if convert_fields:\n return tuple([conv(d.get(name)) for name, conv in zip(names, converters)])\n else:\n return tuple([d.get(name) for name in names])\n\n return convert_struct", "response": "Create a converter to drop the names of fields in obj"} {"instruction": "Explain what the following Python 3 code does\ndef 
_make_type_verifier(dataType, nullable=True, name=None):\n\n if name is None:\n new_msg = lambda msg: msg\n new_name = lambda n: \"field %s\" % n\n else:\n new_msg = lambda msg: \"%s: %s\" % (name, msg)\n new_name = lambda n: \"field %s in %s\" % (n, name)\n\n def verify_nullability(obj):\n if obj is None:\n if nullable:\n return True\n else:\n raise ValueError(new_msg(\"This field is not nullable, but got None\"))\n else:\n return False\n\n _type = type(dataType)\n\n def assert_acceptable_types(obj):\n assert _type in _acceptable_types, \\\n new_msg(\"unknown datatype: %s for object %r\" % (dataType, obj))\n\n def verify_acceptable_types(obj):\n # subclass of them can not be fromInternal in JVM\n if type(obj) not in _acceptable_types[_type]:\n raise TypeError(new_msg(\"%s can not accept object %r in type %s\"\n % (dataType, obj, type(obj))))\n\n if isinstance(dataType, StringType):\n # StringType can work with any types\n verify_value = lambda _: _\n\n elif isinstance(dataType, UserDefinedType):\n verifier = _make_type_verifier(dataType.sqlType(), name=name)\n\n def verify_udf(obj):\n if not (hasattr(obj, '__UDT__') and obj.__UDT__ == dataType):\n raise ValueError(new_msg(\"%r is not an instance of type %r\" % (obj, dataType)))\n verifier(dataType.toInternal(obj))\n\n verify_value = verify_udf\n\n elif isinstance(dataType, ByteType):\n def verify_byte(obj):\n assert_acceptable_types(obj)\n verify_acceptable_types(obj)\n if obj < -128 or obj > 127:\n raise ValueError(new_msg(\"object of ByteType out of range, got: %s\" % obj))\n\n verify_value = verify_byte\n\n elif isinstance(dataType, ShortType):\n def verify_short(obj):\n assert_acceptable_types(obj)\n verify_acceptable_types(obj)\n if obj < -32768 or obj > 32767:\n raise ValueError(new_msg(\"object of ShortType out of range, got: %s\" % obj))\n\n verify_value = verify_short\n\n elif isinstance(dataType, IntegerType):\n def verify_integer(obj):\n assert_acceptable_types(obj)\n verify_acceptable_types(obj)\n if 
obj < -2147483648 or obj > 2147483647:\n raise ValueError(\n new_msg(\"object of IntegerType out of range, got: %s\" % obj))\n\n verify_value = verify_integer\n\n elif isinstance(dataType, ArrayType):\n element_verifier = _make_type_verifier(\n dataType.elementType, dataType.containsNull, name=\"element in array %s\" % name)\n\n def verify_array(obj):\n assert_acceptable_types(obj)\n verify_acceptable_types(obj)\n for i in obj:\n element_verifier(i)\n\n verify_value = verify_array\n\n elif isinstance(dataType, MapType):\n key_verifier = _make_type_verifier(dataType.keyType, False, name=\"key of map %s\" % name)\n value_verifier = _make_type_verifier(\n dataType.valueType, dataType.valueContainsNull, name=\"value of map %s\" % name)\n\n def verify_map(obj):\n assert_acceptable_types(obj)\n verify_acceptable_types(obj)\n for k, v in obj.items():\n key_verifier(k)\n value_verifier(v)\n\n verify_value = verify_map\n\n elif isinstance(dataType, StructType):\n verifiers = []\n for f in dataType.fields:\n verifier = _make_type_verifier(f.dataType, f.nullable, name=new_name(f.name))\n verifiers.append((f.name, verifier))\n\n def verify_struct(obj):\n assert_acceptable_types(obj)\n\n if isinstance(obj, dict):\n for f, verifier in verifiers:\n verifier(obj.get(f))\n elif isinstance(obj, Row) and getattr(obj, \"__from_dict__\", False):\n # the order in obj could be different than dataType.fields\n for f, verifier in verifiers:\n verifier(obj[f])\n elif isinstance(obj, (tuple, list)):\n if len(obj) != len(verifiers):\n raise ValueError(\n new_msg(\"Length of object (%d) does not match with \"\n \"length of fields (%d)\" % (len(obj), len(verifiers))))\n for v, (_, verifier) in zip(obj, verifiers):\n verifier(v)\n elif hasattr(obj, \"__dict__\"):\n d = obj.__dict__\n for f, verifier in verifiers:\n verifier(d.get(f))\n else:\n raise TypeError(new_msg(\"StructType can not accept object %r in type %s\"\n % (obj, type(obj))))\n verify_value = verify_struct\n\n else:\n def 
verify_default(obj):\n assert_acceptable_types(obj)\n verify_acceptable_types(obj)\n\n verify_value = verify_default\n\n def verify(obj):\n if not verify_nullability(obj):\n verify_value(obj)\n\n return verify", "response": "Returns a verifier that checks the type of obj against dataType and raises a TypeError if they do not match."} {"instruction": "Write a Python 3 function that can\nconvert Spark data type to Arrow type", "response": "def to_arrow_type(dt):\n \"\"\" Convert Spark data type to pyarrow type\n \"\"\"\n import pyarrow as pa\n if type(dt) == BooleanType:\n arrow_type = pa.bool_()\n elif type(dt) == ByteType:\n arrow_type = pa.int8()\n elif type(dt) == ShortType:\n arrow_type = pa.int16()\n elif type(dt) == IntegerType:\n arrow_type = pa.int32()\n elif type(dt) == LongType:\n arrow_type = pa.int64()\n elif type(dt) == FloatType:\n arrow_type = pa.float32()\n elif type(dt) == DoubleType:\n arrow_type = pa.float64()\n elif type(dt) == DecimalType:\n arrow_type = pa.decimal128(dt.precision, dt.scale)\n elif type(dt) == StringType:\n arrow_type = pa.string()\n elif type(dt) == BinaryType:\n arrow_type = pa.binary()\n elif type(dt) == DateType:\n arrow_type = pa.date32()\n elif type(dt) == TimestampType:\n # Timestamps should be in UTC, JVM Arrow timestamps require a timezone to be read\n arrow_type = pa.timestamp('us', tz='UTC')\n elif type(dt) == ArrayType:\n if type(dt.elementType) in [StructType, TimestampType]:\n raise TypeError(\"Unsupported type in conversion to Arrow: \" + str(dt))\n arrow_type = pa.list_(to_arrow_type(dt.elementType))\n elif type(dt) == StructType:\n if any(type(field.dataType) == StructType for field in dt):\n raise TypeError(\"Nested StructType not supported in conversion to Arrow\")\n fields = [pa.field(field.name, to_arrow_type(field.dataType), nullable=field.nullable)\n for field in dt]\n arrow_type = pa.struct(fields)\n else:\n raise TypeError(\"Unsupported type in conversion to Arrow: \" + str(dt))\n return arrow_type"} 
{"instruction": "Make a summary of the following Python 3 code\ndef to_arrow_schema(schema):\n import pyarrow as pa\n fields = [pa.field(field.name, to_arrow_type(field.dataType), nullable=field.nullable)\n for field in schema]\n return pa.schema(fields)", "response": "Convert a Spark schema from Spark to Arrow\n "} {"instruction": "Write a Python 3 script to\nconvert a pyarrow type to Spark data type.", "response": "def from_arrow_type(at):\n \"\"\" Convert pyarrow type to Spark data type.\n \"\"\"\n import pyarrow.types as types\n if types.is_boolean(at):\n spark_type = BooleanType()\n elif types.is_int8(at):\n spark_type = ByteType()\n elif types.is_int16(at):\n spark_type = ShortType()\n elif types.is_int32(at):\n spark_type = IntegerType()\n elif types.is_int64(at):\n spark_type = LongType()\n elif types.is_float32(at):\n spark_type = FloatType()\n elif types.is_float64(at):\n spark_type = DoubleType()\n elif types.is_decimal(at):\n spark_type = DecimalType(precision=at.precision, scale=at.scale)\n elif types.is_string(at):\n spark_type = StringType()\n elif types.is_binary(at):\n spark_type = BinaryType()\n elif types.is_date32(at):\n spark_type = DateType()\n elif types.is_timestamp(at):\n spark_type = TimestampType()\n elif types.is_list(at):\n if types.is_timestamp(at.value_type):\n raise TypeError(\"Unsupported type in conversion from Arrow: \" + str(at))\n spark_type = ArrayType(from_arrow_type(at.value_type))\n elif types.is_struct(at):\n if any(types.is_struct(field.type) for field in at):\n raise TypeError(\"Nested StructType not supported in conversion from Arrow: \" + str(at))\n return StructType(\n [StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)\n for field in at])\n else:\n raise TypeError(\"Unsupported type in conversion from Arrow: \" + str(at))\n return spark_type"} {"instruction": "Create a Python 3 function for\nconverting schema from Arrow to Spark.", "response": "def from_arrow_schema(arrow_schema):\n \"\"\" 
Convert schema from Arrow to Spark.\n \"\"\"\n return StructType(\n [StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)\n for field in arrow_schema])"} {"instruction": "Create a Python 3 function for\nconverting timezone aware timestamps to timezone - naive in the specified timezone or local timezone.", "response": "def _check_series_localize_timestamps(s, timezone):\n \"\"\"\n Convert timezone aware timestamps to timezone-naive in the specified timezone or local timezone.\n\n If the input series is not a timestamp series, then the same series is returned. If the input\n series is a timestamp series, then a converted series is returned.\n\n :param s: pandas.Series\n :param timezone: the timezone to convert. if None then use local timezone\n :return pandas.Series that have been converted to tz-naive\n \"\"\"\n from pyspark.sql.utils import require_minimum_pandas_version\n require_minimum_pandas_version()\n\n from pandas.api.types import is_datetime64tz_dtype\n tz = timezone or _get_local_timezone()\n # TODO: handle nested timestamps, such as ArrayType(TimestampType())?\n if is_datetime64tz_dtype(s.dtype):\n return s.dt.tz_convert(tz).dt.tz_localize(None)\n else:\n return s"} {"instruction": "Make a summary of the following Python 3 code\ndef _check_dataframe_localize_timestamps(pdf, timezone):\n from pyspark.sql.utils import require_minimum_pandas_version\n require_minimum_pandas_version()\n\n for column, series in pdf.iteritems():\n pdf[column] = _check_series_localize_timestamps(series, timezone)\n return pdf", "response": "Convert timezone aware timestamps to timezone - naive in the specified timezone or local timezone - naive in the specified timezone or local timezone - naive in the specified timezone."} {"instruction": "Write a Python 3 function that can\nconvert a tz - naive timestamp in the specified timezone or local timezone to UTC normalized for Spark internal storage.", "response": "def 
_check_series_convert_timestamps_internal(s, timezone):\n \"\"\"\n Convert a tz-naive timestamp in the specified timezone or local timezone to UTC normalized for\n Spark internal storage\n\n :param s: a pandas.Series\n :param timezone: the timezone to convert. if None then use local timezone\n :return pandas.Series where if it is a timestamp, has been UTC normalized without a time zone\n \"\"\"\n from pyspark.sql.utils import require_minimum_pandas_version\n require_minimum_pandas_version()\n\n from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype\n # TODO: handle nested timestamps, such as ArrayType(TimestampType())?\n if is_datetime64_dtype(s.dtype):\n # When tz_localize a tz-naive timestamp, the result is ambiguous if the tz-naive\n # timestamp is during the hour when the clock is adjusted backward during due to\n # daylight saving time (dst).\n # E.g., for America/New_York, the clock is adjusted backward on 2015-11-01 2:00 to\n # 2015-11-01 1:00 from dst-time to standard time, and therefore, when tz_localize\n # a tz-naive timestamp 2015-11-01 1:30 with America/New_York timezone, it can be either\n # dst time (2015-01-01 1:30-0400) or standard time (2015-11-01 1:30-0500).\n #\n # Here we explicit choose to use standard time. 
This matches the default behavior of\n # pytz.\n #\n # Here are some code to help understand this behavior:\n # >>> import datetime\n # >>> import pandas as pd\n # >>> import pytz\n # >>>\n # >>> t = datetime.datetime(2015, 11, 1, 1, 30)\n # >>> ts = pd.Series([t])\n # >>> tz = pytz.timezone('America/New_York')\n # >>>\n # >>> ts.dt.tz_localize(tz, ambiguous=True)\n # 0 2015-11-01 01:30:00-04:00\n # dtype: datetime64[ns, America/New_York]\n # >>>\n # >>> ts.dt.tz_localize(tz, ambiguous=False)\n # 0 2015-11-01 01:30:00-05:00\n # dtype: datetime64[ns, America/New_York]\n # >>>\n # >>> str(tz.localize(t))\n # '2015-11-01 01:30:00-05:00'\n tz = timezone or _get_local_timezone()\n return s.dt.tz_localize(tz, ambiguous=False).dt.tz_convert('UTC')\n elif is_datetime64tz_dtype(s.dtype):\n return s.dt.tz_convert('UTC')\n else:\n return s"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _check_series_convert_timestamps_localize(s, from_timezone, to_timezone):\n from pyspark.sql.utils import require_minimum_pandas_version\n require_minimum_pandas_version()\n\n import pandas as pd\n from pandas.api.types import is_datetime64tz_dtype, is_datetime64_dtype\n from_tz = from_timezone or _get_local_timezone()\n to_tz = to_timezone or _get_local_timezone()\n # TODO: handle nested timestamps, such as ArrayType(TimestampType())?\n if is_datetime64tz_dtype(s.dtype):\n return s.dt.tz_convert(to_tz).dt.tz_localize(None)\n elif is_datetime64_dtype(s.dtype) and from_tz != to_tz:\n # `s.dt.tz_localize('tzlocal()')` doesn't work properly when including NaT.\n return s.apply(\n lambda ts: ts.tz_localize(from_tz, ambiguous=False).tz_convert(to_tz).tz_localize(None)\n if ts is not pd.NaT else pd.NaT)\n else:\n return s", "response": "Convert timestamp to timezone - naive in the specified timezone or local timezone."} {"instruction": "Can you implement a function in Python 3 that\nconstructs a new StructType object by adding new elements to the list of 
fields.", "response": "def add(self, field, data_type=None, nullable=True, metadata=None):\n \"\"\"\n Construct a StructType by adding new elements to it to define the schema. The method accepts\n either:\n\n a) A single parameter which is a StructField object.\n b) Between 2 and 4 parameters as (name, data_type, nullable (optional),\n metadata(optional). The data_type parameter may be either a String or a\n DataType object.\n\n >>> struct1 = StructType().add(\"f1\", StringType(), True).add(\"f2\", StringType(), True, None)\n >>> struct2 = StructType([StructField(\"f1\", StringType(), True), \\\\\n ... StructField(\"f2\", StringType(), True, None)])\n >>> struct1 == struct2\n True\n >>> struct1 = StructType().add(StructField(\"f1\", StringType(), True))\n >>> struct2 = StructType([StructField(\"f1\", StringType(), True)])\n >>> struct1 == struct2\n True\n >>> struct1 = StructType().add(\"f1\", \"string\", True)\n >>> struct2 = StructType([StructField(\"f1\", StringType(), True)])\n >>> struct1 == struct2\n True\n\n :param field: Either the name of the field or a StructField object\n :param data_type: If present, the DataType of the StructField to create\n :param nullable: Whether the field to add should be nullable (default True)\n :param metadata: Any additional metadata (default None)\n :return: a new updated StructType\n \"\"\"\n if isinstance(field, StructField):\n self.fields.append(field)\n self.names.append(field.name)\n else:\n if isinstance(field, str) and data_type is None:\n raise ValueError(\"Must specify DataType if passing name of struct_field to create.\")\n\n if isinstance(data_type, str):\n data_type_f = _parse_datatype_json_value(data_type)\n else:\n data_type_f = data_type\n self.fields.append(StructField(field, data_type_f, nullable, metadata))\n self.names.append(field)\n # Precalculated list of fields that need conversion with fromInternal/toInternal functions\n self._needConversion = [f.needConversion() for f in self]\n 
self._needSerializeAnyField = any(self._needConversion)\n return self"} {"instruction": "Explain what the following Python 3 code does\ndef _cachedSqlType(cls):\n if not hasattr(cls, \"_cached_sql_type\"):\n cls._cached_sql_type = cls.sqlType()\n return cls._cached_sql_type", "response": "Cache the sqlType into class because it s heavy used in toInternal."} {"instruction": "Can you write a function in Python 3 where it\nconverts the table into a dict.", "response": "def asDict(self, recursive=False):\n \"\"\"\n Return as an dict\n\n :param recursive: turns the nested Row as dict (default: False).\n\n >>> Row(name=\"Alice\", age=11).asDict() == {'name': 'Alice', 'age': 11}\n True\n >>> row = Row(key=1, value=Row(name='a', age=2))\n >>> row.asDict() == {'key': 1, 'value': Row(age=2, name='a')}\n True\n >>> row.asDict(True) == {'key': 1, 'value': {'name': 'a', 'age': 2}}\n True\n \"\"\"\n if not hasattr(self, \"__fields__\"):\n raise TypeError(\"Cannot convert a Row class into dict\")\n\n if recursive:\n def conv(obj):\n if isinstance(obj, Row):\n return obj.asDict(True)\n elif isinstance(obj, list):\n return [conv(o) for o in obj]\n elif isinstance(obj, dict):\n return dict((k, conv(v)) for k, v in obj.items())\n else:\n return obj\n return dict(zip(self.__fields__, (conv(o) for o in self)))\n else:\n return dict(zip(self.__fields__, self))"} {"instruction": "Can you write a function in Python 3 where it\nreturns the summary of the LinearRegressionModel.", "response": "def summary(self):\n \"\"\"\n Gets summary (e.g. residuals, mse, r-squared ) of model on\n training set. 
An exception is thrown if\n `trainingSummary is None`.\n \"\"\"\n if self.hasSummary:\n return LinearRegressionTrainingSummary(super(LinearRegressionModel, self).summary)\n else:\n raise RuntimeError(\"No training summary available for this %s\" %\n self.__class__.__name__)"} {"instruction": "How would you code a function in Python 3 to\nevaluate the model on a test dataset.", "response": "def evaluate(self, dataset):\n \"\"\"\n Evaluates the model on a test dataset.\n\n :param dataset:\n Test dataset to evaluate model on, where dataset is an\n instance of :py:class:`pyspark.sql.DataFrame`\n \"\"\"\n if not isinstance(dataset, DataFrame):\n raise ValueError(\"dataset must be a DataFrame but got %s.\" % type(dataset))\n java_lr_summary = self._call_java(\"evaluate\", dataset)\n return LinearRegressionSummary(java_lr_summary)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef summary(self):\n if self.hasSummary:\n return GeneralizedLinearRegressionTrainingSummary(\n super(GeneralizedLinearRegressionModel, self).summary)\n else:\n raise RuntimeError(\"No training summary available for this %s\" %\n self.__class__.__name__)", "response": "Returns a GeneralizedLinearRegressionTrainingSummary object for this training set."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef evaluate(self, dataset):\n if not isinstance(dataset, DataFrame):\n raise ValueError(\"dataset must be a DataFrame but got %s.\" % type(dataset))\n java_glr_summary = self._call_java(\"evaluate\", dataset)\n return GeneralizedLinearRegressionSummary(java_glr_summary)", "response": "Evaluates the model on a test dataset."} {"instruction": "Write a Python 3 script to\nget all the directories that are local", "response": "def _get_local_dirs(sub):\n \"\"\" Get all the directories \"\"\"\n path = os.environ.get(\"SPARK_LOCAL_DIRS\", \"/tmp\")\n dirs = path.split(\",\")\n if len(dirs) > 1:\n # different order in different processes and 
instances\n rnd = random.Random(os.getpid() + id(dirs))\n random.shuffle(dirs, rnd.random)\n return [os.path.join(d, \"python\", str(os.getpid()), sub) for d in dirs]"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _get_spill_dir(self, n):\n return os.path.join(self.localdirs[n % len(self.localdirs)], str(n))", "response": "Choose one directory for spill by number n"} {"instruction": "How would you explain what the following Python 3 function does\ndef mergeValues(self, iterator):\n # speedup attribute lookup\n creator, comb = self.agg.createCombiner, self.agg.mergeValue\n c, data, pdata, hfun, batch = 0, self.data, self.pdata, self._partition, self.batch\n limit = self.memory_limit\n\n for k, v in iterator:\n d = pdata[hfun(k)] if pdata else data\n d[k] = comb(d[k], v) if k in d else creator(v)\n\n c += 1\n if c >= batch:\n if get_used_memory() >= limit:\n self._spill()\n limit = self._next_limit()\n batch /= 2\n c = 0\n else:\n batch *= 1.5\n\n if get_used_memory() >= limit:\n self._spill()", "response": "Combine the items by creator and combiner"} {"instruction": "Given the following Python 3 function, write the documentation\ndef mergeCombiners(self, iterator, limit=None):\n if limit is None:\n limit = self.memory_limit\n # speedup attribute lookup\n comb, hfun, objsize = self.agg.mergeCombiners, self._partition, self._object_size\n c, data, pdata, batch = 0, self.data, self.pdata, self.batch\n for k, v in iterator:\n d = pdata[hfun(k)] if pdata else data\n d[k] = comb(d[k], v) if k in d else v\n if not limit:\n continue\n\n c += objsize(v)\n if c > batch:\n if get_used_memory() > limit:\n self._spill()\n limit = self._next_limit()\n batch /= 2\n c = 0\n else:\n batch *= 1.5\n\n if limit and get_used_memory() >= limit:\n self._spill()", "response": "Merge a set of keys and values by merging them into a single object."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _spill(self):\n 
global MemoryBytesSpilled, DiskBytesSpilled\n path = self._get_spill_dir(self.spills)\n if not os.path.exists(path):\n os.makedirs(path)\n\n used_memory = get_used_memory()\n if not self.pdata:\n # The data has not been partitioned, it will iterator the\n # dataset once, write them into different files, has no\n # additional memory. It only called when the memory goes\n # above limit at the first time.\n\n # open all the files for writing\n streams = [open(os.path.join(path, str(i)), 'wb')\n for i in range(self.partitions)]\n\n for k, v in self.data.items():\n h = self._partition(k)\n # put one item in batch, make it compatible with load_stream\n # it will increase the memory if dump them in batch\n self.serializer.dump_stream([(k, v)], streams[h])\n\n for s in streams:\n DiskBytesSpilled += s.tell()\n s.close()\n\n self.data.clear()\n self.pdata.extend([{} for i in range(self.partitions)])\n\n else:\n for i in range(self.partitions):\n p = os.path.join(path, str(i))\n with open(p, \"wb\") as f:\n # dump items in batch\n self.serializer.dump_stream(iter(self.pdata[i].items()), f)\n self.pdata[i].clear()\n DiskBytesSpilled += os.path.getsize(p)\n\n self.spills += 1\n gc.collect() # release the memory as much as possible\n MemoryBytesSpilled += max(used_memory - get_used_memory(), 0) << 20", "response": "This function will dump already partitioned data into disks. 
It will dump the data into the disks and the memory used by the memory."} {"instruction": "Make a summary of the following Python 3 code\ndef items(self):\n if not self.pdata and not self.spills:\n return iter(self.data.items())\n return self._external_items()", "response": "Return all items as iterator"} {"instruction": "Write a Python 3 function for\nreturning all partitioned items as iterator", "response": "def _external_items(self):\n \"\"\" Return all partitioned items as iterator \"\"\"\n assert not self.data\n if any(self.pdata):\n self._spill()\n # disable partitioning and spilling when merge combiners from disk\n self.pdata = []\n\n try:\n for i in range(self.partitions):\n for v in self._merged_items(i):\n yield v\n self.data.clear()\n\n # remove the merged partition\n for j in range(self.spills):\n path = self._get_spill_dir(j)\n os.remove(os.path.join(path, str(i)))\n finally:\n self._cleanup()"} {"instruction": "Make a summary of the following Python 3 code\ndef _recursive_merged_items(self, index):\n subdirs = [os.path.join(d, \"parts\", str(index)) for d in self.localdirs]\n m = ExternalMerger(self.agg, self.memory_limit, self.serializer, subdirs,\n self.scale * self.partitions, self.partitions, self.batch)\n m.pdata = [{} for _ in range(self.partitions)]\n limit = self._next_limit()\n\n for j in range(self.spills):\n path = self._get_spill_dir(j)\n p = os.path.join(path, str(index))\n with open(p, 'rb') as f:\n m.mergeCombiners(self.serializer.load_stream(f), 0)\n\n if get_used_memory() > limit:\n m._spill()\n limit = self._next_limit()\n\n return m._external_items()", "response": "Merge the partitioned items and return the as iterator\n "} {"instruction": "Can you write a function in Python 3 where it\nchooses one directory for spill by number n", "response": "def _get_path(self, n):\n \"\"\" Choose one directory for spill by number n \"\"\"\n d = self.local_dirs[n % len(self.local_dirs)]\n if not os.path.exists(d):\n os.makedirs(d)\n return 
os.path.join(d, str(n))"} {"instruction": "Write a Python 3 script for\nsorting the elements in iterator do external sort when the memory is below the limit.", "response": "def sorted(self, iterator, key=None, reverse=False):\n \"\"\"\n Sort the elements in iterator, do external sort when the memory\n goes above the limit.\n \"\"\"\n global MemoryBytesSpilled, DiskBytesSpilled\n batch, limit = 100, self._next_limit()\n chunks, current_chunk = [], []\n iterator = iter(iterator)\n while True:\n # pick elements in batch\n chunk = list(itertools.islice(iterator, batch))\n current_chunk.extend(chunk)\n if len(chunk) < batch:\n break\n\n used_memory = get_used_memory()\n if used_memory > limit:\n # sort them inplace will save memory\n current_chunk.sort(key=key, reverse=reverse)\n path = self._get_path(len(chunks))\n with open(path, 'wb') as f:\n self.serializer.dump_stream(current_chunk, f)\n\n def load(f):\n for v in self.serializer.load_stream(f):\n yield v\n # close the file explicit once we consume all the items\n # to avoid ResourceWarning in Python3\n f.close()\n chunks.append(load(open(path, 'rb')))\n current_chunk = []\n MemoryBytesSpilled += max(used_memory - get_used_memory(), 0) << 20\n DiskBytesSpilled += os.path.getsize(path)\n os.unlink(path) # data will be deleted after close\n\n elif not chunks:\n batch = min(int(batch * 1.5), 10000)\n\n current_chunk.sort(key=key, reverse=reverse)\n if not chunks:\n return current_chunk\n\n if current_chunk:\n chunks.append(iter(current_chunk))\n\n return heapq.merge(chunks, key=key, reverse=reverse)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _spill(self):\n global MemoryBytesSpilled, DiskBytesSpilled\n if self._file is None:\n self._open_file()\n\n used_memory = get_used_memory()\n pos = self._file.tell()\n self._ser.dump_stream(self.values, self._file)\n self.values = []\n gc.collect()\n DiskBytesSpilled += self._file.tell() - pos\n MemoryBytesSpilled += max(used_memory - 
get_used_memory(), 0) << 20", "response": "dump the values into disk"} {"instruction": "How would you implement a function in Python 3 that\ndumps already partitioned data into disks.", "response": "def _spill(self):\n \"\"\"\n dump already partitioned data into disks.\n \"\"\"\n global MemoryBytesSpilled, DiskBytesSpilled\n path = self._get_spill_dir(self.spills)\n if not os.path.exists(path):\n os.makedirs(path)\n\n used_memory = get_used_memory()\n if not self.pdata:\n # The data has not been partitioned, it will iterator the\n # data once, write them into different files, has no\n # additional memory. It only called when the memory goes\n # above limit at the first time.\n\n # open all the files for writing\n streams = [open(os.path.join(path, str(i)), 'wb')\n for i in range(self.partitions)]\n\n # If the number of keys is small, then the overhead of sort is small\n # sort them before dumping into disks\n self._sorted = len(self.data) < self.SORT_KEY_LIMIT\n if self._sorted:\n self.serializer = self.flattened_serializer()\n for k in sorted(self.data.keys()):\n h = self._partition(k)\n self.serializer.dump_stream([(k, self.data[k])], streams[h])\n else:\n for k, v in self.data.items():\n h = self._partition(k)\n self.serializer.dump_stream([(k, v)], streams[h])\n\n for s in streams:\n DiskBytesSpilled += s.tell()\n s.close()\n\n self.data.clear()\n # self.pdata is cached in `mergeValues` and `mergeCombiners`\n self.pdata.extend([{} for i in range(self.partitions)])\n\n else:\n for i in range(self.partitions):\n p = os.path.join(path, str(i))\n with open(p, \"wb\") as f:\n # dump items in batch\n if self._sorted:\n # sort by key only (stable)\n sorted_items = sorted(self.pdata[i].items(), key=operator.itemgetter(0))\n self.serializer.dump_stream(sorted_items, f)\n else:\n self.serializer.dump_stream(self.pdata[i].items(), f)\n self.pdata[i].clear()\n DiskBytesSpilled += os.path.getsize(p)\n\n self.spills += 1\n gc.collect() # release the memory as much as 
possible\n MemoryBytesSpilled += max(used_memory - get_used_memory(), 0) << 20"} {"instruction": "Write a Python 3 function for\nloading a partition from disk then sort and group by key", "response": "def _merge_sorted_items(self, index):\n \"\"\" load a partition from disk, then sort and group by key \"\"\"\n def load_partition(j):\n path = self._get_spill_dir(j)\n p = os.path.join(path, str(index))\n with open(p, 'rb', 65536) as f:\n for v in self.serializer.load_stream(f):\n yield v\n\n disk_items = [load_partition(j) for j in range(self.spills)]\n\n if self._sorted:\n # all the partitions are already sorted\n sorted_items = heapq.merge(disk_items, key=operator.itemgetter(0))\n\n else:\n # Flatten the combined values, so it will not consume huge\n # memory during merging sort.\n ser = self.flattened_serializer()\n sorter = ExternalSorter(self.memory_limit, ser)\n sorted_items = sorter.sorted(itertools.chain(*disk_items),\n key=operator.itemgetter(0))\n return ((k, vs) for k, vs in GroupByKey(sorted_items))"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef worker(sock, authenticated):\n signal.signal(SIGHUP, SIG_DFL)\n signal.signal(SIGCHLD, SIG_DFL)\n signal.signal(SIGTERM, SIG_DFL)\n # restore the handler for SIGINT,\n # it's useful for debugging (show the stacktrace before exit)\n signal.signal(SIGINT, signal.default_int_handler)\n\n # Read the socket using fdopen instead of socket.makefile() because the latter\n # seems to be very slow; note that we need to dup() the file descriptor because\n # otherwise writes also cause a seek that makes us miss data on the read side.\n infile = os.fdopen(os.dup(sock.fileno()), \"rb\", 65536)\n outfile = os.fdopen(os.dup(sock.fileno()), \"wb\", 65536)\n\n if not authenticated:\n client_secret = UTF8Deserializer().loads(infile)\n if os.environ[\"PYTHON_WORKER_FACTORY_SECRET\"] == client_secret:\n write_with_length(\"ok\".encode(\"utf-8\"), outfile)\n outfile.flush()\n else:\n 
write_with_length(\"err\".encode(\"utf-8\"), outfile)\n outfile.flush()\n sock.close()\n return 1\n\n exit_code = 0\n try:\n worker_main(infile, outfile)\n except SystemExit as exc:\n exit_code = compute_real_exit_code(exc.code)\n finally:\n try:\n outfile.flush()\n except Exception:\n pass\n return exit_code", "response": "This function is called by the worker process."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef portable_hash(x):\n\n if sys.version_info >= (3, 2, 3) and 'PYTHONHASHSEED' not in os.environ:\n raise Exception(\"Randomness of hash of string should be disabled via PYTHONHASHSEED\")\n\n if x is None:\n return 0\n if isinstance(x, tuple):\n h = 0x345678\n for i in x:\n h ^= portable_hash(i)\n h *= 1000003\n h &= sys.maxsize\n h ^= len(x)\n if h == -1:\n h = -2\n return int(h)\n return hash(x)", "response": "This function returns consistent hash code for builtin types and tuple with None."} {"instruction": "Make a summary of the following Python 3 code\ndef _parse_memory(s):\n units = {'g': 1024, 'm': 1, 't': 1 << 20, 'k': 1.0 / 1024}\n if s[-1].lower() not in units:\n raise ValueError(\"invalid format: \" + s)\n return int(float(s[:-1]) * units[s[-1].lower()])", "response": "Parse a memory string in the format supported by Java and return the value in MiB."} {"instruction": "How would you implement a function in Python 3 that\nignores the u prefix of string in doc tests", "response": "def ignore_unicode_prefix(f):\n \"\"\"\n Ignore the 'u' prefix of string in doc tests, to make it works\n in both python 2 and 3\n \"\"\"\n if sys.version >= '3':\n # the representation of unicode string in Python 3 does not have prefix 'u',\n # so remove the prefix 'u' for doc tests\n literal_re = re.compile(r\"(\\W|^)[uU](['])\", re.UNICODE)\n f.__doc__ = literal_re.sub(r'\\1\\2', f.__doc__)\n return f"} {"instruction": "Given the following Python 3 function, write the documentation\ndef cache(self):\n self.is_cached = 
True\n self.persist(StorageLevel.MEMORY_ONLY)\n return self", "response": "Persist this RDD with the default storage level."} {"instruction": "Given the following Python 3 function, write the documentation\ndef persist(self, storageLevel=StorageLevel.MEMORY_ONLY):\n self.is_cached = True\n javaStorageLevel = self.ctx._getJavaStorageLevel(storageLevel)\n self._jrdd.persist(javaStorageLevel)\n return self", "response": "Set this RDD s storage level to persist its values across operations\n ."} {"instruction": "Can you implement a function in Python 3 that\nmarks the RDD as non - persistent and remove all blocks for the current entry from memory and disk.", "response": "def unpersist(self, blocking=False):\n \"\"\"\n Mark the RDD as non-persistent, and remove all blocks for it from\n memory and disk.\n\n .. versionchanged:: 3.0.0\n Added optional argument `blocking` to specify whether to block until all\n blocks are deleted.\n \"\"\"\n self.is_cached = False\n self._jrdd.unpersist(blocking)\n return self"} {"instruction": "Explain what the following Python 3 code does\ndef getCheckpointFile(self):\n checkpointFile = self._jrdd.rdd().getCheckpointFile()\n if checkpointFile.isDefined():\n return checkpointFile.get()", "response": "Gets the name of the file to which this RDD was checkpointed."} {"instruction": "Implement a function in Python 3 to\nreturn a new RDD by applying a function to each element of this RDD.", "response": "def map(self, f, preservesPartitioning=False):\n \"\"\"\n Return a new RDD by applying a function to each element of this RDD.\n\n >>> rdd = sc.parallelize([\"b\", \"a\", \"c\"])\n >>> sorted(rdd.map(lambda x: (x, 1)).collect())\n [('a', 1), ('b', 1), ('c', 1)]\n \"\"\"\n def func(_, iterator):\n return map(fail_on_stopiteration(f), iterator)\n return self.mapPartitionsWithIndex(func, preservesPartitioning)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef flatMap(self, f, 
preservesPartitioning=False):\n def func(s, iterator):\n return chain.from_iterable(map(fail_on_stopiteration(f), iterator))\n return self.mapPartitionsWithIndex(func, preservesPartitioning)", "response": "Return a new RDD by first applying a function to all elements of this RDD and then flattening the results."} {"instruction": "Can you implement a function in Python 3 that\nreturns a new RDD by applying a function to each partition of this RDD.", "response": "def mapPartitions(self, f, preservesPartitioning=False):\n \"\"\"\n Return a new RDD by applying a function to each partition of this RDD.\n\n >>> rdd = sc.parallelize([1, 2, 3, 4], 2)\n >>> def f(iterator): yield sum(iterator)\n >>> rdd.mapPartitions(f).collect()\n [3, 7]\n \"\"\"\n def func(s, iterator):\n return f(iterator)\n return self.mapPartitionsWithIndex(func, preservesPartitioning)"} {"instruction": "How would you explain what the following Python 3 function does\ndef mapPartitionsWithSplit(self, f, preservesPartitioning=False):\n warnings.warn(\"mapPartitionsWithSplit is deprecated; \"\n \"use mapPartitionsWithIndex instead\", DeprecationWarning, stacklevel=2)\n return self.mapPartitionsWithIndex(f, preservesPartitioning)", "response": "Return a new RDD by applying a function to each partition of this RDD while tracking the index of the original partition."} {"instruction": "Write a Python 3 function that can\nreturn an RDD containing the distinct elements in this RDD.", "response": "def distinct(self, numPartitions=None):\n \"\"\"\n Return a new RDD containing the distinct elements in this RDD.\n\n >>> sorted(sc.parallelize([1, 1, 2, 3]).distinct().collect())\n [1, 2, 3]\n \"\"\"\n return self.map(lambda x: (x, None)) \\\n .reduceByKey(lambda x, _: x, numPartitions) \\\n .map(lambda x: x[0])"} {"instruction": "How would you implement a function in Python 3 that\nreturns a new RDD with the specified fraction of the total number of elements in this RDD.", "response": "def sample(self, 
withReplacement, fraction, seed=None):\n \"\"\"\n Return a sampled subset of this RDD.\n\n :param withReplacement: can elements be sampled multiple times (replaced when sampled out)\n :param fraction: expected size of the sample as a fraction of this RDD's size\n without replacement: probability that each element is chosen; fraction must be [0, 1]\n with replacement: expected number of times each element is chosen; fraction must be >= 0\n :param seed: seed for the random number generator\n\n .. note:: This is not guaranteed to provide exactly the fraction specified of the total\n count of the given :class:`DataFrame`.\n\n >>> rdd = sc.parallelize(range(100), 4)\n >>> 6 <= rdd.sample(False, 0.1, 81).count() <= 14\n True\n \"\"\"\n assert fraction >= 0.0, \"Negative fraction value: %s\" % fraction\n return self.mapPartitionsWithIndex(RDDSampler(withReplacement, fraction, seed).func, True)"} {"instruction": "Make a summary of the following Python 3 code\ndef randomSplit(self, weights, seed=None):\n s = float(sum(weights))\n cweights = [0.0]\n for w in weights:\n cweights.append(cweights[-1] + w / s)\n if seed is None:\n seed = random.randint(0, 2 ** 32 - 1)\n return [self.mapPartitionsWithIndex(RDDRangeSampler(lb, ub, seed).func, True)\n for lb, ub in zip(cweights, cweights[1:])]", "response": "Randomly splits this RDD with the provided weights."} {"instruction": "Can you implement a function in Python 3 that\nreturns a fixed - size sampled subset of this RDD.", "response": "def takeSample(self, withReplacement, num, seed=None):\n \"\"\"\n Return a fixed-size sampled subset of this RDD.\n\n .. 
note:: This method should only be used if the resulting array is expected\n to be small, as all the data is loaded into the driver's memory.\n\n >>> rdd = sc.parallelize(range(0, 10))\n >>> len(rdd.takeSample(True, 20, 1))\n 20\n >>> len(rdd.takeSample(False, 5, 2))\n 5\n >>> len(rdd.takeSample(False, 15, 3))\n 10\n \"\"\"\n numStDev = 10.0\n\n if num < 0:\n raise ValueError(\"Sample size cannot be negative.\")\n elif num == 0:\n return []\n\n initialCount = self.count()\n if initialCount == 0:\n return []\n\n rand = random.Random(seed)\n\n if (not withReplacement) and num >= initialCount:\n # shuffle current RDD and return\n samples = self.collect()\n rand.shuffle(samples)\n return samples\n\n maxSampleSize = sys.maxsize - int(numStDev * sqrt(sys.maxsize))\n if num > maxSampleSize:\n raise ValueError(\n \"Sample size cannot be greater than %d.\" % maxSampleSize)\n\n fraction = RDD._computeFractionForSampleSize(\n num, initialCount, withReplacement)\n samples = self.sample(withReplacement, fraction, seed).collect()\n\n # If the first sample didn't turn out large enough, keep trying to take samples;\n # this shouldn't happen often because we use a big multiplier for their initial size.\n # See: scala/spark/RDD.scala\n while len(samples) < num:\n # TODO: add log warning for when more than one iteration was run\n seed = rand.randint(0, sys.maxsize)\n samples = self.sample(withReplacement, fraction, seed).collect()\n\n rand.shuffle(samples)\n\n return samples[0:num]"} {"instruction": "How would you explain what the following Python 3 function does\ndef _computeFractionForSampleSize(sampleSizeLowerBound, total, withReplacement):\n fraction = float(sampleSizeLowerBound) / total\n if withReplacement:\n numStDev = 5\n if (sampleSizeLowerBound < 12):\n numStDev = 9\n return fraction + numStDev * sqrt(fraction / total)\n else:\n delta = 0.00005\n gamma = - log(delta) / total\n return min(1, fraction + gamma + sqrt(gamma * gamma + 2 * gamma * fraction))", "response": "Compute 
the sampling rate for a specific sample size."} {"instruction": "How would you explain what the following Python 3 function does\ndef union(self, other):\n if self._jrdd_deserializer == other._jrdd_deserializer:\n rdd = RDD(self._jrdd.union(other._jrdd), self.ctx,\n self._jrdd_deserializer)\n else:\n # These RDDs contain data in different serialized formats, so we\n # must normalize them to the default serializer.\n self_copy = self._reserialize()\n other_copy = other._reserialize()\n rdd = RDD(self_copy._jrdd.union(other_copy._jrdd), self.ctx,\n self.ctx.serializer)\n if (self.partitioner == other.partitioner and\n self.getNumPartitions() == rdd.getNumPartitions()):\n rdd.partitioner = self.partitioner\n return rdd", "response": "Return the union of this RDD and another RDD."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef intersection(self, other):\n return self.map(lambda v: (v, None)) \\\n .cogroup(other.map(lambda v: (v, None))) \\\n .filter(lambda k_vs: all(k_vs[1])) \\\n .keys()", "response": "Return the intersection of this RDD and another RDD."} {"instruction": "Implement a function in Python 3 to\nsort this RDD by key.", "response": "def sortByKey(self, ascending=True, numPartitions=None, keyfunc=lambda x: x):\n \"\"\"\n Sorts this RDD, which is assumed to consist of (key, value) pairs.\n\n >>> tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]\n >>> sc.parallelize(tmp).sortByKey().first()\n ('1', 3)\n >>> sc.parallelize(tmp).sortByKey(True, 1).collect()\n [('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]\n >>> sc.parallelize(tmp).sortByKey(True, 2).collect()\n [('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]\n >>> tmp2 = [('Mary', 1), ('had', 2), ('a', 3), ('little', 4), ('lamb', 5)]\n >>> tmp2.extend([('whose', 6), ('fleece', 7), ('was', 8), ('white', 9)])\n >>> sc.parallelize(tmp2).sortByKey(True, 3, keyfunc=lambda k: k.lower()).collect()\n [('a', 3), ('fleece', 7), ('had', 2), ('lamb', 
5),...('white', 9), ('whose', 6)]\n \"\"\"\n if numPartitions is None:\n numPartitions = self._defaultReducePartitions()\n\n memory = self._memory_limit()\n serializer = self._jrdd_deserializer\n\n def sortPartition(iterator):\n sort = ExternalSorter(memory * 0.9, serializer).sorted\n return iter(sort(iterator, key=lambda kv: keyfunc(kv[0]), reverse=(not ascending)))\n\n if numPartitions == 1:\n if self.getNumPartitions() > 1:\n self = self.coalesce(1)\n return self.mapPartitions(sortPartition, True)\n\n # first compute the boundary of each part via sampling: we want to partition\n # the key-space into bins such that the bins have roughly the same\n # number of (key, value) pairs falling into them\n rddSize = self.count()\n if not rddSize:\n return self # empty RDD\n maxSampleSize = numPartitions * 20.0 # constant from Spark's RangePartitioner\n fraction = min(maxSampleSize / max(rddSize, 1), 1.0)\n samples = self.sample(False, fraction, 1).map(lambda kv: kv[0]).collect()\n samples = sorted(samples, key=keyfunc)\n\n # we have numPartitions many parts but one of the them has\n # an implicit boundary\n bounds = [samples[int(len(samples) * (i + 1) / numPartitions)]\n for i in range(0, numPartitions - 1)]\n\n def rangePartitioner(k):\n p = bisect.bisect_left(bounds, keyfunc(k))\n if ascending:\n return p\n else:\n return numPartitions - 1 - p\n\n return self.partitionBy(numPartitions, rangePartitioner).mapPartitions(sortPartition, True)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef sortBy(self, keyfunc, ascending=True, numPartitions=None):\n return self.keyBy(keyfunc).sortByKey(ascending, numPartitions).values()", "response": "Sorts this RDD by the given keyfunc."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef cartesian(self, other):\n # Due to batching, we can't use the Java cartesian method.\n deserializer = CartesianDeserializer(self._jrdd_deserializer,\n other._jrdd_deserializer)\n 
return RDD(self._jrdd.cartesian(other._jrdd), self.ctx, deserializer)", "response": "Return the Cartesian product of this RDD and another RDD."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef groupBy(self, f, numPartitions=None, partitionFunc=portable_hash):\n return self.map(lambda x: (f(x), x)).groupByKey(numPartitions, partitionFunc)", "response": "Return an RDD of grouped items by a function."} {"instruction": "Can you write a function in Python 3 where it\nreturns an RDD of strings from a shell command.", "response": "def pipe(self, command, env=None, checkCode=False):\n \"\"\"\n Return an RDD created by piping elements to a forked external process.\n\n >>> sc.parallelize(['1', '2', '', '3']).pipe('cat').collect()\n [u'1', u'2', u'', u'3']\n\n :param checkCode: whether or not to check the return value of the shell command.\n \"\"\"\n if env is None:\n env = dict()\n\n def func(iterator):\n pipe = Popen(\n shlex.split(command), env=env, stdin=PIPE, stdout=PIPE)\n\n def pipe_objs(out):\n for obj in iterator:\n s = unicode(obj).rstrip('\\n') + '\\n'\n out.write(s.encode('utf-8'))\n out.close()\n Thread(target=pipe_objs, args=[pipe.stdin]).start()\n\n def check_return_code():\n pipe.wait()\n if checkCode and pipe.returncode:\n raise Exception(\"Pipe function `%s' exited \"\n \"with error code %d\" % (command, pipe.returncode))\n else:\n for i in range(0):\n yield i\n return (x.rstrip(b'\\n').decode('utf-8') for x in\n chain(iter(pipe.stdout.readline, b''), check_return_code()))\n return self.mapPartitions(func)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef foreach(self, f):\n f = fail_on_stopiteration(f)\n\n def processPartition(iterator):\n for x in iterator:\n f(x)\n return iter([])\n self.mapPartitions(processPartition).count()", "response": "Applies a function to all elements of this RDD."} {"instruction": "Can you generate the documentation for the following Python 3 
function\ndef foreachPartition(self, f):\n def func(it):\n r = f(it)\n try:\n return iter(r)\n except TypeError:\n return iter([])\n self.mapPartitions(func).count()", "response": "Applies a function to each partition of this RDD."} {"instruction": "How would you code a function in Python 3 to\nreturn a list containing all of the elements in this RDD.", "response": "def collect(self):\n \"\"\"\n Return a list that contains all of the elements in this RDD.\n\n .. note:: This method should only be used if the resulting array is expected\n to be small, as all the data is loaded into the driver's memory.\n \"\"\"\n with SCCallSiteSync(self.context) as css:\n sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())\n return list(_load_from_socket(sock_info, self._jrdd_deserializer))"} {"instruction": "Write a Python 3 script for\nreducing the elements of this RDD using the specified commutative and an associative binary operator. Currently reduces partitions locally.", "response": "def reduce(self, f):\n \"\"\"\n Reduces the elements of this RDD using the specified commutative and\n associative binary operator. 
Currently reduces partitions locally.\n\n >>> from operator import add\n >>> sc.parallelize([1, 2, 3, 4, 5]).reduce(add)\n 15\n >>> sc.parallelize((2 for _ in range(10))).map(lambda x: 1).cache().reduce(add)\n 10\n >>> sc.parallelize([]).reduce(add)\n Traceback (most recent call last):\n ...\n ValueError: Can not reduce() empty RDD\n \"\"\"\n f = fail_on_stopiteration(f)\n\n def func(iterator):\n iterator = iter(iterator)\n try:\n initial = next(iterator)\n except StopIteration:\n return\n yield reduce(f, iterator, initial)\n\n vals = self.mapPartitions(func).collect()\n if vals:\n return reduce(f, vals)\n raise ValueError(\"Can not reduce() empty RDD\")"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef treeReduce(self, f, depth=2):\n if depth < 1:\n raise ValueError(\"Depth cannot be smaller than 1 but got %d.\" % depth)\n\n zeroValue = None, True # Use the second entry to indicate whether this is a dummy value.\n\n def op(x, y):\n if x[1]:\n return y\n elif y[1]:\n return x\n else:\n return f(x[0], y[0]), False\n\n reduced = self.map(lambda x: (x, False)).treeAggregate(zeroValue, op, op, depth)\n if reduced[1]:\n raise ValueError(\"Cannot reduce empty RDD.\")\n return reduced[0]", "response": "Reduces the elements of this RDD in a multi - level tree pattern."} {"instruction": "How would you implement a function in Python 3 that\nfolds the elements of each partition into a single value.", "response": "def fold(self, zeroValue, op):\n \"\"\"\n Aggregate the elements of each partition, and then the results for all\n the partitions, using a given associative function and a neutral \"zero value.\"\n\n The function C{op(t1, t2)} is allowed to modify C{t1} and return it\n as its result value to avoid object allocation; however, it should not\n modify C{t2}.\n\n This behaves somewhat differently from fold operations implemented\n for non-distributed collections in functional languages like Scala.\n This fold operation may be applied to 
partitions individually, and then\n fold those results into the final result, rather than apply the fold\n to each element sequentially in some defined ordering. For functions\n that are not commutative, the result may differ from that of a fold\n applied to a non-distributed collection.\n\n >>> from operator import add\n >>> sc.parallelize([1, 2, 3, 4, 5]).fold(0, add)\n 15\n \"\"\"\n op = fail_on_stopiteration(op)\n\n def func(iterator):\n acc = zeroValue\n for obj in iterator:\n acc = op(acc, obj)\n yield acc\n # collecting result of mapPartitions here ensures that the copy of\n # zeroValue provided to each partition is unique from the one provided\n # to the final reduce call\n vals = self.mapPartitions(func).collect()\n return reduce(op, vals, zeroValue)"} {"instruction": "How would you code a function in Python 3 to\naggregate the elements of each partition and then the results for all the partitions using a given combine functions and a neutral zeroValue value.", "response": "def aggregate(self, zeroValue, seqOp, combOp):\n \"\"\"\n Aggregate the elements of each partition, and then the results for all\n the partitions, using a given combine functions and a neutral \"zero\n value.\"\n\n The functions C{op(t1, t2)} is allowed to modify C{t1} and return it\n as its result value to avoid object allocation; however, it should not\n modify C{t2}.\n\n The first function (seqOp) can return a different result type, U, than\n the type of this RDD. 
Thus, we need one operation for merging a T into\n an U and one operation for merging two U\n\n >>> seqOp = (lambda x, y: (x[0] + y, x[1] + 1))\n >>> combOp = (lambda x, y: (x[0] + y[0], x[1] + y[1]))\n >>> sc.parallelize([1, 2, 3, 4]).aggregate((0, 0), seqOp, combOp)\n (10, 4)\n >>> sc.parallelize([]).aggregate((0, 0), seqOp, combOp)\n (0, 0)\n \"\"\"\n seqOp = fail_on_stopiteration(seqOp)\n combOp = fail_on_stopiteration(combOp)\n\n def func(iterator):\n acc = zeroValue\n for obj in iterator:\n acc = seqOp(acc, obj)\n yield acc\n # collecting result of mapPartitions here ensures that the copy of\n # zeroValue provided to each partition is unique from the one provided\n # to the final reduce call\n vals = self.mapPartitions(func).collect()\n return reduce(combOp, vals, zeroValue)"} {"instruction": "How would you explain what the following Python 3 function does\ndef treeAggregate(self, zeroValue, seqOp, combOp, depth=2):\n if depth < 1:\n raise ValueError(\"Depth cannot be smaller than 1 but got %d.\" % depth)\n\n if self.getNumPartitions() == 0:\n return zeroValue\n\n def aggregatePartition(iterator):\n acc = zeroValue\n for obj in iterator:\n acc = seqOp(acc, obj)\n yield acc\n\n partiallyAggregated = self.mapPartitions(aggregatePartition)\n numPartitions = partiallyAggregated.getNumPartitions()\n scale = max(int(ceil(pow(numPartitions, 1.0 / depth))), 2)\n # If creating an extra level doesn't help reduce the wall-clock time, we stop the tree\n # aggregation.\n while numPartitions > scale + numPartitions / scale:\n numPartitions /= scale\n curNumPartitions = int(numPartitions)\n\n def mapPartition(i, iterator):\n for obj in iterator:\n yield (i % curNumPartitions, obj)\n\n partiallyAggregated = partiallyAggregated \\\n .mapPartitionsWithIndex(mapPartition) \\\n .reduceByKey(combOp, curNumPartitions) \\\n .values()\n\n return partiallyAggregated.reduce(combOp)", "response": "This function aggregates the elements of this RDD in a multi - level tree."} 
{"instruction": "Make a summary of the following Python 3 code\ndef max(self, key=None):\n if key is None:\n return self.reduce(max)\n return self.reduce(lambda a, b: max(a, b, key=key))", "response": "Find the maximum item in this RDD."} {"instruction": "Write a Python 3 script for\nfinding the minimum item in this RDD.", "response": "def min(self, key=None):\n \"\"\"\n Find the minimum item in this RDD.\n\n :param key: A function used to generate key for comparing\n\n >>> rdd = sc.parallelize([2.0, 5.0, 43.0, 10.0])\n >>> rdd.min()\n 2.0\n >>> rdd.min(key=str)\n 10.0\n \"\"\"\n if key is None:\n return self.reduce(min)\n return self.reduce(lambda a, b: min(a, b, key=key))"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef sum(self):\n return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)", "response": "Return the sum of the elements in this RDD."} {"instruction": "Given the following Python 3 function, write the documentation\ndef stats(self):\n def redFunc(left_counter, right_counter):\n return left_counter.mergeStats(right_counter)\n\n return self.mapPartitions(lambda i: [StatCounter(i)]).reduce(redFunc)", "response": "Return a new RDD with the mean variance\n and count of the elements in one operation."} {"instruction": "Make a summary of the following Python 3 code\ndef histogram(self, buckets):\n\n if isinstance(buckets, int):\n if buckets < 1:\n raise ValueError(\"number of buckets must be >= 1\")\n\n # filter out non-comparable elements\n def comparable(x):\n if x is None:\n return False\n if type(x) is float and isnan(x):\n return False\n return True\n\n filtered = self.filter(comparable)\n\n # faster than stats()\n def minmax(a, b):\n return min(a[0], b[0]), max(a[1], b[1])\n try:\n minv, maxv = filtered.map(lambda x: (x, x)).reduce(minmax)\n except TypeError as e:\n if \" empty \" in str(e):\n raise ValueError(\"can not generate buckets from empty RDD\")\n raise\n\n if minv == maxv or buckets == 1:\n return 
[minv, maxv], [filtered.count()]\n\n try:\n inc = (maxv - minv) / buckets\n except TypeError:\n raise TypeError(\"Can not generate buckets with non-number in RDD\")\n\n if isinf(inc):\n raise ValueError(\"Can not generate buckets with infinite value\")\n\n # keep them as integer if possible\n inc = int(inc)\n if inc * buckets != maxv - minv:\n inc = (maxv - minv) * 1.0 / buckets\n\n buckets = [i * inc + minv for i in range(buckets)]\n buckets.append(maxv) # fix accumulated error\n even = True\n\n elif isinstance(buckets, (list, tuple)):\n if len(buckets) < 2:\n raise ValueError(\"buckets should have more than one value\")\n\n if any(i is None or isinstance(i, float) and isnan(i) for i in buckets):\n raise ValueError(\"can not have None or NaN in buckets\")\n\n if sorted(buckets) != list(buckets):\n raise ValueError(\"buckets should be sorted\")\n\n if len(set(buckets)) != len(buckets):\n raise ValueError(\"buckets should not contain duplicated values\")\n\n minv = buckets[0]\n maxv = buckets[-1]\n even = False\n inc = None\n try:\n steps = [buckets[i + 1] - buckets[i] for i in range(len(buckets) - 1)]\n except TypeError:\n pass # objects in buckets do not support '-'\n else:\n if max(steps) - min(steps) < 1e-10: # handle precision errors\n even = True\n inc = (maxv - minv) / (len(buckets) - 1)\n\n else:\n raise TypeError(\"buckets should be a list or tuple or number(int or long)\")\n\n def histogram(iterator):\n counters = [0] * len(buckets)\n for i in iterator:\n if i is None or (type(i) is float and isnan(i)) or i > maxv or i < minv:\n continue\n t = (int((i - minv) / inc) if even\n else bisect.bisect_right(buckets, i) - 1)\n counters[t] += 1\n # add last two together\n last = counters.pop()\n counters[-1] += last\n return [counters]\n\n def mergeCounters(a, b):\n return [i + j for i, j in zip(a, b)]\n\n return buckets, self.mapPartitions(histogram).reduce(mergeCounters)", "response": "Compute a histogram of the given buckets."} {"instruction": "Can you generate 
a brief explanation for the following Python 3 code\ndef countByValue(self):\n def countPartition(iterator):\n counts = defaultdict(int)\n for obj in iterator:\n counts[obj] += 1\n yield counts\n\n def mergeMaps(m1, m2):\n for k, v in m2.items():\n m1[k] += v\n return m1\n return self.mapPartitions(countPartition).reduce(mergeMaps)", "response": "Return the count of each unique value in this RDD as a dictionary of\n = > count"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef top(self, num, key=None):\n def topIterator(iterator):\n yield heapq.nlargest(num, iterator, key=key)\n\n def merge(a, b):\n return heapq.nlargest(num, a + b, key=key)\n\n return self.mapPartitions(topIterator).reduce(merge)", "response": "Return the top N elements from an RDD."} {"instruction": "Write a Python 3 script to\ntake the N elements from an RDD ordered in ascending order or as is specified by the optional key function.", "response": "def takeOrdered(self, num, key=None):\n \"\"\"\n Get the N elements from an RDD ordered in ascending order or as\n specified by the optional key function.\n\n .. 
note:: this method should only be used if the resulting array is expected\n to be small, as all the data is loaded into the driver's memory.\n\n >>> sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7]).takeOrdered(6)\n [1, 2, 3, 4, 5, 6]\n >>> sc.parallelize([10, 1, 2, 9, 3, 4, 5, 6, 7], 2).takeOrdered(6, key=lambda x: -x)\n [10, 9, 7, 6, 5, 4]\n \"\"\"\n\n def merge(a, b):\n return heapq.nsmallest(num, a + b, key)\n\n return self.mapPartitions(lambda it: [heapq.nsmallest(num, it, key)]).reduce(merge)"} {"instruction": "Write a Python 3 function that can\ntake the first num elements of the RDD.", "response": "def take(self, num):\n \"\"\"\n Take the first num elements of the RDD.\n\n It works by first scanning one partition, and use the results from\n that partition to estimate the number of additional partitions needed\n to satisfy the limit.\n\n Translated from the Scala implementation in RDD#take().\n\n .. note:: this method should only be used if the resulting array is expected\n to be small, as all the data is loaded into the driver's memory.\n\n >>> sc.parallelize([2, 3, 4, 5, 6]).cache().take(2)\n [2, 3]\n >>> sc.parallelize([2, 3, 4, 5, 6]).take(10)\n [2, 3, 4, 5, 6]\n >>> sc.parallelize(range(100), 100).filter(lambda x: x > 90).take(3)\n [91, 92, 93]\n \"\"\"\n items = []\n totalParts = self.getNumPartitions()\n partsScanned = 0\n\n while len(items) < num and partsScanned < totalParts:\n # The number of partitions to try in this iteration.\n # It is ok for this number to be greater than totalParts because\n # we actually cap it at totalParts in runJob.\n numPartsToTry = 1\n if partsScanned > 0:\n # If we didn't find any rows after the previous iteration,\n # quadruple and retry. 
Otherwise, interpolate the number of\n # partitions we need to try, but overestimate it by 50%.\n # We also cap the estimation in the end.\n if len(items) == 0:\n numPartsToTry = partsScanned * 4\n else:\n # the first parameter of max is >=1 whenever partsScanned >= 2\n numPartsToTry = int(1.5 * num * partsScanned / len(items)) - partsScanned\n numPartsToTry = min(max(numPartsToTry, 1), partsScanned * 4)\n\n left = num - len(items)\n\n def takeUpToNumLeft(iterator):\n iterator = iter(iterator)\n taken = 0\n while taken < left:\n try:\n yield next(iterator)\n except StopIteration:\n return\n taken += 1\n\n p = range(partsScanned, min(partsScanned + numPartsToTry, totalParts))\n res = self.context.runJob(self, takeUpToNumLeft, p)\n\n items += res\n partsScanned += numPartsToTry\n\n return items[:num]"} {"instruction": "How would you implement a function in Python 3 that\nsaves a Python RDD of key - value pairs to any Hadoop file system using the new Hadoop OutputFormat API.", "response": "def saveAsNewAPIHadoopDataset(self, conf, keyConverter=None, valueConverter=None):\n \"\"\"\n Output a Python RDD of key-value pairs (of form C{RDD[(K, V)]}) to any Hadoop file\n system, using the new Hadoop OutputFormat API (mapreduce package). 
Keys/values are\n converted for output using either user specified converters or, by default,\n L{org.apache.spark.api.python.JavaToWritableConverter}.\n\n :param conf: Hadoop job configuration, passed in as a dict\n :param keyConverter: (None by default)\n :param valueConverter: (None by default)\n \"\"\"\n jconf = self.ctx._dictToJavaMap(conf)\n pickledRDD = self._pickled()\n self.ctx._jvm.PythonRDD.saveAsHadoopDataset(pickledRDD._jrdd, True, jconf,\n keyConverter, valueConverter, True)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef saveAsNewAPIHadoopFile(self, path, outputFormatClass, keyClass=None, valueClass=None,\n keyConverter=None, valueConverter=None, conf=None):\n \"\"\"\n Output a Python RDD of key-value pairs (of form C{RDD[(K, V)]}) to any Hadoop file\n system, using the new Hadoop OutputFormat API (mapreduce package). Key and value types\n will be inferred if not specified. Keys and values are converted for output using either\n user specified converters or L{org.apache.spark.api.python.JavaToWritableConverter}. The\n C{conf} is applied on top of the base Hadoop conf associated with the SparkContext\n of this RDD to create a merged Hadoop MapReduce job configuration for saving the data.\n\n :param path: path to Hadoop file\n :param outputFormatClass: fully qualified classname of Hadoop OutputFormat\n (e.g. \"org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat\")\n :param keyClass: fully qualified classname of key Writable class\n (e.g. \"org.apache.hadoop.io.IntWritable\", None by default)\n :param valueClass: fully qualified classname of value Writable class\n (e.g. 
\"org.apache.hadoop.io.Text\", None by default)\n :param keyConverter: (None by default)\n :param valueConverter: (None by default)\n :param conf: Hadoop job configuration, passed in as a dict (None by default)\n \"\"\"\n jconf = self.ctx._dictToJavaMap(conf)\n pickledRDD = self._pickled()\n self.ctx._jvm.PythonRDD.saveAsNewAPIHadoopFile(pickledRDD._jrdd, True, path,\n outputFormatClass,\n keyClass, valueClass,\n keyConverter, valueConverter, jconf)", "response": "Save the current RDD to a new Hadoop file using the new API."} {"instruction": "Can you write a function in Python 3 where it\nsaves the current RDD to a sequence file.", "response": "def saveAsSequenceFile(self, path, compressionCodecClass=None):\n \"\"\"\n Output a Python RDD of key-value pairs (of form C{RDD[(K, V)]}) to any Hadoop file\n system, using the L{org.apache.hadoop.io.Writable} types that we convert from the\n RDD's key and value types. The mechanism is as follows:\n\n 1. Pyrolite is used to convert pickled Python RDD into RDD of Java objects.\n 2. Keys and values of this Java RDD are converted to Writables and written out.\n\n :param path: path to sequence file\n :param compressionCodecClass: (None by default)\n \"\"\"\n pickledRDD = self._pickled()\n self.ctx._jvm.PythonRDD.saveAsSequenceFile(pickledRDD._jrdd, True,\n path, compressionCodecClass)"} {"instruction": "Can you create a Python 3 function that\nsaves this RDD as a PickleFile.", "response": "def saveAsPickleFile(self, path, batchSize=10):\n \"\"\"\n Save this RDD as a SequenceFile of serialized objects. 
The serializer\n used is L{pyspark.serializers.PickleSerializer}, default batch size\n is 10.\n\n >>> tmpFile = NamedTemporaryFile(delete=True)\n >>> tmpFile.close()\n >>> sc.parallelize([1, 2, 'spark', 'rdd']).saveAsPickleFile(tmpFile.name, 3)\n >>> sorted(sc.pickleFile(tmpFile.name, 5).map(str).collect())\n ['1', '2', 'rdd', 'spark']\n \"\"\"\n if batchSize == 0:\n ser = AutoBatchedSerializer(PickleSerializer())\n else:\n ser = BatchedSerializer(PickleSerializer(), batchSize)\n self._reserialize(ser)._jrdd.saveAsObjectFile(path)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef saveAsTextFile(self, path, compressionCodecClass=None):\n def func(split, iterator):\n for x in iterator:\n if not isinstance(x, (unicode, bytes)):\n x = unicode(x)\n if isinstance(x, unicode):\n x = x.encode(\"utf-8\")\n yield x\n keyed = self.mapPartitionsWithIndex(func)\n keyed._bypass_serializer = True\n if compressionCodecClass:\n compressionCodec = self.ctx._jvm.java.lang.Class.forName(compressionCodecClass)\n keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path, compressionCodec)\n else:\n keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path)", "response": "Save this RDD as a text file using string representations of elements."} {"instruction": "How would you explain what the following Python 3 function does\ndef reduceByKey(self, func, numPartitions=None, partitionFunc=portable_hash):\n return self.combineByKey(lambda x: x, func, func, numPartitions, partitionFunc)", "response": "Return a new RDD with the values for each key using an associative and commutative reduce function."} {"instruction": "Can you implement a function in Python 3 that\nreturns a new DStream with the values for each key using an associative and commutative reduce function.", "response": "def reduceByKeyLocally(self, func):\n \"\"\"\n Merge the values for each key using an associative and commutative reduce function, but\n return the results 
immediately to the master as a dictionary.\n\n This will also perform the merging locally on each mapper before\n sending results to a reducer, similarly to a \"combiner\" in MapReduce.\n\n >>> from operator import add\n >>> rdd = sc.parallelize([(\"a\", 1), (\"b\", 1), (\"a\", 1)])\n >>> sorted(rdd.reduceByKeyLocally(add).items())\n [('a', 2), ('b', 1)]\n \"\"\"\n func = fail_on_stopiteration(func)\n\n def reducePartition(iterator):\n m = {}\n for k, v in iterator:\n m[k] = func(m[k], v) if k in m else v\n yield m\n\n def mergeMaps(m1, m2):\n for k, v in m2.items():\n m1[k] = func(m1[k], v) if k in m1 else v\n return m1\n return self.mapPartitions(reducePartition).reduce(mergeMaps)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef partitionBy(self, numPartitions, partitionFunc=portable_hash):\n if numPartitions is None:\n numPartitions = self._defaultReducePartitions()\n partitioner = Partitioner(numPartitions, partitionFunc)\n if self.partitioner == partitioner:\n return self\n\n # Transferring O(n) objects to Java is too expensive.\n # Instead, we'll form the hash buckets in Python,\n # transferring O(numPartitions) objects to Java.\n # Each object is a (splitNumber, [objects]) pair.\n # In order to avoid too huge objects, the objects are\n # grouped into chunks.\n outputSerializer = self.ctx._unbatched_serializer\n\n limit = (_parse_memory(self.ctx._conf.get(\n \"spark.python.worker.memory\", \"512m\")) / 2)\n\n def add_shuffle_key(split, iterator):\n\n buckets = defaultdict(list)\n c, batch = 0, min(10 * numPartitions, 1000)\n\n for k, v in iterator:\n buckets[partitionFunc(k) % numPartitions].append((k, v))\n c += 1\n\n # check used memory and avg size of chunk of objects\n if (c % 1000 == 0 and get_used_memory() > limit\n or c > batch):\n n, size = len(buckets), 0\n for split in list(buckets.keys()):\n yield pack_long(split)\n d = outputSerializer.dumps(buckets[split])\n del buckets[split]\n yield d\n size += 
len(d)\n\n avg = int(size / n) >> 20\n # let 1M < avg < 10M\n if avg < 1:\n batch *= 1.5\n elif avg > 10:\n batch = max(int(batch / 1.5), 1)\n c = 0\n\n for split, items in buckets.items():\n yield pack_long(split)\n yield outputSerializer.dumps(items)\n\n keyed = self.mapPartitionsWithIndex(add_shuffle_key, preservesPartitioning=True)\n keyed._bypass_serializer = True\n with SCCallSiteSync(self.context) as css:\n pairRDD = self.ctx._jvm.PairwiseRDD(\n keyed._jrdd.rdd()).asJavaPairRDD()\n jpartitioner = self.ctx._jvm.PythonPartitioner(numPartitions,\n id(partitionFunc))\n jrdd = self.ctx._jvm.PythonRDD.valueOfPair(pairRDD.partitionBy(jpartitioner))\n rdd = RDD(jrdd, self.ctx, BatchedSerializer(outputSerializer))\n rdd.partitioner = partitioner\n return rdd", "response": "Return a copy of the RDD partitioned by the specified partitioner."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef combineByKey(self, createCombiner, mergeValue, mergeCombiners,\n numPartitions=None, partitionFunc=portable_hash):\n \"\"\"\n Generic function to combine the elements for each key using a custom\n set of aggregation functions.\n\n Turns an RDD[(K, V)] into a result of type RDD[(K, C)], for a \"combined\n type\" C.\n\n Users provide three functions:\n\n - C{createCombiner}, which turns a V into a C (e.g., creates\n a one-element list)\n - C{mergeValue}, to merge a V into a C (e.g., adds it to the end of\n a list)\n - C{mergeCombiners}, to combine two C's into a single one (e.g., merges\n the lists)\n\n To avoid memory allocation, both mergeValue and mergeCombiners are allowed to\n modify and return their first argument instead of creating a new C.\n\n In addition, users can control the partitioning of the output RDD.\n\n .. note:: V and C can be different -- for example, one might group an RDD of type\n (Int, Int) into an RDD of type (Int, List[Int]).\n\n >>> x = sc.parallelize([(\"a\", 1), (\"b\", 1), (\"a\", 2)])\n >>> def to_list(a):\n ... 
return [a]\n ...\n >>> def append(a, b):\n ... a.append(b)\n ... return a\n ...\n >>> def extend(a, b):\n ... a.extend(b)\n ... return a\n ...\n >>> sorted(x.combineByKey(to_list, append, extend).collect())\n [('a', [1, 2]), ('b', [1])]\n \"\"\"\n if numPartitions is None:\n numPartitions = self._defaultReducePartitions()\n\n serializer = self.ctx.serializer\n memory = self._memory_limit()\n agg = Aggregator(createCombiner, mergeValue, mergeCombiners)\n\n def combineLocally(iterator):\n merger = ExternalMerger(agg, memory * 0.9, serializer)\n merger.mergeValues(iterator)\n return merger.items()\n\n locally_combined = self.mapPartitions(combineLocally, preservesPartitioning=True)\n shuffled = locally_combined.partitionBy(numPartitions, partitionFunc)\n\n def _mergeCombiners(iterator):\n merger = ExternalMerger(agg, memory, serializer)\n merger.mergeCombiners(iterator)\n return merger.items()\n\n return shuffled.mapPartitions(_mergeCombiners, preservesPartitioning=True)", "response": "This function returns an RDD of elements from the first entry in the RDD that are combined with the second entry in the RDD."} {"instruction": "Implement a function in Python 3 to\naggregate the values of each key using given combine functions and a neutral zero value.", "response": "def aggregateByKey(self, zeroValue, seqFunc, combFunc, numPartitions=None,\n partitionFunc=portable_hash):\n \"\"\"\n Aggregate the values of each key, using given combine functions and a neutral\n \"zero value\". This function can return a different result type, U, than the type\n of the values in this RDD, V. Thus, we need one operation for merging a V into\n a U and one operation for merging two U's, The former operation is used for merging\n values within a partition, and the latter is used for merging values between\n partitions. 
To avoid memory allocation, both of these functions are\n allowed to modify and return their first argument instead of creating a new U.\n \"\"\"\n def createZero():\n return copy.deepcopy(zeroValue)\n\n return self.combineByKey(\n lambda v: seqFunc(createZero(), v), seqFunc, combFunc, numPartitions, partitionFunc)"} {"instruction": "Write a Python 3 script for\nreturning a new table with the values for each key in the table grouped by func.", "response": "def foldByKey(self, zeroValue, func, numPartitions=None, partitionFunc=portable_hash):\n \"\"\"\n Merge the values for each key using an associative function \"func\"\n and a neutral \"zeroValue\" which may be added to the result an\n arbitrary number of times, and must not change the result\n (e.g., 0 for addition, or 1 for multiplication.).\n\n >>> rdd = sc.parallelize([(\"a\", 1), (\"b\", 1), (\"a\", 1)])\n >>> from operator import add\n >>> sorted(rdd.foldByKey(0, add).collect())\n [('a', 2), ('b', 1)]\n \"\"\"\n def createZero():\n return copy.deepcopy(zeroValue)\n\n return self.combineByKey(lambda v: func(createZero(), v), func, func, numPartitions,\n partitionFunc)"} {"instruction": "How would you explain what the following Python 3 function does\ndef groupByKey(self, numPartitions=None, partitionFunc=portable_hash):\n def createCombiner(x):\n return [x]\n\n def mergeValue(xs, x):\n xs.append(x)\n return xs\n\n def mergeCombiners(a, b):\n a.extend(b)\n return a\n\n memory = self._memory_limit()\n serializer = self._jrdd_deserializer\n agg = Aggregator(createCombiner, mergeValue, mergeCombiners)\n\n def combine(iterator):\n merger = ExternalMerger(agg, memory * 0.9, serializer)\n merger.mergeValues(iterator)\n return merger.items()\n\n locally_combined = self.mapPartitions(combine, preservesPartitioning=True)\n shuffled = locally_combined.partitionBy(numPartitions, partitionFunc)\n\n def groupByKey(it):\n merger = ExternalGroupBy(agg, memory, serializer)\n merger.mergeCombiners(it)\n return 
merger.items()\n\n return shuffled.mapPartitions(groupByKey, True).mapValues(ResultIterable)", "response": "Return an RDD of all the values for each key in the RDD."} {"instruction": "Explain what the following Python 3 code does\ndef flatMapValues(self, f):\n flat_map_fn = lambda kv: ((kv[0], x) for x in f(kv[1]))\n return self.flatMap(flat_map_fn, preservesPartitioning=True)", "response": "Return a new RDD with the elements of each key - value pair obtained by applying a function to each value in the key - value pair."} {"instruction": "Given the following Python 3 function, write the documentation\ndef mapValues(self, f):\n map_values_fn = lambda kv: (kv[0], f(kv[1]))\n return self.map(map_values_fn, preservesPartitioning=True)", "response": "Return an RDD with the values of each key - value pair mapped through a map function."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef sampleByKey(self, withReplacement, fractions, seed=None):\n for fraction in fractions.values():\n assert fraction >= 0.0, \"Negative fraction value: %s\" % fraction\n return self.mapPartitionsWithIndex(\n RDDStratifiedSampler(withReplacement, fractions, seed).func, True)", "response": "Return a subset of this RDD with the values of the keys specified by fractions."} {"instruction": "Can you implement a function in Python 3 that\nreturns each key value pair in self that has no pair with matching key in other.", "response": "def subtractByKey(self, other, numPartitions=None):\n \"\"\"\n Return each (key, value) pair in C{self} that has no pair with matching\n key in C{other}.\n\n >>> x = sc.parallelize([(\"a\", 1), (\"b\", 4), (\"b\", 5), (\"a\", 2)])\n >>> y = sc.parallelize([(\"a\", 3), (\"c\", None)])\n >>> sorted(x.subtractByKey(y).collect())\n [('b', 4), ('b', 5)]\n \"\"\"\n def filter_func(pair):\n key, (val1, val2) = pair\n return val1 and not val2\n return self.cogroup(other, numPartitions).filter(filter_func).flatMapValues(lambda x: x[0])"} 
{"instruction": "Given the following Python 3 function, write the documentation\ndef subtract(self, other, numPartitions=None):\n # note: here 'True' is just a placeholder\n rdd = other.map(lambda x: (x, True))\n return self.map(lambda x: (x, True)).subtractByKey(rdd, numPartitions).keys()", "response": "Return each value in self that is not contained in other."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef coalesce(self, numPartitions, shuffle=False):\n if shuffle:\n # Decrease the batch size in order to distribute evenly the elements across output\n # partitions. Otherwise, repartition will possibly produce highly skewed partitions.\n batchSize = min(10, self.ctx._batchSize or 1024)\n ser = BatchedSerializer(PickleSerializer(), batchSize)\n selfCopy = self._reserialize(ser)\n jrdd_deserializer = selfCopy._jrdd_deserializer\n jrdd = selfCopy._jrdd.coalesce(numPartitions, shuffle)\n else:\n jrdd_deserializer = self._jrdd_deserializer\n jrdd = self._jrdd.coalesce(numPartitions, shuffle)\n return RDD(jrdd, self.ctx, jrdd_deserializer)", "response": "Return a new RDD that is reduced into numPartitions partitions."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef zip(self, other):\n def get_batch_size(ser):\n if isinstance(ser, BatchedSerializer):\n return ser.batchSize\n return 1 # not batched\n\n def batch_as(rdd, batchSize):\n return rdd._reserialize(BatchedSerializer(PickleSerializer(), batchSize))\n\n my_batch = get_batch_size(self._jrdd_deserializer)\n other_batch = get_batch_size(other._jrdd_deserializer)\n if my_batch != other_batch or not my_batch:\n # use the smallest batchSize for both of them\n batchSize = min(my_batch, other_batch)\n if batchSize <= 0:\n # auto batched or unlimited\n batchSize = 100\n other = batch_as(other, batchSize)\n self = batch_as(self, batchSize)\n\n if self.getNumPartitions() != other.getNumPartitions():\n raise ValueError(\"Can only zip with RDD which has the 
same number of partitions\")\n\n # There will be an Exception in JVM if there are different number\n # of items in each partitions.\n pairRDD = self._jrdd.zip(other._jrdd)\n deserializer = PairDeserializer(self._jrdd_deserializer,\n other._jrdd_deserializer)\n return RDD(pairRDD, self.ctx, deserializer)", "response": "Returns an RDD of key - value pairs with the same entry - level as this one."} {"instruction": "Can you create a Python 3 function that\nreturns an RDD with its element indices.", "response": "def zipWithIndex(self):\n \"\"\"\n Zips this RDD with its element indices.\n\n The ordering is first based on the partition index and then the\n ordering of items within each partition. So the first item in\n the first partition gets index 0, and the last item in the last\n partition receives the largest index.\n\n This method needs to trigger a spark job when this RDD contains\n more than one partitions.\n\n >>> sc.parallelize([\"a\", \"b\", \"c\", \"d\"], 3).zipWithIndex().collect()\n [('a', 0), ('b', 1), ('c', 2), ('d', 3)]\n \"\"\"\n starts = [0]\n if self.getNumPartitions() > 1:\n nums = self.mapPartitions(lambda it: [sum(1 for i in it)]).collect()\n for i in range(len(nums) - 1):\n starts.append(starts[-1] + nums[i])\n\n def func(k, it):\n for i, v in enumerate(it, starts[k]):\n yield v, i\n\n return self.mapPartitionsWithIndex(func)"} {"instruction": "How would you explain what the following Python 3 function does\ndef zipWithUniqueId(self):\n n = self.getNumPartitions()\n\n def func(k, it):\n for i, v in enumerate(it):\n yield v, i * n + k\n\n return self.mapPartitionsWithIndex(func)", "response": "Returns an RDD with generated unique Long ids."} {"instruction": "Write a Python 3 function that can\nget the current storage level of the RDD.", "response": "def getStorageLevel(self):\n \"\"\"\n Get the RDD's current storage level.\n\n >>> rdd1 = sc.parallelize([1,2])\n >>> rdd1.getStorageLevel()\n StorageLevel(False, False, False, False, 1)\n >>> 
print(rdd1.getStorageLevel())\n Serialized 1x Replicated\n \"\"\"\n java_storage_level = self._jrdd.getStorageLevel()\n storage_level = StorageLevel(java_storage_level.useDisk(),\n java_storage_level.useMemory(),\n java_storage_level.useOffHeap(),\n java_storage_level.deserialized(),\n java_storage_level.replication())\n return storage_level"} {"instruction": "How would you code a function in Python 3 to\nreturn the default number of partitions to use during reduce tasks.", "response": "def _defaultReducePartitions(self):\n \"\"\"\n Returns the default number of partitions to use during reduce tasks (e.g., groupBy).\n If spark.default.parallelism is set, then we'll use the value from SparkContext\n defaultParallelism, otherwise we'll use the number of partitions in this RDD.\n\n This mirrors the behavior of the Scala Partitioner#defaultPartitioner, intended to reduce\n the likelihood of OOMs. Once PySpark adopts Partitioner-based APIs, this behavior will\n be inherent.\n \"\"\"\n if self.ctx._conf.contains(\"spark.default.parallelism\"):\n return self.ctx.defaultParallelism\n else:\n return self.getNumPartitions()"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef lookup(self, key):\n values = self.filter(lambda kv: kv[0] == key).values()\n\n if self.partitioner is not None:\n return self.ctx.runJob(values, lambda x: x, [self.partitioner(key)])\n\n return values.collect()", "response": "Return the list of values in the RDD for key key. 
This operation is efficiently by searching the partitioner by only\n ."} {"instruction": "Write a Python 3 script to\nreturn a JavaRDD of Object by unpickling", "response": "def _to_java_object_rdd(self):\n \"\"\" Return a JavaRDD of Object by unpickling\n\n It will convert each Python object into Java object by Pyrolite, whenever the\n RDD is serialized in batch or not.\n \"\"\"\n rdd = self._pickled()\n return self.ctx._jvm.SerDeUtil.pythonToJava(rdd._jrdd, True)"} {"instruction": "Can you implement a function in Python 3 that\ncounts the number of incomplete entries within a given timeout.", "response": "def countApprox(self, timeout, confidence=0.95):\n \"\"\"\n .. note:: Experimental\n\n Approximate version of count() that returns a potentially incomplete\n result within a timeout, even if not all tasks have finished.\n\n >>> rdd = sc.parallelize(range(1000), 10)\n >>> rdd.countApprox(1000, 1.0)\n 1000\n \"\"\"\n drdd = self.mapPartitions(lambda it: [float(sum(1 for i in it))])\n return int(drdd.sumApprox(timeout, confidence))"} {"instruction": "How would you explain what the following Python 3 function does\ndef sumApprox(self, timeout, confidence=0.95):\n jrdd = self.mapPartitions(lambda it: [float(sum(it))])._to_java_object_rdd()\n jdrdd = self.ctx._jvm.JavaDoubleRDD.fromRDD(jrdd.rdd())\n r = jdrdd.sumApprox(timeout, confidence).getFinalValue()\n return BoundedFloat(r.mean(), r.confidence(), r.low(), r.high())", "response": "Return the sum of the elements within a given timeout."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef meanApprox(self, timeout, confidence=0.95):\n jrdd = self.map(float)._to_java_object_rdd()\n jdrdd = self.ctx._jvm.JavaDoubleRDD.fromRDD(jrdd.rdd())\n r = jdrdd.meanApprox(timeout, confidence).getFinalValue()\n return BoundedFloat(r.mean(), r.confidence(), r.low(), r.high())", "response": "Return the mean of the set of entries within a given timeout."} {"instruction": "Explain what the following 
Python 3 code does\ndef countApproxDistinct(self, relativeSD=0.05):\n if relativeSD < 0.000017:\n raise ValueError(\"relativeSD should be greater than 0.000017\")\n # the hash space in Java is 2^32\n hashRDD = self.map(lambda x: portable_hash(x) & 0xFFFFFFFF)\n return hashRDD._to_java_object_rdd().countApproxDistinct(relativeSD)", "response": "Return approximate number of distinct elements in the RDD."} {"instruction": "Write a Python 3 function for\nreturning an iterator that contains all of the elements in this RDD.", "response": "def toLocalIterator(self):\n \"\"\"\n Return an iterator that contains all of the elements in this RDD.\n The iterator will consume as much memory as the largest partition in this RDD.\n\n >>> rdd = sc.parallelize(range(10))\n >>> [x for x in rdd.toLocalIterator()]\n [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]\n \"\"\"\n with SCCallSiteSync(self.context) as css:\n sock_info = self.ctx._jvm.PythonRDD.toLocalIteratorAndServe(self._jrdd.rdd())\n return _load_from_socket(sock_info, self._jrdd_deserializer)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef mapPartitions(self, f, preservesPartitioning=False):\n def func(s, iterator):\n return f(iterator)\n return PipelinedRDD(self.rdd, func, preservesPartitioning, isFromBarrier=True)", "response": "Returns a new RDD by applying a function to each partition of the wrapped RDD."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _to_seq(sc, cols, converter=None):\n if converter:\n cols = [converter(c) for c in cols]\n return sc._jvm.PythonUtils.toSeq(cols)", "response": "Convert a list of Column names or names into a JVM Seq of Column objects."} {"instruction": "Write a Python 3 function that can\nconvert a list of Column or names into a JVM ( Scala ) List of Column objects.", "response": "def _to_list(sc, cols, converter=None):\n \"\"\"\n Convert a list of Column (or names) into a JVM (Scala) List of Column.\n\n An optional 
`converter` could be used to convert items in `cols`\n into JVM Column objects.\n \"\"\"\n if converter:\n cols = [converter(c) for c in cols]\n return sc._jvm.PythonUtils.toList(cols)"} {"instruction": "Write a Python 3 script for\ncreating a method for given unary operator", "response": "def _unary_op(name, doc=\"unary operator\"):\n \"\"\" Create a method for given unary operator \"\"\"\n def _(self):\n jc = getattr(self._jc, name)()\n return Column(jc)\n _.__doc__ = doc\n return _"} {"instruction": "Can you create a Python 3 function that\ncreates a method that returns a new object for the given binary operator.", "response": "def _bin_op(name, doc=\"binary operator\"):\n \"\"\" Create a method for given binary operator\n \"\"\"\n def _(self, other):\n jc = other._jc if isinstance(other, Column) else other\n njc = getattr(self._jc, name)(jc)\n return Column(njc)\n _.__doc__ = doc\n return _"} {"instruction": "Make a summary of the following Python 3 code\ndef _reverse_op(name, doc=\"binary operator\"):\n def _(self, other):\n jother = _create_column_from_literal(other)\n jc = getattr(jother, name)(self._jc)\n return Column(jc)\n _.__doc__ = doc\n return _", "response": "Create a method for binary operator"} {"instruction": "Create a Python 3 function for\nreturning a Column which is a substring of the column.", "response": "def substr(self, startPos, length):\n \"\"\"\n Return a :class:`Column` which is a substring of the column.\n\n :param startPos: start position (int or Column)\n :param length: length of the substring (int or Column)\n\n >>> df.select(df.name.substr(1, 3).alias(\"col\")).collect()\n [Row(col=u'Ali'), Row(col=u'Bob')]\n \"\"\"\n if type(startPos) != type(length):\n raise TypeError(\n \"startPos and length must be the same type. 
\"\n \"Got {startPos_t} and {length_t}, respectively.\"\n .format(\n startPos_t=type(startPos),\n length_t=type(length),\n ))\n if isinstance(startPos, int):\n jc = self._jc.substr(startPos, length)\n elif isinstance(startPos, Column):\n jc = self._jc.substr(startPos._jc, length._jc)\n else:\n raise TypeError(\"Unexpected type: %s\" % type(startPos))\n return Column(jc)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef isin(self, *cols):\n if len(cols) == 1 and isinstance(cols[0], (list, set)):\n cols = cols[0]\n cols = [c._jc if isinstance(c, Column) else _create_column_from_literal(c) for c in cols]\n sc = SparkContext._active_spark_context\n jc = getattr(self._jc, \"isin\")(_to_seq(sc, cols))\n return Column(jc)", "response": "A boolean expression that is evaluated to true if the value of this\n expression is contained by the evaluated values of the arguments."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef alias(self, *alias, **kwargs):\n\n metadata = kwargs.pop('metadata', None)\n assert not kwargs, 'Unexpected kwargs where passed: %s' % kwargs\n\n sc = SparkContext._active_spark_context\n if len(alias) == 1:\n if metadata:\n jmeta = sc._jvm.org.apache.spark.sql.types.Metadata.fromJson(\n json.dumps(metadata))\n return Column(getattr(self._jc, \"as\")(alias[0], jmeta))\n else:\n return Column(getattr(self._jc, \"as\")(alias[0]))\n else:\n if metadata:\n raise ValueError('metadata can only be provided for a single column')\n return Column(getattr(self._jc, \"as\")(_to_seq(sc, list(alias))))", "response": "Returns this column aliased with a new name or names."} {"instruction": "Implement a function in Python 3 to\nconvert the column into type dataType.", "response": "def cast(self, dataType):\n \"\"\" Convert the column into type ``dataType``.\n\n >>> df.select(df.age.cast(\"string\").alias('ages')).collect()\n [Row(ages=u'2'), Row(ages=u'5')]\n >>> 
df.select(df.age.cast(StringType()).alias('ages')).collect()\n [Row(ages=u'2'), Row(ages=u'5')]\n \"\"\"\n if isinstance(dataType, basestring):\n jc = self._jc.cast(dataType)\n elif isinstance(dataType, DataType):\n from pyspark.sql import SparkSession\n spark = SparkSession.builder.getOrCreate()\n jdt = spark._jsparkSession.parseDataType(dataType.json())\n jc = self._jc.cast(jdt)\n else:\n raise TypeError(\"unexpected type: %s\" % type(dataType))\n return Column(jc)"} {"instruction": "How would you code a function in Python 3 to\nevaluate a list of conditions and returns one of multiple possible result expressions.", "response": "def when(self, condition, value):\n \"\"\"\n Evaluates a list of conditions and returns one of multiple possible result expressions.\n If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.\n\n See :func:`pyspark.sql.functions.when` for example usage.\n\n :param condition: a boolean :class:`Column` expression.\n :param value: a literal value, or a :class:`Column` expression.\n\n >>> from pyspark.sql import functions as F\n >>> df.select(df.name, F.when(df.age > 4, 1).when(df.age < 3, -1).otherwise(0)).show()\n +-----+------------------------------------------------------------+\n | name|CASE WHEN (age > 4) THEN 1 WHEN (age < 3) THEN -1 ELSE 0 END|\n +-----+------------------------------------------------------------+\n |Alice| -1|\n | Bob| 1|\n +-----+------------------------------------------------------------+\n \"\"\"\n if not isinstance(condition, Column):\n raise TypeError(\"condition should be a Column\")\n v = value._jc if isinstance(value, Column) else value\n jc = self._jc.when(condition._jc, v)\n return Column(jc)"} {"instruction": "Implement a function in Python 3 to\nevaluate a list of conditions and returns one of multiple possible result expressions.", "response": "def otherwise(self, value):\n \"\"\"\n Evaluates a list of conditions and returns one of multiple possible result expressions.\n 
If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.\n\n See :func:`pyspark.sql.functions.when` for example usage.\n\n :param value: a literal value, or a :class:`Column` expression.\n\n >>> from pyspark.sql import functions as F\n >>> df.select(df.name, F.when(df.age > 3, 1).otherwise(0)).show()\n +-----+-------------------------------------+\n | name|CASE WHEN (age > 3) THEN 1 ELSE 0 END|\n +-----+-------------------------------------+\n |Alice| 0|\n | Bob| 1|\n +-----+-------------------------------------+\n \"\"\"\n v = value._jc if isinstance(value, Column) else value\n jc = self._jc.otherwise(v)\n return Column(jc)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef over(self, window):\n from pyspark.sql.window import WindowSpec\n if not isinstance(window, WindowSpec):\n raise TypeError(\"window should be WindowSpec\")\n jc = self._jc.over(window._jspec)\n return Column(jc)", "response": "Define a windowing column."} {"instruction": "How would you code a function in Python 3 to\napply transformation on a vector or an RDD of Vectors.", "response": "def transform(self, vector):\n \"\"\"\n Applies transformation on a vector or an RDD[Vector].\n\n .. 
note:: In Python, transform cannot currently be used within\n an RDD transformation or action.\n Call transform directly on the RDD instead.\n\n :param vector: Vector or RDD of Vector to be transformed.\n \"\"\"\n if isinstance(vector, RDD):\n vector = vector.map(_convert_to_vector)\n else:\n vector = _convert_to_vector(vector)\n return self.call(\"transform\", vector)"} {"instruction": "Implement a function in Python 3 to\nfit the model to the given dataset.", "response": "def fit(self, dataset):\n \"\"\"\n Computes the mean and variance and stores as a model to be used\n for later scaling.\n\n :param dataset: The data used to compute the mean and variance\n to build the transformation model.\n :return: a StandardScalarModel\n \"\"\"\n dataset = dataset.map(_convert_to_vector)\n jmodel = callMLlibFunc(\"fitStandardScaler\", self.withMean, self.withStd, dataset)\n return StandardScalerModel(jmodel)"} {"instruction": "Implement a Python 3 function for\nfitting a ChiSquared feature selector.", "response": "def fit(self, data):\n \"\"\"\n Returns a ChiSquared feature selector.\n\n :param data: an `RDD[LabeledPoint]` containing the labeled dataset\n with categorical features. 
Real-valued features will be\n treated as categorical for each distinct value.\n Apply feature discretizer before using this function.\n \"\"\"\n jmodel = callMLlibFunc(\"fitChiSqSelector\", self.selectorType, self.numTopFeatures,\n self.percentile, self.fpr, self.fdr, self.fwe, data)\n return ChiSqSelectorModel(jmodel)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef fit(self, data):\n jmodel = callMLlibFunc(\"fitPCA\", self.k, data)\n return PCAModel(jmodel)", "response": "Computes a PCAModel that contains the principal components of the input vectors."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef transform(self, document):\n if isinstance(document, RDD):\n return document.map(self.transform)\n\n freq = {}\n for term in document:\n i = self.indexOf(term)\n freq[i] = 1.0 if self.binary else freq.get(i, 0) + 1.0\n return Vectors.sparse(self.numFeatures, freq.items())", "response": "Transforms the input document to term frequency\n vectors or transform the RDD of document to RDD of term frequency\n vectors or transform the input document to RDD of term frequency\n vectors or transform the RDD of document to RDD of term frequency\n vectors."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef fit(self, dataset):\n if not isinstance(dataset, RDD):\n raise TypeError(\"dataset should be an RDD of term frequency vectors\")\n jmodel = callMLlibFunc(\"fitIDF\", self.minDocFreq, dataset.map(_convert_to_vector))\n return IDFModel(jmodel)", "response": "Computes the inverse document frequency of the given term frequency vectors."} {"instruction": "Write a Python 3 function that can\nfind synonyms of a word.", "response": "def findSynonyms(self, word, num):\n \"\"\"\n Find synonyms of a word\n\n :param word: a word or a vector representation of word\n :param num: number of synonyms to find\n :return: array of (word, cosineSimilarity)\n\n .. 
note:: Local use only\n \"\"\"\n if not isinstance(word, basestring):\n word = _convert_to_vector(word)\n words, similarity = self.call(\"findSynonyms\", word, num)\n return zip(words, similarity)"} {"instruction": "How would you explain what the following Python 3 function does\ndef load(cls, sc, path):\n jmodel = sc._jvm.org.apache.spark.mllib.feature \\\n .Word2VecModel.load(sc._jsc.sc(), path)\n model = sc._jvm.org.apache.spark.mllib.api.python.Word2VecModelWrapper(jmodel)\n return Word2VecModel(model)", "response": "Loads a word2vec model from the given path."} {"instruction": "Write a Python 3 function that can\ncompute the Hadamard product of the vector.", "response": "def transform(self, vector):\n \"\"\"\n Computes the Hadamard product of the vector.\n \"\"\"\n if isinstance(vector, RDD):\n vector = vector.map(_convert_to_vector)\n\n else:\n vector = _convert_to_vector(vector)\n return callMLlibFunc(\"elementwiseProductVector\", self.scalingVector, vector)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef predict(self, x):\n if isinstance(x, RDD):\n return self.call(\"predict\", x.map(_convert_to_vector))\n\n else:\n return self.call(\"predict\", _convert_to_vector(x))", "response": "Predict values for a single data point or an RDD of points."} {"instruction": "Write a Python 3 script to\ntrain a decision tree model for classification.", "response": "def trainClassifier(cls, data, numClasses, categoricalFeaturesInfo,\n impurity=\"gini\", maxDepth=5, maxBins=32, minInstancesPerNode=1,\n minInfoGain=0.0):\n \"\"\"\n Train a decision tree model for classification.\n\n :param data:\n Training data: RDD of LabeledPoint. Labels should take values\n {0, 1, ..., numClasses-1}.\n :param numClasses:\n Number of classes for classification.\n :param categoricalFeaturesInfo:\n Map storing arity of categorical features. 
An entry (n -> k)\n indicates that feature n is categorical with k categories\n indexed from 0: {0, 1, ..., k-1}.\n :param impurity:\n Criterion used for information gain calculation.\n Supported values: \"gini\" or \"entropy\".\n (default: \"gini\")\n :param maxDepth:\n Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1\n means 1 internal node + 2 leaf nodes).\n (default: 5)\n :param maxBins:\n Number of bins used for finding splits at each node.\n (default: 32)\n :param minInstancesPerNode:\n Minimum number of instances required at child nodes to create\n the parent split.\n (default: 1)\n :param minInfoGain:\n Minimum info gain required to create a split.\n (default: 0.0)\n :return:\n DecisionTreeModel.\n\n Example usage:\n\n >>> from numpy import array\n >>> from pyspark.mllib.regression import LabeledPoint\n >>> from pyspark.mllib.tree import DecisionTree\n >>>\n >>> data = [\n ... LabeledPoint(0.0, [0.0]),\n ... LabeledPoint(1.0, [1.0]),\n ... LabeledPoint(1.0, [2.0]),\n ... LabeledPoint(1.0, [3.0])\n ... 
]\n >>> model = DecisionTree.trainClassifier(sc.parallelize(data), 2, {})\n >>> print(model)\n DecisionTreeModel classifier of depth 1 with 3 nodes\n\n >>> print(model.toDebugString())\n DecisionTreeModel classifier of depth 1 with 3 nodes\n If (feature 0 <= 0.5)\n Predict: 0.0\n Else (feature 0 > 0.5)\n Predict: 1.0\n \n >>> model.predict(array([1.0]))\n 1.0\n >>> model.predict(array([0.0]))\n 0.0\n >>> rdd = sc.parallelize([[1.0], [0.0]])\n >>> model.predict(rdd).collect()\n [1.0, 0.0]\n \"\"\"\n return cls._train(data, \"classification\", numClasses, categoricalFeaturesInfo,\n impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)"} {"instruction": "Can you create a Python 3 function that\ntrains a Decision Tree model for regression.", "response": "def trainRegressor(cls, data, categoricalFeaturesInfo,\n impurity=\"variance\", maxDepth=5, maxBins=32, minInstancesPerNode=1,\n minInfoGain=0.0):\n \"\"\"\n Train a decision tree model for regression.\n\n :param data:\n Training data: RDD of LabeledPoint. Labels are real numbers.\n :param categoricalFeaturesInfo:\n Map storing arity of categorical features. An entry (n -> k)\n indicates that feature n is categorical with k categories\n indexed from 0: {0, 1, ..., k-1}.\n :param impurity:\n Criterion used for information gain calculation.\n The only supported value for regression is \"variance\".\n (default: \"variance\")\n :param maxDepth:\n Maximum depth of tree (e.g. 
depth 0 means 1 leaf node, depth 1\n means 1 internal node + 2 leaf nodes).\n (default: 5)\n :param maxBins:\n Number of bins used for finding splits at each node.\n (default: 32)\n :param minInstancesPerNode:\n Minimum number of instances required at child nodes to create\n the parent split.\n (default: 1)\n :param minInfoGain:\n Minimum info gain required to create a split.\n (default: 0.0)\n :return:\n DecisionTreeModel.\n\n Example usage:\n\n >>> from pyspark.mllib.regression import LabeledPoint\n >>> from pyspark.mllib.tree import DecisionTree\n >>> from pyspark.mllib.linalg import SparseVector\n >>>\n >>> sparse_data = [\n ... LabeledPoint(0.0, SparseVector(2, {0: 0.0})),\n ... LabeledPoint(1.0, SparseVector(2, {1: 1.0})),\n ... LabeledPoint(0.0, SparseVector(2, {0: 0.0})),\n ... LabeledPoint(1.0, SparseVector(2, {1: 2.0}))\n ... ]\n >>>\n >>> model = DecisionTree.trainRegressor(sc.parallelize(sparse_data), {})\n >>> model.predict(SparseVector(2, {1: 1.0}))\n 1.0\n >>> model.predict(SparseVector(2, {1: 0.0}))\n 0.0\n >>> rdd = sc.parallelize([[0.0, 1.0], [0.0, 0.0]])\n >>> model.predict(rdd).collect()\n [1.0, 0.0]\n \"\"\"\n return cls._train(data, \"regression\", 0, categoricalFeaturesInfo,\n impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, numTrees,\n featureSubsetStrategy=\"auto\", impurity=\"gini\", maxDepth=4, maxBins=32,\n seed=None):\n \"\"\"\n Train a random forest model for binary or multiclass\n classification.\n\n :param data:\n Training dataset: RDD of LabeledPoint. Labels should take values\n {0, 1, ..., numClasses-1}.\n :param numClasses:\n Number of classes for classification.\n :param categoricalFeaturesInfo:\n Map storing arity of categorical features. 
An entry (n -> k)\n indicates that feature n is categorical with k categories\n indexed from 0: {0, 1, ..., k-1}.\n :param numTrees:\n Number of trees in the random forest.\n :param featureSubsetStrategy:\n Number of features to consider for splits at each node.\n Supported values: \"auto\", \"all\", \"sqrt\", \"log2\", \"onethird\".\n If \"auto\" is set, this parameter is set based on numTrees:\n if numTrees == 1, set to \"all\";\n if numTrees > 1 (forest) set to \"sqrt\".\n (default: \"auto\")\n :param impurity:\n Criterion used for information gain calculation.\n Supported values: \"gini\" or \"entropy\".\n (default: \"gini\")\n :param maxDepth:\n Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1\n means 1 internal node + 2 leaf nodes).\n (default: 4)\n :param maxBins:\n Maximum number of bins used for splitting features.\n (default: 32)\n :param seed:\n Random seed for bootstrapping and choosing feature subsets.\n Set as None to generate seed based on system time.\n (default: None)\n :return:\n RandomForestModel that can be used for prediction.\n\n Example usage:\n\n >>> from pyspark.mllib.regression import LabeledPoint\n >>> from pyspark.mllib.tree import RandomForest\n >>>\n >>> data = [\n ... LabeledPoint(0.0, [0.0]),\n ... LabeledPoint(0.0, [1.0]),\n ... LabeledPoint(1.0, [2.0]),\n ... LabeledPoint(1.0, [3.0])\n ... 
]\n >>> model = RandomForest.trainClassifier(sc.parallelize(data), 2, {}, 3, seed=42)\n >>> model.numTrees()\n 3\n >>> model.totalNumNodes()\n 7\n >>> print(model)\n TreeEnsembleModel classifier with 3 trees\n \n >>> print(model.toDebugString())\n TreeEnsembleModel classifier with 3 trees\n \n Tree 0:\n Predict: 1.0\n Tree 1:\n If (feature 0 <= 1.5)\n Predict: 0.0\n Else (feature 0 > 1.5)\n Predict: 1.0\n Tree 2:\n If (feature 0 <= 1.5)\n Predict: 0.0\n Else (feature 0 > 1.5)\n Predict: 1.0\n \n >>> model.predict([2.0])\n 1.0\n >>> model.predict([0.0])\n 0.0\n >>> rdd = sc.parallelize([[3.0], [1.0]])\n >>> model.predict(rdd).collect()\n [1.0, 0.0]\n \"\"\"\n return cls._train(data, \"classification\", numClasses,\n categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity,\n maxDepth, maxBins, seed)", "response": "Train a RandomForestModel for binary or multiclass classification."} {"instruction": "How would you implement a function in Python 3 that\ntrains a random forest model for regression.", "response": "def trainRegressor(cls, data, categoricalFeaturesInfo, numTrees, featureSubsetStrategy=\"auto\",\n impurity=\"variance\", maxDepth=4, maxBins=32, seed=None):\n \"\"\"\n Train a random forest model for regression.\n\n :param data:\n Training dataset: RDD of LabeledPoint. Labels are real numbers.\n :param categoricalFeaturesInfo:\n Map storing arity of categorical features. 
An entry (n -> k)\n indicates that feature n is categorical with k categories\n indexed from 0: {0, 1, ..., k-1}.\n :param numTrees:\n Number of trees in the random forest.\n :param featureSubsetStrategy:\n Number of features to consider for splits at each node.\n Supported values: \"auto\", \"all\", \"sqrt\", \"log2\", \"onethird\".\n If \"auto\" is set, this parameter is set based on numTrees:\n if numTrees == 1, set to \"all\";\n if numTrees > 1 (forest) set to \"onethird\" for regression.\n (default: \"auto\")\n :param impurity:\n Criterion used for information gain calculation.\n The only supported value for regression is \"variance\".\n (default: \"variance\")\n :param maxDepth:\n Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1\n means 1 internal node + 2 leaf nodes).\n (default: 4)\n :param maxBins:\n Maximum number of bins used for splitting features.\n (default: 32)\n :param seed:\n Random seed for bootstrapping and choosing feature subsets.\n Set as None to generate seed based on system time.\n (default: None)\n :return:\n RandomForestModel that can be used for prediction.\n\n Example usage:\n\n >>> from pyspark.mllib.regression import LabeledPoint\n >>> from pyspark.mllib.tree import RandomForest\n >>> from pyspark.mllib.linalg import SparseVector\n >>>\n >>> sparse_data = [\n ... LabeledPoint(0.0, SparseVector(2, {0: 1.0})),\n ... LabeledPoint(1.0, SparseVector(2, {1: 1.0})),\n ... LabeledPoint(0.0, SparseVector(2, {0: 1.0})),\n ... LabeledPoint(1.0, SparseVector(2, {1: 2.0}))\n ... 
]\n >>>\n >>> model = RandomForest.trainRegressor(sc.parallelize(sparse_data), {}, 2, seed=42)\n >>> model.numTrees()\n 2\n >>> model.totalNumNodes()\n 4\n >>> model.predict(SparseVector(2, {1: 1.0}))\n 1.0\n >>> model.predict(SparseVector(2, {0: 1.0}))\n 0.5\n >>> rdd = sc.parallelize([[0.0, 1.0], [1.0, 0.0]])\n >>> model.predict(rdd).collect()\n [1.0, 0.5]\n \"\"\"\n return cls._train(data, \"regression\", 0, categoricalFeaturesInfo, numTrees,\n featureSubsetStrategy, impurity, maxDepth, maxBins, seed)"} {"instruction": "Make a summary of the following Python 3 code\ndef trainClassifier(cls, data, categoricalFeaturesInfo,\n loss=\"logLoss\", numIterations=100, learningRate=0.1, maxDepth=3,\n maxBins=32):\n \"\"\"\n Train a gradient-boosted trees model for classification.\n\n :param data:\n Training dataset: RDD of LabeledPoint. Labels should take values\n {0, 1}.\n :param categoricalFeaturesInfo:\n Map storing arity of categorical features. An entry (n -> k)\n indicates that feature n is categorical with k categories\n indexed from 0: {0, 1, ..., k-1}.\n :param loss:\n Loss function used for minimization during gradient boosting.\n Supported values: \"logLoss\", \"leastSquaresError\",\n \"leastAbsoluteError\".\n (default: \"logLoss\")\n :param numIterations:\n Number of iterations of boosting.\n (default: 100)\n :param learningRate:\n Learning rate for shrinking the contribution of each estimator.\n The learning rate should be between in the interval (0, 1].\n (default: 0.1)\n :param maxDepth:\n Maximum depth of tree (e.g. depth 0 means 1 leaf node, depth 1\n means 1 internal node + 2 leaf nodes).\n (default: 3)\n :param maxBins:\n Maximum number of bins used for splitting features. 
DecisionTree\n requires maxBins >= max categories.\n (default: 32)\n :return:\n GradientBoostedTreesModel that can be used for prediction.\n\n Example usage:\n\n >>> from pyspark.mllib.regression import LabeledPoint\n >>> from pyspark.mllib.tree import GradientBoostedTrees\n >>>\n >>> data = [\n ... LabeledPoint(0.0, [0.0]),\n ... LabeledPoint(0.0, [1.0]),\n ... LabeledPoint(1.0, [2.0]),\n ... LabeledPoint(1.0, [3.0])\n ... ]\n >>>\n >>> model = GradientBoostedTrees.trainClassifier(sc.parallelize(data), {}, numIterations=10)\n >>> model.numTrees()\n 10\n >>> model.totalNumNodes()\n 30\n >>> print(model) # it already has newline\n TreeEnsembleModel classifier with 10 trees\n \n >>> model.predict([2.0])\n 1.0\n >>> model.predict([0.0])\n 0.0\n >>> rdd = sc.parallelize([[2.0], [0.0]])\n >>> model.predict(rdd).collect()\n [1.0, 0.0]\n \"\"\"\n return cls._train(data, \"classification\", categoricalFeaturesInfo,\n loss, numIterations, learningRate, maxDepth, maxBins)", "response": "Train a gradient - boosted trees model for classification."} {"instruction": "Create a Python 3 function for\nsetting a configuration property.", "response": "def set(self, key, value):\n \"\"\"Set a configuration property.\"\"\"\n # Try to set self._jconf first if JVM is created, set self._conf if JVM is not created yet.\n if self._jconf is not None:\n self._jconf.set(key, unicode(value))\n else:\n self._conf[key] = unicode(value)\n return self"} {"instruction": "Write a Python 3 function for\nsetting a configuration property if not already set.", "response": "def setIfMissing(self, key, value):\n \"\"\"Set a configuration property, if not already set.\"\"\"\n if self.get(key) is None:\n self.set(key, value)\n return self"} {"instruction": "Explain what the following Python 3 code does\ndef setExecutorEnv(self, key=None, value=None, pairs=None):\n if (key is not None and pairs is not None) or (key is None and pairs is None):\n raise Exception(\"Either pass one key-value pair or a list of 
pairs\")\n elif key is not None:\n self.set(\"spark.executorEnv.\" + key, value)\n elif pairs is not None:\n for (k, v) in pairs:\n self.set(\"spark.executorEnv.\" + k, v)\n return self", "response": "Set an environment variable to be passed to executors."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef setAll(self, pairs):\n for (k, v) in pairs:\n self.set(k, v)\n return self", "response": "Set multiple parameters passed as a list of key - value pairs."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef get(self, key, defaultValue=None):\n if defaultValue is None: # Py4J doesn't call the right get() if we pass None\n if self._jconf is not None:\n if not self._jconf.contains(key):\n return None\n return self._jconf.get(key)\n else:\n if key not in self._conf:\n return None\n return self._conf[key]\n else:\n if self._jconf is not None:\n return self._jconf.get(key, defaultValue)\n else:\n return self._conf.get(key, defaultValue)", "response": "Get the configured value for some key or return a default otherwise."} {"instruction": "How would you explain what the following Python 3 function does\ndef getAll(self):\n if self._jconf is not None:\n return [(elem._1(), elem._2()) for elem in self._jconf.getAll()]\n else:\n return self._conf.items()", "response": "Get all values as a list of key - value pairs."} {"instruction": "Can you create a Python 3 function that\ndoes this configuration contain a given key?", "response": "def contains(self, key):\n \"\"\"Does this configuration contain a given key?\"\"\"\n if self._jconf is not None:\n return self._jconf.contains(key)\n else:\n return key in self._conf"} {"instruction": "Create a Python 3 function to\nreturn a printable version of the configuration as a list of key = value pairs one per line.", "response": "def toDebugString(self):\n \"\"\"\n Returns a printable version of the configuration, as a list of\n key=value pairs, one per line.\n 
\"\"\"\n if self._jconf is not None:\n return self._jconf.toDebugString()\n else:\n return '\\n'.join('%s=%s' % (k, v) for k, v in self._conf.items())"} {"instruction": "Implement a function in Python 3 to\nreturn a list of databases available across all sessions.", "response": "def listDatabases(self):\n \"\"\"Returns a list of databases available across all sessions.\"\"\"\n iter = self._jcatalog.listDatabases().toLocalIterator()\n databases = []\n while iter.hasNext():\n jdb = iter.next()\n databases.append(Database(\n name=jdb.name(),\n description=jdb.description(),\n locationUri=jdb.locationUri()))\n return databases"} {"instruction": "Write a Python 3 script for\nreturning a list of tables in the specified database.", "response": "def listTables(self, dbName=None):\n \"\"\"Returns a list of tables/views in the specified database.\n\n If no database is specified, the current database is used.\n This includes all temporary views.\n \"\"\"\n if dbName is None:\n dbName = self.currentDatabase()\n iter = self._jcatalog.listTables(dbName).toLocalIterator()\n tables = []\n while iter.hasNext():\n jtable = iter.next()\n tables.append(Table(\n name=jtable.name(),\n database=jtable.database(),\n description=jtable.description(),\n tableType=jtable.tableType(),\n isTemporary=jtable.isTemporary()))\n return tables"} {"instruction": "Implement a Python 3 function for\nreturning a list of functions registered in the specified database.", "response": "def listFunctions(self, dbName=None):\n \"\"\"Returns a list of functions registered in the specified database.\n\n If no database is specified, the current database is used.\n This includes all temporary functions.\n \"\"\"\n if dbName is None:\n dbName = self.currentDatabase()\n iter = self._jcatalog.listFunctions(dbName).toLocalIterator()\n functions = []\n while iter.hasNext():\n jfunction = iter.next()\n functions.append(Function(\n name=jfunction.name(),\n description=jfunction.description(),\n 
className=jfunction.className(),\n isTemporary=jfunction.isTemporary()))\n return functions"} {"instruction": "Create a Python 3 function for\nreturning a list of columns for the given table or view in the specified database.", "response": "def listColumns(self, tableName, dbName=None):\n \"\"\"Returns a list of columns for the given table/view in the specified database.\n\n If no database is specified, the current database is used.\n\n Note: the order of arguments here is different from that of its JVM counterpart\n because Python does not support method overloading.\n \"\"\"\n if dbName is None:\n dbName = self.currentDatabase()\n iter = self._jcatalog.listColumns(dbName, tableName).toLocalIterator()\n columns = []\n while iter.hasNext():\n jcolumn = iter.next()\n columns.append(Column(\n name=jcolumn.name(),\n description=jcolumn.description(),\n dataType=jcolumn.dataType(),\n nullable=jcolumn.nullable(),\n isPartition=jcolumn.isPartition(),\n isBucket=jcolumn.isBucket()))\n return columns"} {"instruction": "Write a Python 3 script for\ncreating an external table based on the dataset in a data source.", "response": "def createExternalTable(self, tableName, path=None, source=None, schema=None, **options):\n \"\"\"Creates a table based on the dataset in a data source.\n\n It returns the DataFrame associated with the external table.\n\n The data source is specified by the ``source`` and a set of ``options``.\n If ``source`` is not specified, the default data source configured by\n ``spark.sql.sources.default`` will be used.\n\n Optionally, a schema can be provided as the schema of the returned :class:`DataFrame` and\n created external table.\n\n :return: :class:`DataFrame`\n \"\"\"\n warnings.warn(\n \"createExternalTable is deprecated since Spark 2.2, please use createTable instead.\",\n DeprecationWarning)\n return self.createTable(tableName, path, source, schema, **options)"} {"instruction": "How would you explain what the following Python 3 function does\ndef 
createTable(self, tableName, path=None, source=None, schema=None, **options):\n if path is not None:\n options[\"path\"] = path\n if source is None:\n source = self._sparkSession._wrapped._conf.defaultDataSourceName()\n if schema is None:\n df = self._jcatalog.createTable(tableName, source, options)\n else:\n if not isinstance(schema, StructType):\n raise TypeError(\"schema should be StructType\")\n scala_datatype = self._jsparkSession.parseDataType(schema.json())\n df = self._jcatalog.createTable(tableName, source, scala_datatype, options)\n return DataFrame(df, self._sparkSession._wrapped)", "response": "Creates a table based on the dataset in a data source."} {"instruction": "Explain what the following Python 3 code does\ndef _load_from_socket(port, auth_secret):\n (sockfile, sock) = local_connect_and_auth(port, auth_secret)\n # The barrier() call may block forever, so no timeout\n sock.settimeout(None)\n # Make a barrier() function call.\n write_int(BARRIER_FUNCTION, sockfile)\n sockfile.flush()\n\n # Collect result.\n res = UTF8Deserializer().loads(sockfile)\n\n # Release resources.\n sockfile.close()\n sock.close()\n\n return res", "response": "Load data from a given socket."} {"instruction": "How would you explain what the following Python 3 function does\ndef _getOrCreate(cls):\n if not isinstance(cls._taskContext, BarrierTaskContext):\n cls._taskContext = object.__new__(cls)\n return cls._taskContext", "response": "Internal function to get or create BarrierTaskContext object."} {"instruction": "Write a Python 3 script to\ninitialize BarrierTaskContext, other methods within BarrierTaskContext can only be called after BarrierTaskContext is initialized.", "response": "def _initialize(cls, port, secret):\n \"\"\"\n Initialize BarrierTaskContext, other methods within BarrierTaskContext can only be called\n after BarrierTaskContext is initialized.\n \"\"\"\n cls._port = port\n cls._secret = secret"} {"instruction": "How would you implement a function in Python 3 
that\nsets a global barrier and waits until all tasks in this stage hit this barrier.", "response": "def barrier(self):\n \"\"\"\n .. note:: Experimental\n\n Sets a global barrier and waits until all tasks in this stage hit this barrier.\n Similar to `MPI_Barrier` function in MPI, this function blocks until all tasks\n in the same stage have reached this routine.\n\n .. warning:: In a barrier stage, each task much have the same number of `barrier()`\n calls, in all possible code branches.\n Otherwise, you may get the job hanging or a SparkException after timeout.\n\n .. versionadded:: 2.4.0\n \"\"\"\n if self._port is None or self._secret is None:\n raise Exception(\"Not supported to call barrier() before initialize \" +\n \"BarrierTaskContext.\")\n else:\n _load_from_socket(self._port, self._secret)"} {"instruction": "Make a summary of the following Python 3 code\ndef getTaskInfos(self):\n if self._port is None or self._secret is None:\n raise Exception(\"Not supported to call getTaskInfos() before initialize \" +\n \"BarrierTaskContext.\")\n else:\n addresses = self._localProperties.get(\"addresses\", \"\")\n return [BarrierTaskInfo(h.strip()) for h in addresses.split(\",\")]", "response": "Returns a list of BarrierTaskInfo objects for all tasks in this barrier stage ordered by partition ID."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef since(version):\n import re\n indent_p = re.compile(r'\\n( +)')\n\n def deco(f):\n indents = indent_p.findall(f.__doc__)\n indent = ' ' * (min(len(m) for m in indents) if indents else 0)\n f.__doc__ = f.__doc__.rstrip() + \"\\n\\n%s.. 
versionadded:: %s\" % (indent, version)\n return f\n return deco", "response": "A decorator that adds the version of Spark the function was added."} {"instruction": "Explain what the following Python 3 code does\ndef copy_func(f, name=None, sinceversion=None, doc=None):\n # See\n # http://stackoverflow.com/questions/6527633/how-can-i-make-a-deepcopy-of-a-function-in-python\n fn = types.FunctionType(f.__code__, f.__globals__, name or f.__name__, f.__defaults__,\n f.__closure__)\n # in case f was given attrs (note this dict is a shallow copy):\n fn.__dict__.update(f.__dict__)\n if doc is not None:\n fn.__doc__ = doc\n if sinceversion is not None:\n fn = since(sinceversion)(fn)\n return fn", "response": "Returns a copy of a function f."} {"instruction": "Implement a function in Python 3 to\ngenerate the header part for a param.", "response": "def _gen_param_header(name, doc, defaultValueStr, typeConverter):\n \"\"\"\n Generates the header part for shared variables\n\n :param name: param name\n :param doc: param doc\n \"\"\"\n template = '''class Has$Name(Params):\n \"\"\"\n Mixin for param $name: $doc\n \"\"\"\n\n $name = Param(Params._dummy(), \"$name\", \"$doc\", typeConverter=$typeConverter)\n\n def __init__(self):\n super(Has$Name, self).__init__()'''\n\n if defaultValueStr is not None:\n template += '''\n self._setDefault($name=$defaultValueStr)'''\n\n Name = name[0].upper() + name[1:]\n if typeConverter is None:\n typeConverter = str(None)\n return template \\\n .replace(\"$name\", name) \\\n .replace(\"$Name\", Name) \\\n .replace(\"$doc\", doc) \\\n .replace(\"$defaultValueStr\", str(defaultValueStr)) \\\n .replace(\"$typeConverter\", typeConverter)"} {"instruction": "How would you explain what the following Python 3 function does\ndef _gen_param_code(name, doc, defaultValueStr):\n # TODO: How to correctly inherit instance attributes?\n template = '''\n def set$Name(self, value):\n \"\"\"\n Sets the value of :py:attr:`$name`.\n \"\"\"\n return 
self._set($name=value)\n\n def get$Name(self):\n \"\"\"\n Gets the value of $name or its default value.\n \"\"\"\n return self.getOrDefault(self.$name)'''\n\n Name = name[0].upper() + name[1:]\n return template \\\n .replace(\"$name\", name) \\\n .replace(\"$Name\", Name) \\\n .replace(\"$doc\", doc) \\\n .replace(\"$defaultValueStr\", str(defaultValueStr))", "response": "Generates code for a shared param class."} {"instruction": "Write a Python 3 script to\nrun the bisecting k - means algorithm and returns the model.", "response": "def train(self, rdd, k=4, maxIterations=20, minDivisibleClusterSize=1.0, seed=-1888008604):\n \"\"\"\n Runs the bisecting k-means algorithm return the model.\n\n :param rdd:\n Training points as an `RDD` of `Vector` or convertible\n sequence types.\n :param k:\n The desired number of leaf clusters. The actual number could\n be smaller if there are no divisible leaf clusters.\n (default: 4)\n :param maxIterations:\n Maximum number of iterations allowed to split clusters.\n (default: 20)\n :param minDivisibleClusterSize:\n Minimum number of points (if >= 1.0) or the minimum proportion\n of points (if < 1.0) of a divisible cluster.\n (default: 1)\n :param seed:\n Random seed value for cluster initialization.\n (default: -1888008604 from classOf[BisectingKMeans].getName.##)\n \"\"\"\n java_model = callMLlibFunc(\n \"trainBisectingKMeans\", rdd.map(_convert_to_vector),\n k, maxIterations, minDivisibleClusterSize, seed)\n return BisectingKMeansModel(java_model)"} {"instruction": "Create a Python 3 function to\ntrain a k - means clustering model on the given data set.", "response": "def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode=\"k-means||\",\n seed=None, initializationSteps=2, epsilon=1e-4, initialModel=None):\n \"\"\"\n Train a k-means clustering model.\n\n :param rdd:\n Training points as an `RDD` of `Vector` or convertible\n sequence types.\n :param k:\n Number of clusters to create.\n :param maxIterations:\n Maximum 
number of iterations allowed.\n (default: 100)\n :param runs:\n This param has no effect since Spark 2.0.0.\n :param initializationMode:\n The initialization algorithm. This can be either \"random\" or\n \"k-means||\".\n (default: \"k-means||\")\n :param seed:\n Random seed value for cluster initialization. Set as None to\n generate seed based on system time.\n (default: None)\n :param initializationSteps:\n Number of steps for the k-means|| initialization mode.\n This is an advanced setting -- the default of 2 is almost\n always enough.\n (default: 2)\n :param epsilon:\n Distance threshold within which a center will be considered to\n have converged. If all centers move less than this Euclidean\n distance, iterations are stopped.\n (default: 1e-4)\n :param initialModel:\n Initial cluster centers can be provided as a KMeansModel object\n rather than using the random or k-means|| initializationModel.\n (default: None)\n \"\"\"\n if runs != 1:\n warnings.warn(\"The param `runs` has no effect since Spark 2.0.0.\")\n clusterInitialModel = []\n if initialModel is not None:\n if not isinstance(initialModel, KMeansModel):\n raise Exception(\"initialModel is of \"+str(type(initialModel))+\". 
It needs \"\n \"to be of \")\n clusterInitialModel = [_convert_to_vector(c) for c in initialModel.clusterCenters]\n model = callMLlibFunc(\"trainKMeansModel\", rdd.map(_convert_to_vector), k, maxIterations,\n runs, initializationMode, seed, initializationSteps, epsilon,\n clusterInitialModel)\n centers = callJavaFunc(rdd.context, model.clusterCenters)\n return KMeansModel([c.toArray() for c in centers])"} {"instruction": "Can you create a Python 3 function that\ntrains a Gaussian Mixture clustering model.", "response": "def train(cls, rdd, k, convergenceTol=1e-3, maxIterations=100, seed=None, initialModel=None):\n \"\"\"\n Train a Gaussian Mixture clustering model.\n\n :param rdd:\n Training points as an `RDD` of `Vector` or convertible\n sequence types.\n :param k:\n Number of independent Gaussians in the mixture model.\n :param convergenceTol:\n Maximum change in log-likelihood at which convergence is\n considered to have occurred.\n (default: 1e-3)\n :param maxIterations:\n Maximum number of iterations allowed.\n (default: 100)\n :param seed:\n Random seed for initial Gaussian distribution. 
Set as None to\n generate seed based on system time.\n (default: None)\n :param initialModel:\n Initial GMM starting point, bypassing the random\n initialization.\n (default: None)\n \"\"\"\n initialModelWeights = None\n initialModelMu = None\n initialModelSigma = None\n if initialModel is not None:\n if initialModel.k != k:\n raise Exception(\"Mismatched cluster count, initialModel.k = %s, however k = %s\"\n % (initialModel.k, k))\n initialModelWeights = list(initialModel.weights)\n initialModelMu = [initialModel.gaussians[i].mu for i in range(initialModel.k)]\n initialModelSigma = [initialModel.gaussians[i].sigma for i in range(initialModel.k)]\n java_model = callMLlibFunc(\"trainGaussianMixtureModel\", rdd.map(_convert_to_vector),\n k, convergenceTol, maxIterations, seed,\n initialModelWeights, initialModelMu, initialModelSigma)\n return GaussianMixtureModel(java_model)"} {"instruction": "Implement a Python 3 function for\nloading a PowerIterationClusteringModel from the given path.", "response": "def load(cls, sc, path):\n \"\"\"\n Load a model from the given path.\n \"\"\"\n model = cls._load_java(sc, path)\n wrapper =\\\n sc._jvm.org.apache.spark.mllib.api.python.PowerIterationClusteringModelWrapper(model)\n return PowerIterationClusteringModel(wrapper)"} {"instruction": "Explain what the following Python 3 code does\ndef train(cls, rdd, k, maxIterations=100, initMode=\"random\"):\n model = callMLlibFunc(\"trainPowerIterationClusteringModel\",\n rdd.map(_convert_to_vector), int(k), int(maxIterations), initMode)\n return PowerIterationClusteringModel(model)", "response": "r Trains a power iteration clustering model on the given RDD."} {"instruction": "How would you explain what the following Python 3 function does\ndef update(self, data, decayFactor, timeUnit):\n if not isinstance(data, RDD):\n raise TypeError(\"Data should be of an RDD, got %s.\" % type(data))\n data = data.map(_convert_to_vector)\n decayFactor = float(decayFactor)\n if timeUnit not in 
[\"batches\", \"points\"]:\n raise ValueError(\n \"timeUnit should be 'batches' or 'points', got %s.\" % timeUnit)\n vectorCenters = [_convert_to_vector(center) for center in self.centers]\n updatedModel = callMLlibFunc(\n \"updateStreamingKMeansModel\", vectorCenters, self._clusterWeights,\n data, decayFactor, timeUnit)\n self.centers = array(updatedModel[0])\n self._clusterWeights = list(updatedModel[1])\n return self", "response": "Update the k - MEAN model with new data."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef setHalfLife(self, halfLife, timeUnit):\n self._timeUnit = timeUnit\n self._decayFactor = exp(log(0.5) / halfLife)\n return self", "response": "Sets the number of batches after which the centroids of that set have half the weightage."} {"instruction": "Implement a function in Python 3 to\nset initial centers for this store.", "response": "def setInitialCenters(self, centers, weights):\n \"\"\"\n Set initial centers. Should be set before calling trainOn.\n \"\"\"\n self._model = StreamingKMeansModel(centers, weights)\n return self"} {"instruction": "Write a Python 3 script for\nsetting the initial centres to be random samples from the gaussian population with constant weights.", "response": "def setRandomCenters(self, dim, weight, seed):\n \"\"\"\n Set the initial centres to be random samples from\n a gaussian population with constant weights.\n \"\"\"\n rng = random.RandomState(seed)\n clusterCenters = rng.randn(self._k, dim)\n clusterWeights = tile(weight, self._k)\n self._model = StreamingKMeansModel(clusterCenters, clusterWeights)\n return self"} {"instruction": "Create a Python 3 function to\ntrain the model on the incoming dstream.", "response": "def trainOn(self, dstream):\n \"\"\"Train the model on the incoming dstream.\"\"\"\n self._validate(dstream)\n\n def update(rdd):\n self._model.update(rdd, self._decayFactor, self._timeUnit)\n\n dstream.foreachRDD(update)"} {"instruction": "Here you have a function 
in Python 3, explain what it does\ndef predictOn(self, dstream):\n self._validate(dstream)\n return dstream.map(lambda x: self._model.predict(x))", "response": "Make predictions on a dstream."} {"instruction": "Create a Python 3 function to\nmake predictions on a keyed dstream.", "response": "def predictOnValues(self, dstream):\n \"\"\"\n Make predictions on a keyed dstream.\n Returns a transformed dstream object.\n \"\"\"\n self._validate(dstream)\n return dstream.mapValues(lambda x: self._model.predict(x))"} {"instruction": "Implement a Python 3 function for\nreturning the topics described by weighted terms.", "response": "def describeTopics(self, maxTermsPerTopic=None):\n \"\"\"Return the topics described by weighted terms.\n\n WARNING: If vocabSize and k are large, this can return a large object!\n\n :param maxTermsPerTopic:\n Maximum number of terms to collect for each topic.\n (default: vocabulary size)\n :return:\n Array over topics. Each topic is represented as a pair of\n matching arrays: (term indices, term weights in topic).\n Each topic's terms are sorted in order of decreasing weight.\n \"\"\"\n if maxTermsPerTopic is None:\n topics = self.call(\"describeTopics\")\n else:\n topics = self.call(\"describeTopics\", maxTermsPerTopic)\n return topics"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef load(cls, sc, path):\n if not isinstance(sc, SparkContext):\n raise TypeError(\"sc should be a SparkContext, got type %s\" % type(sc))\n if not isinstance(path, basestring):\n raise TypeError(\"path should be a basestring, got type %s\" % type(path))\n model = callMLlibFunc(\"loadLDAModel\", sc, path)\n return LDAModel(model)", "response": "Load the LDAModel from disk."} {"instruction": "Make a summary of the following Python 3 code\ndef train(cls, rdd, k=10, maxIterations=20, docConcentration=-1.0,\n topicConcentration=-1.0, seed=None, checkpointInterval=10, optimizer=\"em\"):\n \"\"\"Train a LDA model.\n\n :param rdd:\n RDD of 
documents, which are tuples of document IDs and term\n (word) count vectors. The term count vectors are \"bags of\n words\" with a fixed-size vocabulary (where the vocabulary size\n is the length of the vector). Document IDs must be unique\n and >= 0.\n :param k:\n Number of topics to infer, i.e., the number of soft cluster\n centers.\n (default: 10)\n :param maxIterations:\n Maximum number of iterations allowed.\n (default: 20)\n :param docConcentration:\n Concentration parameter (commonly named \"alpha\") for the prior\n placed on documents' distributions over topics (\"theta\").\n (default: -1.0)\n :param topicConcentration:\n Concentration parameter (commonly named \"beta\" or \"eta\") for\n the prior placed on topics' distributions over terms.\n (default: -1.0)\n :param seed:\n Random seed for cluster initialization. Set as None to generate\n seed based on system time.\n (default: None)\n :param checkpointInterval:\n Period (in iterations) between checkpoints.\n (default: 10)\n :param optimizer:\n LDAOptimizer used to perform the actual calculation. 
Currently\n \"em\", \"online\" are supported.\n (default: \"em\")\n \"\"\"\n model = callMLlibFunc(\"trainLDAModel\", rdd, k, maxIterations,\n docConcentration, topicConcentration, seed,\n checkpointInterval, optimizer)\n return LDAModel(model)", "response": "Train a LDA model."} {"instruction": "Explain what the following Python 3 code does\ndef _to_java_object_rdd(rdd):\n rdd = rdd._reserialize(AutoBatchedSerializer(PickleSerializer()))\n return rdd.ctx._jvm.org.apache.spark.mllib.api.python.SerDe.pythonToJava(rdd._jrdd, True)", "response": "Return a JavaRDD of Object by unpickling\n "} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _py2java(sc, obj):\n if isinstance(obj, RDD):\n obj = _to_java_object_rdd(obj)\n elif isinstance(obj, DataFrame):\n obj = obj._jdf\n elif isinstance(obj, SparkContext):\n obj = obj._jsc\n elif isinstance(obj, list):\n obj = [_py2java(sc, x) for x in obj]\n elif isinstance(obj, JavaObject):\n pass\n elif isinstance(obj, (int, long, float, bool, bytes, unicode)):\n pass\n else:\n data = bytearray(PickleSerializer().dumps(obj))\n obj = sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads(data)\n return obj", "response": "Convert Python object into Java object."} {"instruction": "Explain what the following Python 3 code does\ndef callMLlibFunc(name, *args):\n sc = SparkContext.getOrCreate()\n api = getattr(sc._jvm.PythonMLLibAPI(), name)\n return callJavaFunc(sc, api, *args)", "response": "Call the specified PythonMLLib API with the given arguments."} {"instruction": "Implement a function in Python 3 to\ncall method of java_model", "response": "def call(self, name, *a):\n \"\"\"Call method of java_model\"\"\"\n return callJavaFunc(self._sc, getattr(self._java_model, name), *a)"} {"instruction": "How would you implement a function in Python 3 that\nreturns a new DStream in which each RDD has a single element generated by counting each element of this DStream.", "response": "def count(self):\n \"\"\"\n 
Return a new DStream in which each RDD has a single element\n generated by counting each RDD of this DStream.\n \"\"\"\n return self.mapPartitions(lambda i: [sum(1 for _ in i)]).reduce(operator.add)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef filter(self, f):\n def func(iterator):\n return filter(f, iterator)\n return self.mapPartitions(func, True)", "response": "Return a new DStream containing only the elements that satisfy the predicate."} {"instruction": "Can you implement a function in Python 3 that\nreturns a new DStream by applying a function to each element of the DStream.", "response": "def map(self, f, preservesPartitioning=False):\n \"\"\"\n Return a new DStream by applying a function to each element of DStream.\n \"\"\"\n def func(iterator):\n return map(f, iterator)\n return self.mapPartitions(func, preservesPartitioning)"} {"instruction": "How would you code a function in Python 3 to\nreturn a new DStream by applying a function to each RDD of each RDD in which each RDD is generated by applying mapPartitionsWithIndex.", "response": "def mapPartitionsWithIndex(self, f, preservesPartitioning=False):\n \"\"\"\n Return a new DStream in which each RDD is generated by applying\n mapPartitionsWithIndex() to each RDDs of this DStream.\n \"\"\"\n return self.transform(lambda rdd: rdd.mapPartitionsWithIndex(f, preservesPartitioning))"} {"instruction": "Given the following Python 3 function, write the documentation\ndef reduce(self, func):\n return self.map(lambda x: (None, x)).reduceByKey(func, 1).map(lambda x: x[1])", "response": "Return a new DStream in which each RDD has a single element\n generated by reducing each element of each RDD."} {"instruction": "Explain what the following Python 3 code does\ndef reduceByKey(self, func, numPartitions=None):\n if numPartitions is None:\n numPartitions = self._sc.defaultParallelism\n return self.combineByKey(lambda x: x, func, func, numPartitions)", "response": "Return a new 
DStream by applying reduceByKey to each RDD."} {"instruction": "Write a Python 3 script to\nreturn a new DStream by applying combineByKey to each RDD.", "response": "def combineByKey(self, createCombiner, mergeValue, mergeCombiners,\n numPartitions=None):\n \"\"\"\n Return a new DStream by applying combineByKey to each RDD.\n \"\"\"\n if numPartitions is None:\n numPartitions = self._sc.defaultParallelism\n\n def func(rdd):\n return rdd.combineByKey(createCombiner, mergeValue, mergeCombiners, numPartitions)\n return self.transform(func)"} {"instruction": "Make a summary of the following Python 3 code\ndef partitionBy(self, numPartitions, partitionFunc=portable_hash):\n return self.transform(lambda rdd: rdd.partitionBy(numPartitions, partitionFunc))", "response": "Return a new DStream in which each RDD is partitioned by numPartitions partitions."} {"instruction": "Given the following Python 3 function, write the documentation\ndef foreachRDD(self, func):\n if func.__code__.co_argcount == 1:\n old_func = func\n func = lambda t, rdd: old_func(rdd)\n jfunc = TransformFunction(self._sc, func, self._jrdd_deserializer)\n api = self._ssc._jvm.PythonDStream\n api.callForeachRDD(self._jdstream, jfunc)", "response": "Apply a function to each RDD in this DStream."} {"instruction": "Create a Python 3 function to\nprint the first num elements of each RDD generated in this DStream.", "response": "def pprint(self, num=10):\n \"\"\"\n Print the first num elements of each RDD generated in this DStream.\n\n @param num: the number of elements from the first will be printed.\n \"\"\"\n def takeAndPrint(time, rdd):\n taken = rdd.take(num + 1)\n print(\"-------------------------------------------\")\n print(\"Time: %s\" % time)\n print(\"-------------------------------------------\")\n for record in taken[:num]:\n print(record)\n if len(taken) > num:\n print(\"...\")\n print(\"\")\n\n self.foreachRDD(takeAndPrint)"} {"instruction": "How would you implement a function in Python 3 
that\npersists the RDDs of this DStream with the given storage level.", "response": "def persist(self, storageLevel):\n \"\"\"\n Persist the RDDs of this DStream with the given storage level\n \"\"\"\n self.is_cached = True\n javaStorageLevel = self._sc._getJavaStorageLevel(storageLevel)\n self._jdstream.persist(javaStorageLevel)\n return self"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef checkpoint(self, interval):\n self.is_checkpointed = True\n self._jdstream.checkpoint(self._ssc._jduration(interval))\n return self", "response": "Enable periodic checkpointing of generated\n RDDs of this DStream."} {"instruction": "Make a summary of the following Python 3 code\ndef groupByKey(self, numPartitions=None):\n if numPartitions is None:\n numPartitions = self._sc.defaultParallelism\n return self.transform(lambda rdd: rdd.groupByKey(numPartitions))", "response": "Return a new DStream by applying groupByKey on each RDD."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef countByValue(self):\n return self.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x+y)", "response": "Return a new DStream with the counts of each distinct value in each RDD."} {"instruction": "How would you code a function in Python 3 to\nsave each RDD in this DStream as text files using string representation of elements.", "response": "def saveAsTextFiles(self, prefix, suffix=None):\n \"\"\"\n Save each RDD in this DStream as at text file, using string\n representation of elements.\n \"\"\"\n def saveAsTextFile(t, rdd):\n path = rddToFileName(prefix, suffix, t)\n try:\n rdd.saveAsTextFile(path)\n except Py4JJavaError as e:\n # after recovered from checkpointing, the foreachRDD may\n # be called twice\n if 'FileAlreadyExistsException' not in str(e):\n raise\n return self.foreachRDD(saveAsTextFile)"} {"instruction": "Make a summary of the following Python 3 code\ndef transform(self, func):\n if func.__code__.co_argcount == 1:\n oldfunc = 
func\n func = lambda t, rdd: oldfunc(rdd)\n assert func.__code__.co_argcount == 2, \"func should take one or two arguments\"\n return TransformedDStream(self, func)", "response": "Return a new DStream in which each RDD is generated by applying a function\n on each RDD of this DStream."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef transformWith(self, func, other, keepSerializer=False):\n if func.__code__.co_argcount == 2:\n oldfunc = func\n func = lambda t, a, b: oldfunc(a, b)\n assert func.__code__.co_argcount == 3, \"func should take two or three arguments\"\n jfunc = TransformFunction(self._sc, func, self._jrdd_deserializer, other._jrdd_deserializer)\n dstream = self._sc._jvm.PythonTransformed2DStream(self._jdstream.dstream(),\n other._jdstream.dstream(), jfunc)\n jrdd_serializer = self._jrdd_deserializer if keepSerializer else self._sc.serializer\n return DStream(dstream.asJavaDStream(), self._ssc, jrdd_serializer)", "response": "Return a new DStream in which each RDD is generated by applying a functionon each RDD of this DStream and other DStreams."} {"instruction": "Implement a Python 3 function for\nreturning a new DStream by unifying data of another DStream with this DStream.", "response": "def union(self, other):\n \"\"\"\n Return a new DStream by unifying data of another DStream with this DStream.\n\n @param other: Another DStream having the same interval (i.e., slideDuration)\n as this DStream.\n \"\"\"\n if self._slideDuration != other._slideDuration:\n raise ValueError(\"the two DStream should have same slide duration\")\n return self.transformWith(lambda a, b: a.union(b), other, True)"} {"instruction": "Write a Python 3 function for\nreturning a new DStream by applying cogroup between this DStream and other.", "response": "def cogroup(self, other, numPartitions=None):\n \"\"\"\n Return a new DStream by applying 'cogroup' between RDDs of this\n DStream and `other` DStream.\n\n Hash partitioning is used to 
generate the RDDs with `numPartitions` partitions.\n \"\"\"\n if numPartitions is None:\n numPartitions = self._sc.defaultParallelism\n return self.transformWith(lambda a, b: a.cogroup(b, numPartitions), other)"} {"instruction": "How would you implement a function in Python 3 that\nconverts datetime or unix_timestamp into Time", "response": "def _jtime(self, timestamp):\n \"\"\" Convert datetime or unix_timestamp into Time\n \"\"\"\n if isinstance(timestamp, datetime):\n timestamp = time.mktime(timestamp.timetuple())\n return self._sc._jvm.Time(long(timestamp * 1000))"} {"instruction": "Write a Python 3 function for\nreturning all the RDDs between begin and end.", "response": "def slice(self, begin, end):\n \"\"\"\n Return all the RDDs between 'begin' to 'end' (both included)\n\n `begin`, `end` could be datetime.datetime() or unix_timestamp\n \"\"\"\n jrdds = self._jdstream.slice(self._jtime(begin), self._jtime(end))\n return [RDD(jrdd, self._sc, self._jrdd_deserializer) for jrdd in jrdds]"} {"instruction": "Create a Python 3 function for\nreturning a new DStream in which each element in seen in this DStream contains all the elements in seen in a sliding window of time over this DStream.", "response": "def window(self, windowDuration, slideDuration=None):\n \"\"\"\n Return a new DStream in which each RDD contains all the elements in seen in a\n sliding window of time over this DStream.\n\n @param windowDuration: width of the window; must be a multiple of this DStream's\n batching interval\n @param slideDuration: sliding interval of the window (i.e., the interval after which\n the new DStream will generate RDDs); must be a multiple of this\n DStream's batching interval\n \"\"\"\n self._validate_window_param(windowDuration, slideDuration)\n d = self._ssc._jduration(windowDuration)\n if slideDuration is None:\n return DStream(self._jdstream.window(d), self._ssc, self._jrdd_deserializer)\n s = self._ssc._jduration(slideDuration)\n return 
DStream(self._jdstream.window(d, s), self._ssc, self._jrdd_deserializer)"} {"instruction": "Make a summary of the following Python 3 code\ndef reduceByWindow(self, reduceFunc, invReduceFunc, windowDuration, slideDuration):\n keyed = self.map(lambda x: (1, x))\n reduced = keyed.reduceByKeyAndWindow(reduceFunc, invReduceFunc,\n windowDuration, slideDuration, 1)\n return reduced.map(lambda kv: kv[1])", "response": "Return a new DStream with a single element generated by reducing all the entries in a sliding window over this DStream."} {"instruction": "Can you create a Python 3 function that\nreturns a new DStream with a count of the number of elements in a given window over this DStream.", "response": "def countByWindow(self, windowDuration, slideDuration):\n \"\"\"\n Return a new DStream in which each RDD has a single element generated\n by counting the number of elements in a window over this DStream.\n windowDuration and slideDuration are as defined in the window() operation.\n\n This is equivalent to window(windowDuration, slideDuration).count(),\n but will be more efficient if window is large.\n \"\"\"\n return self.map(lambda x: 1).reduceByWindow(operator.add, operator.sub,\n windowDuration, slideDuration)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef countByValueAndWindow(self, windowDuration, slideDuration, numPartitions=None):\n keyed = self.map(lambda x: (x, 1))\n counted = keyed.reduceByKeyAndWindow(operator.add, operator.sub,\n windowDuration, slideDuration, numPartitions)\n return counted.filter(lambda kv: kv[1] > 0)", "response": "Return a new DStream with the count of distinct elements in the given sliding window over this DStream."} {"instruction": "Can you write a function in Python 3 where it\nreturns a new DStream by applying groupByKeyAndWindow over a sliding window.", "response": "def groupByKeyAndWindow(self, windowDuration, slideDuration, numPartitions=None):\n \"\"\"\n Return a new DStream by applying 
`groupByKey` over a sliding window.\n Similar to `DStream.groupByKey()`, but applies it over a sliding window.\n\n @param windowDuration: width of the window; must be a multiple of this DStream's\n batching interval\n @param slideDuration: sliding interval of the window (i.e., the interval after which\n the new DStream will generate RDDs); must be a multiple of this\n DStream's batching interval\n @param numPartitions: Number of partitions of each RDD in the new DStream.\n \"\"\"\n ls = self.mapValues(lambda x: [x])\n grouped = ls.reduceByKeyAndWindow(lambda a, b: a.extend(b) or a, lambda a, b: a[len(b):],\n windowDuration, slideDuration, numPartitions)\n return grouped.mapValues(ResultIterable)"} {"instruction": "Can you create a Python 3 function that\nreturns a new DStream by applying incremental reduceByKey over a sliding window.", "response": "def reduceByKeyAndWindow(self, func, invFunc, windowDuration, slideDuration=None,\n numPartitions=None, filterFunc=None):\n \"\"\"\n Return a new DStream by applying incremental `reduceByKey` over a sliding window.\n\n The reduced value of over a new window is calculated using the old window's reduce value :\n 1. reduce the new values that entered the window (e.g., adding new counts)\n 2. 
\"inverse reduce\" the old values that left the window (e.g., subtracting old counts)\n\n `invFunc` can be None, then it will reduce all the RDDs in window, could be slower\n than having `invFunc`.\n\n @param func: associative and commutative reduce function\n @param invFunc: inverse function of `reduceFunc`\n @param windowDuration: width of the window; must be a multiple of this DStream's\n batching interval\n @param slideDuration: sliding interval of the window (i.e., the interval after which\n the new DStream will generate RDDs); must be a multiple of this\n DStream's batching interval\n @param numPartitions: number of partitions of each RDD in the new DStream.\n @param filterFunc: function to filter expired key-value pairs;\n only pairs that satisfy the function are retained\n set this to null if you do not want to filter\n \"\"\"\n self._validate_window_param(windowDuration, slideDuration)\n if numPartitions is None:\n numPartitions = self._sc.defaultParallelism\n\n reduced = self.reduceByKey(func, numPartitions)\n\n if invFunc:\n def reduceFunc(t, a, b):\n b = b.reduceByKey(func, numPartitions)\n r = a.union(b).reduceByKey(func, numPartitions) if a else b\n if filterFunc:\n r = r.filter(filterFunc)\n return r\n\n def invReduceFunc(t, a, b):\n b = b.reduceByKey(func, numPartitions)\n joined = a.leftOuterJoin(b, numPartitions)\n return joined.mapValues(lambda kv: invFunc(kv[0], kv[1])\n if kv[1] is not None else kv[0])\n\n jreduceFunc = TransformFunction(self._sc, reduceFunc, reduced._jrdd_deserializer)\n jinvReduceFunc = TransformFunction(self._sc, invReduceFunc, reduced._jrdd_deserializer)\n if slideDuration is None:\n slideDuration = self._slideDuration\n dstream = self._sc._jvm.PythonReducedWindowedDStream(\n reduced._jdstream.dstream(),\n jreduceFunc, jinvReduceFunc,\n self._ssc._jduration(windowDuration),\n self._ssc._jduration(slideDuration))\n return DStream(dstream.asJavaDStream(), self._ssc, self._sc.serializer)\n else:\n return 
reduced.window(windowDuration, slideDuration).reduceByKey(func, numPartitions)"} {"instruction": "Write a Python 3 script for\nreturning a new DStream where the state for each key is updated by applying updateFunc on the previous state of the key and the new values of the key.", "response": "def updateStateByKey(self, updateFunc, numPartitions=None, initialRDD=None):\n \"\"\"\n Return a new \"state\" DStream where the state for each key is updated by applying\n the given function on the previous state of the key and the new values of the key.\n\n @param updateFunc: State update function. If this function returns None, then\n corresponding state key-value pair will be eliminated.\n \"\"\"\n if numPartitions is None:\n numPartitions = self._sc.defaultParallelism\n\n if initialRDD and not isinstance(initialRDD, RDD):\n initialRDD = self._sc.parallelize(initialRDD)\n\n def reduceFunc(t, a, b):\n if a is None:\n g = b.groupByKey(numPartitions).mapValues(lambda vs: (list(vs), None))\n else:\n g = a.cogroup(b.partitionBy(numPartitions), numPartitions)\n g = g.mapValues(lambda ab: (list(ab[1]), list(ab[0])[0] if len(ab[0]) else None))\n state = g.mapValues(lambda vs_s: updateFunc(vs_s[0], vs_s[1]))\n return state.filter(lambda k_v: k_v[1] is not None)\n\n jreduceFunc = TransformFunction(self._sc, reduceFunc,\n self._sc.serializer, self._jrdd_deserializer)\n if initialRDD:\n initialRDD = initialRDD._reserialize(self._jrdd_deserializer)\n dstream = self._sc._jvm.PythonStateDStream(self._jdstream.dstream(), jreduceFunc,\n initialRDD._jrdd)\n else:\n dstream = self._sc._jvm.PythonStateDStream(self._jdstream.dstream(), jreduceFunc)\n\n return DStream(dstream.asJavaDStream(), self._ssc, self._sc.serializer)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef setParams(self, minSupport=0.3, minConfidence=0.8, itemsCol=\"items\",\n predictionCol=\"prediction\", numPartitions=None):\n \"\"\"\n setParams(self, minSupport=0.3, 
minConfidence=0.8, itemsCol=\"items\", \\\n predictionCol=\"prediction\", numPartitions=None)\n \"\"\"\n kwargs = self._input_kwargs\n return self._set(**kwargs)", "response": "Sets the parameters of the current object to match the input parameters."} {"instruction": "Write a Python 3 function for\nsetting the parameters of the object to be used for the log record.", "response": "def setParams(self, minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000,\n sequenceCol=\"sequence\"):\n \"\"\"\n setParams(self, minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000, \\\n sequenceCol=\"sequence\")\n \"\"\"\n kwargs = self._input_kwargs\n return self._set(**kwargs)"} {"instruction": "Implement a function in Python 3 to\nfind the complete set of frequent sequential patterns in the input sequences of itemsets.", "response": "def findFrequentSequentialPatterns(self, dataset):\n \"\"\"\n .. note:: Experimental\n\n Finds the complete set of frequent sequential patterns in the input sequences of itemsets.\n\n :param dataset: A dataframe containing a sequence column which is\n `ArrayType(ArrayType(T))` type, T is the item type for the input dataset.\n :return: A `DataFrame` that contains columns of sequence and corresponding frequency.\n The schema of it will be:\n - `sequence: ArrayType(ArrayType(T))` (T is the item type)\n - `freq: Long`\n\n >>> from pyspark.ml.fpm import PrefixSpan\n >>> from pyspark.sql import Row\n >>> df = sc.parallelize([Row(sequence=[[1, 2], [3]]),\n ... Row(sequence=[[1], [3, 2], [1, 2]]),\n ... Row(sequence=[[1, 2], [5]]),\n ... Row(sequence=[[6]])]).toDF()\n >>> prefixSpan = PrefixSpan(minSupport=0.5, maxPatternLength=5)\n >>> prefixSpan.findFrequentSequentialPatterns(df).sort(\"sequence\").show(truncate=False)\n +----------+----+\n |sequence |freq|\n +----------+----+\n |[[1]] |3 |\n |[[1], [3]]|2 |\n |[[1, 2]] |3 |\n |[[2]] |3 |\n |[[3]] |2 |\n +----------+----+\n\n .. 
versionadded:: 2.4.0\n \"\"\"\n self._transfer_params_to_java()\n jdf = self._java_obj.findFrequentSequentialPatterns(dataset._jdf)\n return DataFrame(jdf, dataset.sql_ctx)"} {"instruction": "Write a Python 3 function that can\nreturn a CallSite representing the first Spark call in the current call stack.", "response": "def first_spark_call():\n \"\"\"\n Return a CallSite representing the first Spark call in the current call stack.\n \"\"\"\n tb = traceback.extract_stack()\n if len(tb) == 0:\n return None\n file, line, module, what = tb[len(tb) - 1]\n sparkpath = os.path.dirname(file)\n first_spark_frame = len(tb) - 1\n for i in range(0, len(tb)):\n file, line, fun, what = tb[i]\n if file.startswith(sparkpath):\n first_spark_frame = i\n break\n if first_spark_frame == 0:\n file, line, fun, what = tb[0]\n return CallSite(function=fun, file=file, linenum=line)\n sfile, sline, sfun, swhat = tb[first_spark_frame]\n ufile, uline, ufun, uwhat = tb[first_spark_frame - 1]\n return CallSite(function=sfun, file=ufile, linenum=uline)"} {"instruction": "Explain what the following Python 3 code does\ndef parsePoint(line):\n values = [float(s) for s in line.split(' ')]\n if values[0] == -1: # Convert -1 labels to 0 for MLlib\n values[0] = 0\n return LabeledPoint(values[0], values[1:])", "response": "Parse a line of text into an MLlib LabeledPoint object."} {"instruction": "How would you explain what the following Python 3 function does\ndef fMeasure(self, label, beta=None):\n if beta is None:\n return self.call(\"fMeasure\", label)\n else:\n return self.call(\"fMeasure\", label, beta)", "response": "Returns f - measure."} {"instruction": "Make a summary of the following Python 3 code\ndef precision(self, label=None):\n if label is None:\n return self.call(\"precision\")\n else:\n return self.call(\"precision\", float(label))", "response": "Returns precision for a given label"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef recall(self, 
label=None):\n if label is None:\n return self.call(\"recall\")\n else:\n return self.call(\"recall\", float(label))", "response": "Returns the recall of a given label."} {"instruction": "How would you explain what the following Python 3 function does\ndef f1Measure(self, label=None):\n if label is None:\n return self.call(\"f1Measure\")\n else:\n return self.call(\"f1Measure\", float(label))", "response": "Returns the f1Measure of the current user."} {"instruction": "Can you create a Python 3 function that\nreturns the content as an RDD of Row.", "response": "def rdd(self):\n \"\"\"Returns the content as an :class:`pyspark.RDD` of :class:`Row`.\n \"\"\"\n if self._lazy_rdd is None:\n jrdd = self._jdf.javaToPython()\n self._lazy_rdd = RDD(jrdd, self.sql_ctx._sc, BatchedSerializer(PickleSerializer()))\n return self._lazy_rdd"} {"instruction": "Can you write a function in Python 3 where it\nconverts a DataFrame into a : class : RDD of string.", "response": "def toJSON(self, use_unicode=True):\n \"\"\"Converts a :class:`DataFrame` into a :class:`RDD` of string.\n\n Each row is turned into a JSON document as one element in the returned RDD.\n\n >>> df.toJSON().first()\n u'{\"age\":2,\"name\":\"Alice\"}'\n \"\"\"\n rdd = self._jdf.toJSON()\n return RDD(rdd.toJavaRDD(), self._sc, UTF8Deserializer(use_unicode))"} {"instruction": "Given the following Python 3 function, write the documentation\ndef schema(self):\n if self._schema is None:\n try:\n self._schema = _parse_datatype_json_string(self._jdf.schema().json())\n except AttributeError as e:\n raise Exception(\n \"Unable to parse datatype from schema. 
%s\" % e)\n return self._schema", "response": "Returns the schema of this DataFrame as a StructType."} {"instruction": "Can you write a function in Python 3 where it\nprints the logical and physical plan for debugging purpose.", "response": "def explain(self, extended=False):\n \"\"\"Prints the (logical and physical) plans to the console for debugging purpose.\n\n :param extended: boolean, default ``False``. If ``False``, prints only the physical plan.\n\n >>> df.explain()\n == Physical Plan ==\n *(1) Scan ExistingRDD[age#0,name#1]\n\n >>> df.explain(True)\n == Parsed Logical Plan ==\n ...\n == Analyzed Logical Plan ==\n ...\n == Optimized Logical Plan ==\n ...\n == Physical Plan ==\n ...\n \"\"\"\n if extended:\n print(self._jdf.queryExecution().toString())\n else:\n print(self._jdf.queryExecution().simpleString())"} {"instruction": "Write a Python 3 script to\nreturn a new DataFrame containing rows in this DataFrame but not in another.", "response": "def exceptAll(self, other):\n \"\"\"Return a new :class:`DataFrame` containing rows in this :class:`DataFrame` but\n not in another :class:`DataFrame` while preserving duplicates.\n\n This is equivalent to `EXCEPT ALL` in SQL.\n\n >>> df1 = spark.createDataFrame(\n ... 
[(\"a\", 1), (\"a\", 1), (\"a\", 1), (\"a\", 2), (\"b\", 3), (\"c\", 4)], [\"C1\", \"C2\"])\n >>> df2 = spark.createDataFrame([(\"a\", 1), (\"b\", 3)], [\"C1\", \"C2\"])\n\n >>> df1.exceptAll(df2).show()\n +---+---+\n | C1| C2|\n +---+---+\n | a| 1|\n | a| 1|\n | a| 2|\n | c| 4|\n +---+---+\n\n Also as standard in SQL, this function resolves columns by position (not by name).\n \"\"\"\n return DataFrame(self._jdf.exceptAll(other._jdf), self.sql_ctx)"} {"instruction": "Create a Python 3 function to\nprint the first n rows to the console.", "response": "def show(self, n=20, truncate=True, vertical=False):\n \"\"\"Prints the first ``n`` rows to the console.\n\n :param n: Number of rows to show.\n :param truncate: If set to True, truncate strings longer than 20 chars by default.\n If set to a number greater than one, truncates long strings to length ``truncate``\n and align cells right.\n :param vertical: If set to True, print output rows vertically (one line\n per column value).\n\n >>> df\n DataFrame[age: int, name: string]\n >>> df.show()\n +---+-----+\n |age| name|\n +---+-----+\n | 2|Alice|\n | 5| Bob|\n +---+-----+\n >>> df.show(truncate=3)\n +---+----+\n |age|name|\n +---+----+\n | 2| Ali|\n | 5| Bob|\n +---+----+\n >>> df.show(vertical=True)\n -RECORD 0-----\n age | 2\n name | Alice\n -RECORD 1-----\n age | 5\n name | Bob\n \"\"\"\n if isinstance(truncate, bool) and truncate:\n print(self._jdf.showString(n, 20, vertical))\n else:\n print(self._jdf.showString(n, int(truncate), vertical))"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _repr_html_(self):\n import cgi\n if not self._support_repr_html:\n self._support_repr_html = True\n if self.sql_ctx._conf.isReplEagerEvalEnabled():\n max_num_rows = max(self.sql_ctx._conf.replEagerEvalMaxNumRows(), 0)\n sock_info = self._jdf.getRowsToPython(\n max_num_rows, self.sql_ctx._conf.replEagerEvalTruncate())\n rows = list(_load_from_socket(sock_info, 
BatchedSerializer(PickleSerializer())))\n head = rows[0]\n row_data = rows[1:]\n has_more_data = len(row_data) > max_num_rows\n row_data = row_data[:max_num_rows]\n\n html = \"\\n\"\n # generate table head\n html += \"\\n\" % \"\\n\" % \"
%s
\".join(map(lambda x: cgi.escape(x), head))\n # generate table rows\n for row in row_data:\n html += \"
%s
\".join(\n map(lambda x: cgi.escape(x), row))\n html += \"
\\n\"\n if has_more_data:\n html += \"only showing top %d %s\\n\" % (\n max_num_rows, \"row\" if max_num_rows == 1 else \"rows\")\n return html\n else:\n return None", "response": "Returns a dataframe with html code when you enabled eager evaluation by Spark SQL."} {"instruction": "Can you create a Python 3 function that\nreturns a new DataFrame with the current version of the current DataFrame and optionally eagerly checkpointed.", "response": "def checkpoint(self, eager=True):\n \"\"\"Returns a checkpointed version of this Dataset. Checkpointing can be used to truncate the\n logical plan of this DataFrame, which is especially useful in iterative algorithms where the\n plan may grow exponentially. It will be saved to files inside the checkpoint\n directory set with L{SparkContext.setCheckpointDir()}.\n\n :param eager: Whether to checkpoint this DataFrame immediately\n\n .. note:: Experimental\n \"\"\"\n jdf = self._jdf.checkpoint(eager)\n return DataFrame(jdf, self.sql_ctx)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef localCheckpoint(self, eager=True):\n jdf = self._jdf.localCheckpoint(eager)\n return DataFrame(jdf, self.sql_ctx)", "response": "Returns a locally checkpointed version of this Dataset."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef withWatermark(self, eventTime, delayThreshold):\n if not eventTime or type(eventTime) is not str:\n raise TypeError(\"eventTime should be provided as a string\")\n if not delayThreshold or type(delayThreshold) is not str:\n raise TypeError(\"delayThreshold should be provided as a string interval\")\n jdf = self._jdf.withWatermark(eventTime, delayThreshold)\n return DataFrame(jdf, self.sql_ctx)", "response": "Defines an event time watermark for this DataFrame."} {"instruction": "Write a Python 3 function for\nspecifying some hint on the current DataFrame.", "response": "def hint(self, name, *parameters):\n \"\"\"Specifies some hint on the current 
DataFrame.\n\n :param name: A name of the hint.\n :param parameters: Optional parameters.\n :return: :class:`DataFrame`\n\n >>> df.join(df2.hint(\"broadcast\"), \"name\").show()\n +----+---+------+\n |name|age|height|\n +----+---+------+\n | Bob| 5| 85|\n +----+---+------+\n \"\"\"\n if len(parameters) == 1 and isinstance(parameters[0], list):\n parameters = parameters[0]\n\n if not isinstance(name, str):\n raise TypeError(\"name should be provided as str, got {0}\".format(type(name)))\n\n allowed_types = (basestring, list, float, int)\n for p in parameters:\n if not isinstance(p, allowed_types):\n raise TypeError(\n \"all parameters should be in {0}, got {1} of type {2}\".format(\n allowed_types, p, type(p)))\n\n jdf = self._jdf.hint(name, self._jseq(parameters))\n return DataFrame(jdf, self.sql_ctx)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef collect(self):\n with SCCallSiteSync(self._sc) as css:\n sock_info = self._jdf.collectToPython()\n return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer())))", "response": "Returns all the records as a list of Row."} {"instruction": "Can you create a Python 3 function that\nreturns an iterator that returns all of the rows in this DataFrame.", "response": "def toLocalIterator(self):\n \"\"\"\n Returns an iterator that contains all of the rows in this :class:`DataFrame`.\n The iterator will consume as much memory as the largest partition in this DataFrame.\n\n >>> list(df.toLocalIterator())\n [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]\n \"\"\"\n with SCCallSiteSync(self._sc) as css:\n sock_info = self._jdf.toPythonIterator()\n return _load_from_socket(sock_info, BatchedSerializer(PickleSerializer()))"} {"instruction": "Write a Python 3 script to\nlimit the result count to the number specified.", "response": "def limit(self, num):\n \"\"\"Limits the result count to the number specified.\n\n >>> df.limit(1).collect()\n [Row(age=2, 
name=u'Alice')]\n >>> df.limit(0).collect()\n []\n \"\"\"\n jdf = self._jdf.limit(num)\n return DataFrame(jdf, self.sql_ctx)"} {"instruction": "How would you code a function in Python 3 to\nset the storage level to persist the contents of the DataFrame across the first time it is computed.", "response": "def persist(self, storageLevel=StorageLevel.MEMORY_AND_DISK):\n \"\"\"Sets the storage level to persist the contents of the :class:`DataFrame` across\n operations after the first time it is computed. This can only be used to assign\n a new storage level if the :class:`DataFrame` does not have a storage level set yet.\n If no storage level is specified defaults to (C{MEMORY_AND_DISK}).\n\n .. note:: The default storage level has changed to C{MEMORY_AND_DISK} to match Scala in 2.0.\n \"\"\"\n self.is_cached = True\n javaStorageLevel = self._sc._getJavaStorageLevel(storageLevel)\n self._jdf.persist(javaStorageLevel)\n return self"} {"instruction": "Can you write a function in Python 3 where it\ngets the current storage level of the DataFrame.", "response": "def storageLevel(self):\n \"\"\"Get the :class:`DataFrame`'s current storage level.\n\n >>> df.storageLevel\n StorageLevel(False, False, False, False, 1)\n >>> df.cache().storageLevel\n StorageLevel(True, True, False, True, 1)\n >>> df2.persist(StorageLevel.DISK_ONLY_2).storageLevel\n StorageLevel(True, False, False, False, 2)\n \"\"\"\n java_storage_level = self._jdf.storageLevel()\n storage_level = StorageLevel(java_storage_level.useDisk(),\n java_storage_level.useMemory(),\n java_storage_level.useOffHeap(),\n java_storage_level.deserialized(),\n java_storage_level.replication())\n return storage_level"} {"instruction": "Given the following Python 3 function, write the documentation\ndef unpersist(self, blocking=False):\n self.is_cached = False\n self._jdf.unpersist(blocking)\n return self", "response": "Marks the DataFrame as non - persistent and remove all blocks for it from memory and disk."} {"instruction": 
"Implement a function in Python 3 to\nreturn a new DataFrame that has exactly numPartitions partitions.", "response": "def coalesce(self, numPartitions):\n \"\"\"\n Returns a new :class:`DataFrame` that has exactly `numPartitions` partitions.\n\n :param numPartitions: int, to specify the target number of partitions\n\n Similar to coalesce defined on an :class:`RDD`, this operation results in a\n narrow dependency, e.g. if you go from 1000 partitions to 100 partitions,\n there will not be a shuffle, instead each of the 100 new partitions will\n claim 10 of the current partitions. If a larger number of partitions is requested,\n it will stay at the current number of partitions.\n\n However, if you're doing a drastic coalesce, e.g. to numPartitions = 1,\n this may result in your computation taking place on fewer nodes than\n you like (e.g. one node in the case of numPartitions = 1). To avoid this,\n you can call repartition(). This will add a shuffle step, but means the\n current upstream partitions will be executed in parallel (per whatever\n the current partitioning is).\n\n >>> df.coalesce(1).rdd.getNumPartitions()\n 1\n \"\"\"\n return DataFrame(self._jdf.coalesce(numPartitions), self.sql_ctx)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef repartition(self, numPartitions, *cols):\n if isinstance(numPartitions, int):\n if len(cols) == 0:\n return DataFrame(self._jdf.repartition(numPartitions), self.sql_ctx)\n else:\n return DataFrame(\n self._jdf.repartition(numPartitions, self._jcols(*cols)), self.sql_ctx)\n elif isinstance(numPartitions, (basestring, Column)):\n cols = (numPartitions, ) + cols\n return DataFrame(self._jdf.repartition(self._jcols(*cols)), self.sql_ctx)\n else:\n raise TypeError(\"numPartitions should be an int or Column\")", "response": "Returns a new DataFrame with the given number of partitions and the given columns."} {"instruction": "Make a summary of the following Python 3 code\ndef sample(self, 
withReplacement=None, fraction=None, seed=None):\n\n # For the cases below:\n # sample(True, 0.5 [, seed])\n # sample(True, fraction=0.5 [, seed])\n # sample(withReplacement=False, fraction=0.5 [, seed])\n is_withReplacement_set = \\\n type(withReplacement) == bool and isinstance(fraction, float)\n\n # For the case below:\n # sample(faction=0.5 [, seed])\n is_withReplacement_omitted_kwargs = \\\n withReplacement is None and isinstance(fraction, float)\n\n # For the case below:\n # sample(0.5 [, seed])\n is_withReplacement_omitted_args = isinstance(withReplacement, float)\n\n if not (is_withReplacement_set\n or is_withReplacement_omitted_kwargs\n or is_withReplacement_omitted_args):\n argtypes = [\n str(type(arg)) for arg in [withReplacement, fraction, seed] if arg is not None]\n raise TypeError(\n \"withReplacement (optional), fraction (required) and seed (optional)\"\n \" should be a bool, float and number; however, \"\n \"got [%s].\" % \", \".join(argtypes))\n\n if is_withReplacement_omitted_args:\n if fraction is not None:\n seed = fraction\n fraction = withReplacement\n withReplacement = None\n\n seed = long(seed) if seed is not None else None\n args = [arg for arg in [withReplacement, fraction, seed] if arg is not None]\n jdf = self._jdf.sample(*args)\n return DataFrame(jdf, self.sql_ctx)", "response": "Returns a sampled subset of this DataFrame."} {"instruction": "Create a Python 3 function to\nreturn a stratified sample without replacement based on the fractions given on each stratum.", "response": "def sampleBy(self, col, fractions, seed=None):\n \"\"\"\n Returns a stratified sample without replacement based on the\n fraction given on each stratum.\n\n :param col: column that defines strata\n :param fractions:\n sampling fraction for each stratum. 
If a stratum is not\n specified, we treat its fraction as zero.\n :param seed: random seed\n :return: a new DataFrame that represents the stratified sample\n\n >>> from pyspark.sql.functions import col\n >>> dataset = sqlContext.range(0, 100).select((col(\"id\") % 3).alias(\"key\"))\n >>> sampled = dataset.sampleBy(\"key\", fractions={0: 0.1, 1: 0.2}, seed=0)\n >>> sampled.groupBy(\"key\").count().orderBy(\"key\").show()\n +---+-----+\n |key|count|\n +---+-----+\n | 0| 3|\n | 1| 6|\n +---+-----+\n >>> dataset.sampleBy(col(\"key\"), fractions={2: 1.0}, seed=0).count()\n 33\n\n .. versionchanged:: 3.0\n Added sampling by a column of :class:`Column`\n \"\"\"\n if isinstance(col, basestring):\n col = Column(col)\n elif not isinstance(col, Column):\n raise ValueError(\"col must be a string or a column, but got %r\" % type(col))\n if not isinstance(fractions, dict):\n raise ValueError(\"fractions must be a dict but got %r\" % type(fractions))\n for k, v in fractions.items():\n if not isinstance(k, (float, int, long, basestring)):\n raise ValueError(\"key must be float, int, long, or string, but got %r\" % type(k))\n fractions[k] = float(v)\n col = col._jc\n seed = seed if seed is not None else random.randint(0, sys.maxsize)\n return DataFrame(self._jdf.stat().sampleBy(col, self._jmap(fractions), seed), self.sql_ctx)"} {"instruction": "Can you create a Python 3 function that\nreturns all column names and their data types as a list.", "response": "def dtypes(self):\n \"\"\"Returns all column names and their data types as a list.\n\n >>> df.dtypes\n [('age', 'int'), ('name', 'string')]\n \"\"\"\n return [(str(f.name), f.dataType.simpleString()) for f in self.schema.fields]"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef colRegex(self, colName):\n if not isinstance(colName, basestring):\n raise ValueError(\"colName should be provided as string\")\n jc = self._jdf.colRegex(colName)\n return Column(jc)", "response": "Selects a column based on 
the column name specified as a regex and returns it\n as a Column."} {"instruction": "Create a Python 3 function to\nreturn a new DataFrame with an alias set.", "response": "def alias(self, alias):\n \"\"\"Returns a new :class:`DataFrame` with an alias set.\n\n :param alias: string, an alias name to be set for the DataFrame.\n\n >>> from pyspark.sql.functions import *\n >>> df_as1 = df.alias(\"df_as1\")\n >>> df_as2 = df.alias(\"df_as2\")\n >>> joined_df = df_as1.join(df_as2, col(\"df_as1.name\") == col(\"df_as2.name\"), 'inner')\n >>> joined_df.select(\"df_as1.name\", \"df_as2.name\", \"df_as2.age\").collect()\n [Row(name=u'Bob', name=u'Bob', age=5), Row(name=u'Alice', name=u'Alice', age=2)]\n \"\"\"\n assert isinstance(alias, basestring), \"alias should be a string\"\n return DataFrame(getattr(self._jdf, \"as\")(alias), self.sql_ctx)"} {"instruction": "Explain what the following Python 3 code does\ndef crossJoin(self, other):\n\n jdf = self._jdf.crossJoin(other._jdf)\n return DataFrame(jdf, self.sql_ctx)", "response": "Returns the cartesian product with another."} {"instruction": "Write a Python 3 function that can\njoin with another DataFrame.", "response": "def join(self, other, on=None, how=None):\n \"\"\"Joins with another :class:`DataFrame`, using the given join expression.\n\n :param other: Right side of the join\n :param on: a string for the join column name, a list of column names,\n a join expression (Column), or a list of Columns.\n If `on` is a string or a list of strings indicating the name of the join column(s),\n the column(s) must exist on both sides, and this performs an equi-join.\n :param how: str, default ``inner``. 
Must be one of: ``inner``, ``cross``, ``outer``,\n ``full``, ``fullouter``, ``full_outer``, ``left``, ``leftouter``, ``left_outer``,\n ``right``, ``rightouter``, ``right_outer``, ``semi``, ``leftsemi``, ``left_semi``,\n ``anti``, ``leftanti`` and ``left_anti``.\n\n The following performs a full outer join between ``df1`` and ``df2``.\n\n >>> df.join(df2, df.name == df2.name, 'outer').select(df.name, df2.height).collect()\n [Row(name=None, height=80), Row(name=u'Bob', height=85), Row(name=u'Alice', height=None)]\n\n >>> df.join(df2, 'name', 'outer').select('name', 'height').collect()\n [Row(name=u'Tom', height=80), Row(name=u'Bob', height=85), Row(name=u'Alice', height=None)]\n\n >>> cond = [df.name == df3.name, df.age == df3.age]\n >>> df.join(df3, cond, 'outer').select(df.name, df3.age).collect()\n [Row(name=u'Alice', age=2), Row(name=u'Bob', age=5)]\n\n >>> df.join(df2, 'name').select(df.name, df2.height).collect()\n [Row(name=u'Bob', height=85)]\n\n >>> df.join(df4, ['name', 'age']).select(df.name, df.age).collect()\n [Row(name=u'Bob', age=5)]\n \"\"\"\n\n if on is not None and not isinstance(on, list):\n on = [on]\n\n if on is not None:\n if isinstance(on[0], basestring):\n on = self._jseq(on)\n else:\n assert isinstance(on[0], Column), \"on should be Column or list of Column\"\n on = reduce(lambda x, y: x.__and__(y), on)\n on = on._jc\n\n if on is None and how is None:\n jdf = self._jdf.join(other._jdf)\n else:\n if how is None:\n how = \"inner\"\n if on is None:\n on = self._jseq([])\n assert isinstance(how, basestring), \"how should be basestring\"\n jdf = self._jdf.join(other._jdf, on, how)\n return DataFrame(jdf, self.sql_ctx)"} {"instruction": "Can you create a Python 3 function that\nreturns a new DataFrame with each partition sorted by the specified column names.", "response": "def sortWithinPartitions(self, *cols, **kwargs):\n \"\"\"Returns a new :class:`DataFrame` with each partition sorted by the specified column(s).\n\n :param cols: list of 
:class:`Column` or column names to sort by.\n :param ascending: boolean or list of boolean (default True).\n Sort ascending vs. descending. Specify list for multiple sort orders.\n If a list is specified, length of the list must equal length of the `cols`.\n\n >>> df.sortWithinPartitions(\"age\", ascending=False).show()\n +---+-----+\n |age| name|\n +---+-----+\n | 2|Alice|\n | 5| Bob|\n +---+-----+\n \"\"\"\n jdf = self._jdf.sortWithinPartitions(self._sort_cols(cols, kwargs))\n return DataFrame(jdf, self.sql_ctx)"} {"instruction": "How would you implement a function in Python 3 that\nreturns a JVM Seq of Columns from a list of Column names", "response": "def _jseq(self, cols, converter=None):\n \"\"\"Return a JVM Seq of Columns from a list of Column or names\"\"\"\n return _to_seq(self.sql_ctx._sc, cols, converter)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _jcols(self, *cols):\n if len(cols) == 1 and isinstance(cols[0], list):\n cols = cols[0]\n return self._jseq(cols, _to_java_column)", "response": "Return a JVM Seq of Columns from a list of Column or column names\n "} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _sort_cols(self, cols, kwargs):\n if not cols:\n raise ValueError(\"should sort by at least one column\")\n if len(cols) == 1 and isinstance(cols[0], list):\n cols = cols[0]\n jcols = [_to_java_column(c) for c in cols]\n ascending = kwargs.get('ascending', True)\n if isinstance(ascending, (bool, int)):\n if not ascending:\n jcols = [jc.desc() for jc in jcols]\n elif isinstance(ascending, list):\n jcols = [jc if asc else jc.desc()\n for asc, jc in zip(ascending, jcols)]\n else:\n raise TypeError(\"ascending can only be boolean or list, but got %s\" % type(ascending))\n return self._jseq(jcols)", "response": "Returns a JVM Seq of Columns that describes the sort order"} {"instruction": "Write a Python 3 script to\nreturn a DataFrame with basic statistics for the given 
columns.", "response": "def describe(self, *cols):\n \"\"\"Computes basic statistics for numeric and string columns.\n\n This include count, mean, stddev, min, and max. If no columns are\n given, this function computes statistics for all numerical or string columns.\n\n .. note:: This function is meant for exploratory data analysis, as we make no\n guarantee about the backward compatibility of the schema of the resulting DataFrame.\n\n >>> df.describe(['age']).show()\n +-------+------------------+\n |summary| age|\n +-------+------------------+\n | count| 2|\n | mean| 3.5|\n | stddev|2.1213203435596424|\n | min| 2|\n | max| 5|\n +-------+------------------+\n >>> df.describe().show()\n +-------+------------------+-----+\n |summary| age| name|\n +-------+------------------+-----+\n | count| 2| 2|\n | mean| 3.5| null|\n | stddev|2.1213203435596424| null|\n | min| 2|Alice|\n | max| 5| Bob|\n +-------+------------------+-----+\n\n Use summary for expanded statistics and control over which statistics to compute.\n \"\"\"\n if len(cols) == 1 and isinstance(cols[0], list):\n cols = cols[0]\n jdf = self._jdf.describe(self._jseq(cols))\n return DataFrame(jdf, self.sql_ctx)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef summary(self, *statistics):\n if len(statistics) == 1 and isinstance(statistics[0], list):\n statistics = statistics[0]\n jdf = self._jdf.summary(self._jseq(statistics))\n return DataFrame(jdf, self.sql_ctx)", "response": "Shows the summary of the specified statistics for the specified numeric and string columns."} {"instruction": "Explain what the following Python 3 code does\ndef head(self, n=None):\n if n is None:\n rs = self.head(1)\n return rs[0] if rs else None\n return self.take(n)", "response": "Returns the first n rows."} {"instruction": "Write a Python 3 script for\nprojecting a set of expressions and returns a new DataFrame.", "response": "def select(self, *cols):\n \"\"\"Projects a set of expressions and 
returns a new :class:`DataFrame`.\n\n :param cols: list of column names (string) or expressions (:class:`Column`).\n If one of the column names is '*', that column is expanded to include all columns\n in the current DataFrame.\n\n >>> df.select('*').collect()\n [Row(age=2, name=u'Alice'), Row(age=5, name=u'Bob')]\n >>> df.select('name', 'age').collect()\n [Row(name=u'Alice', age=2), Row(name=u'Bob', age=5)]\n >>> df.select(df.name, (df.age + 10).alias('age')).collect()\n [Row(name=u'Alice', age=12), Row(name=u'Bob', age=15)]\n \"\"\"\n jdf = self._jdf.select(self._jcols(*cols))\n return DataFrame(jdf, self.sql_ctx)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef selectExpr(self, *expr):\n if len(expr) == 1 and isinstance(expr[0], list):\n expr = expr[0]\n jdf = self._jdf.selectExpr(self._jseq(expr))\n return DataFrame(jdf, self.sql_ctx)", "response": "Projects a set of SQL expressions and returns a new DataFrame."} {"instruction": "Explain what the following Python 3 code does\ndef filter(self, condition):\n if isinstance(condition, basestring):\n jdf = self._jdf.filter(condition)\n elif isinstance(condition, Column):\n jdf = self._jdf.filter(condition._jc)\n else:\n raise TypeError(\"condition should be string or Column\")\n return DataFrame(jdf, self.sql_ctx)", "response": "Returns a new DataFrame with only the rows that satisfy the given condition."} {"instruction": "Implement a function in Python 3 to\ngroup the DataFrame using the specified columns and returns a new DataFrame with the grouped rows grouped by the specified columns.", "response": "def groupBy(self, *cols):\n \"\"\"Groups the :class:`DataFrame` using the specified columns,\n so we can run aggregation on them. 
See :class:`GroupedData`\n for all the available aggregate functions.\n\n :func:`groupby` is an alias for :func:`groupBy`.\n\n :param cols: list of columns to group by.\n Each element should be a column name (string) or an expression (:class:`Column`).\n\n >>> df.groupBy().avg().collect()\n [Row(avg(age)=3.5)]\n >>> sorted(df.groupBy('name').agg({'age': 'mean'}).collect())\n [Row(name=u'Alice', avg(age)=2.0), Row(name=u'Bob', avg(age)=5.0)]\n >>> sorted(df.groupBy(df.name).avg().collect())\n [Row(name=u'Alice', avg(age)=2.0), Row(name=u'Bob', avg(age)=5.0)]\n >>> sorted(df.groupBy(['name', df.age]).count().collect())\n [Row(name=u'Alice', age=2, count=1), Row(name=u'Bob', age=5, count=1)]\n \"\"\"\n jgd = self._jdf.groupBy(self._jcols(*cols))\n from pyspark.sql.group import GroupedData\n return GroupedData(jgd, self)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef union(self, other):\n return DataFrame(self._jdf.union(other._jdf), self.sql_ctx)", "response": "Returns a new DataFrame containing the union of rows in this and another DataFrame."} {"instruction": "How would you code a function in Python 3 to\nreturn a new DataFrame containing union of rows in this and another DataFrame.", "response": "def unionByName(self, other):\n \"\"\" Returns a new :class:`DataFrame` containing union of rows in this and another frame.\n\n This is different from both `UNION ALL` and `UNION DISTINCT` in SQL. 
To do a SQL-style set\n union (that does deduplication of elements), use this function followed by :func:`distinct`.\n\n The difference between this function and :func:`union` is that this function\n resolves columns by name (not by position):\n\n >>> df1 = spark.createDataFrame([[1, 2, 3]], [\"col0\", \"col1\", \"col2\"])\n >>> df2 = spark.createDataFrame([[4, 5, 6]], [\"col1\", \"col2\", \"col0\"])\n >>> df1.unionByName(df2).show()\n +----+----+----+\n |col0|col1|col2|\n +----+----+----+\n | 1| 2| 3|\n | 6| 4| 5|\n +----+----+----+\n \"\"\"\n return DataFrame(self._jdf.unionByName(other._jdf), self.sql_ctx)"} {"instruction": "Explain what the following Python 3 code does\ndef intersect(self, other):\n return DataFrame(self._jdf.intersect(other._jdf), self.sql_ctx)", "response": "Returns a new DataFrame containing rows only in\n both this and other."} {"instruction": "Explain what the following Python 3 code does\ndef intersectAll(self, other):\n return DataFrame(self._jdf.intersectAll(other._jdf), self.sql_ctx)", "response": "Return a new DataFrame containing rows in both this dataframe and other dataframe while preserving duplicates."} {"instruction": "Can you write a function in Python 3 where it\nreturns a new DataFrame containing rows in this frame but not in another frame.", "response": "def subtract(self, other):\n \"\"\" Return a new :class:`DataFrame` containing rows in this frame\n but not in another frame.\n\n This is equivalent to `EXCEPT DISTINCT` in SQL.\n\n \"\"\"\n return DataFrame(getattr(self._jdf, \"except\")(other._jdf), self.sql_ctx)"} {"instruction": "Explain what the following Python 3 code does\ndef dropDuplicates(self, subset=None):\n if subset is None:\n jdf = self._jdf.dropDuplicates()\n else:\n jdf = self._jdf.dropDuplicates(self._jseq(subset))\n return DataFrame(jdf, self.sql_ctx)", "response": "Returns a new DataFrame with duplicate rows removed optionally considering certain columns."} {"instruction": "Given the following Python 3 
function, write the documentation\ndef dropna(self, how='any', thresh=None, subset=None):\n if how is not None and how not in ['any', 'all']:\n raise ValueError(\"how ('\" + how + \"') should be 'any' or 'all'\")\n\n if subset is None:\n subset = self.columns\n elif isinstance(subset, basestring):\n subset = [subset]\n elif not isinstance(subset, (list, tuple)):\n raise ValueError(\"subset should be a list or tuple of column names\")\n\n if thresh is None:\n thresh = len(subset) if how == 'any' else 1\n\n return DataFrame(self._jdf.na().drop(thresh, self._jseq(subset)), self.sql_ctx)", "response": "Returns a new DataFrame with omitting rows with null values."} {"instruction": "How would you code a function in Python 3 to\nfill missing values with the given value.", "response": "def fillna(self, value, subset=None):\n \"\"\"Replace null values, alias for ``na.fill()``.\n :func:`DataFrame.fillna` and :func:`DataFrameNaFunctions.fill` are aliases of each other.\n\n :param value: int, long, float, string, bool or dict.\n Value to replace null values with.\n If the value is a dict, then `subset` is ignored and `value` must be a mapping\n from column name (string) to replacement value. 
The replacement value must be\n an int, long, float, boolean, or string.\n :param subset: optional list of column names to consider.\n Columns specified in subset that do not have matching data type are ignored.\n For example, if `value` is a string, and subset contains a non-string column,\n then the non-string column is simply ignored.\n\n >>> df4.na.fill(50).show()\n +---+------+-----+\n |age|height| name|\n +---+------+-----+\n | 10| 80|Alice|\n | 5| 50| Bob|\n | 50| 50| Tom|\n | 50| 50| null|\n +---+------+-----+\n\n >>> df5.na.fill(False).show()\n +----+-------+-----+\n | age| name| spy|\n +----+-------+-----+\n | 10| Alice|false|\n | 5| Bob|false|\n |null|Mallory| true|\n +----+-------+-----+\n\n >>> df4.na.fill({'age': 50, 'name': 'unknown'}).show()\n +---+------+-------+\n |age|height| name|\n +---+------+-------+\n | 10| 80| Alice|\n | 5| null| Bob|\n | 50| null| Tom|\n | 50| null|unknown|\n +---+------+-------+\n \"\"\"\n if not isinstance(value, (float, int, long, basestring, bool, dict)):\n raise ValueError(\"value should be a float, int, long, string, bool or dict\")\n\n # Note that bool validates isinstance(int), but we don't want to\n # convert bools to floats\n\n if not isinstance(value, bool) and isinstance(value, (int, long)):\n value = float(value)\n\n if isinstance(value, dict):\n return DataFrame(self._jdf.na().fill(value), self.sql_ctx)\n elif subset is None:\n return DataFrame(self._jdf.na().fill(value), self.sql_ctx)\n else:\n if isinstance(subset, basestring):\n subset = [subset]\n elif not isinstance(subset, (list, tuple)):\n raise ValueError(\"subset should be a list or tuple of column names\")\n\n return DataFrame(self._jdf.na().fill(value, self._jseq(subset)), self.sql_ctx)"} {"instruction": "Write a Python 3 function that can\nreturn a new DataFrame with the values replaced with another value.", "response": "def replace(self, to_replace, value=_NoValue, subset=None):\n \"\"\"Returns a new :class:`DataFrame` replacing a value with 
another value.\n :func:`DataFrame.replace` and :func:`DataFrameNaFunctions.replace` are\n aliases of each other.\n Values to_replace and value must have the same type and can only be numerics, booleans,\n or strings. Value can have None. When replacing, the new value will be cast\n to the type of the existing column.\n For numeric replacements all values to be replaced should have unique\n floating point representation. In case of conflicts (for example with `{42: -1, 42.0: 1}`)\n and arbitrary replacement will be used.\n\n :param to_replace: bool, int, long, float, string, list or dict.\n Value to be replaced.\n If the value is a dict, then `value` is ignored or can be omitted, and `to_replace`\n must be a mapping between a value and a replacement.\n :param value: bool, int, long, float, string, list or None.\n The replacement value must be a bool, int, long, float, string or None. If `value` is a\n list, `value` should be of the same length and type as `to_replace`.\n If `value` is a scalar and `to_replace` is a sequence, then `value` is\n used as a replacement for each item in `to_replace`.\n :param subset: optional list of column names to consider.\n Columns specified in subset that do not have matching data type are ignored.\n For example, if `value` is a string, and subset contains a non-string column,\n then the non-string column is simply ignored.\n\n >>> df4.na.replace(10, 20).show()\n +----+------+-----+\n | age|height| name|\n +----+------+-----+\n | 20| 80|Alice|\n | 5| null| Bob|\n |null| null| Tom|\n |null| null| null|\n +----+------+-----+\n\n >>> df4.na.replace('Alice', None).show()\n +----+------+----+\n | age|height|name|\n +----+------+----+\n | 10| 80|null|\n | 5| null| Bob|\n |null| null| Tom|\n |null| null|null|\n +----+------+----+\n\n >>> df4.na.replace({'Alice': None}).show()\n +----+------+----+\n | age|height|name|\n +----+------+----+\n | 10| 80|null|\n | 5| null| Bob|\n |null| null| Tom|\n |null| null|null|\n +----+------+----+\n\n >>> 
df4.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show()\n +----+------+----+\n | age|height|name|\n +----+------+----+\n | 10| 80| A|\n | 5| null| B|\n |null| null| Tom|\n |null| null|null|\n +----+------+----+\n \"\"\"\n if value is _NoValue:\n if isinstance(to_replace, dict):\n value = None\n else:\n raise TypeError(\"value argument is required when to_replace is not a dictionary.\")\n\n # Helper functions\n def all_of(types):\n \"\"\"Given a type or tuple of types and a sequence of xs\n check if each x is instance of type(s)\n\n >>> all_of(bool)([True, False])\n True\n >>> all_of(basestring)([\"a\", 1])\n False\n \"\"\"\n def all_of_(xs):\n return all(isinstance(x, types) for x in xs)\n return all_of_\n\n all_of_bool = all_of(bool)\n all_of_str = all_of(basestring)\n all_of_numeric = all_of((float, int, long))\n\n # Validate input types\n valid_types = (bool, float, int, long, basestring, list, tuple)\n if not isinstance(to_replace, valid_types + (dict, )):\n raise ValueError(\n \"to_replace should be a bool, float, int, long, string, list, tuple, or dict. \"\n \"Got {0}\".format(type(to_replace)))\n\n if not isinstance(value, valid_types) and value is not None \\\n and not isinstance(to_replace, dict):\n raise ValueError(\"If to_replace is not a dict, value should be \"\n \"a bool, float, int, long, string, list, tuple or None. \"\n \"Got {0}\".format(type(value)))\n\n if isinstance(to_replace, (list, tuple)) and isinstance(value, (list, tuple)):\n if len(to_replace) != len(value):\n raise ValueError(\"to_replace and value lists should be of the same length. \"\n \"Got {0} and {1}\".format(len(to_replace), len(value)))\n\n if not (subset is None or isinstance(subset, (list, tuple, basestring))):\n raise ValueError(\"subset should be a list or tuple of column names, \"\n \"column name or None. 
Got {0}\".format(type(subset)))\n\n # Reshape input arguments if necessary\n if isinstance(to_replace, (float, int, long, basestring)):\n to_replace = [to_replace]\n\n if isinstance(to_replace, dict):\n rep_dict = to_replace\n if value is not None:\n warnings.warn(\"to_replace is a dict and value is not None. value will be ignored.\")\n else:\n if isinstance(value, (float, int, long, basestring)) or value is None:\n value = [value for _ in range(len(to_replace))]\n rep_dict = dict(zip(to_replace, value))\n\n if isinstance(subset, basestring):\n subset = [subset]\n\n # Verify we were not passed in mixed type generics.\n if not any(all_of_type(rep_dict.keys())\n and all_of_type(x for x in rep_dict.values() if x is not None)\n for all_of_type in [all_of_bool, all_of_str, all_of_numeric]):\n raise ValueError(\"Mixed type replacements are not supported\")\n\n if subset is None:\n return DataFrame(self._jdf.na().replace('*', rep_dict), self.sql_ctx)\n else:\n return DataFrame(\n self._jdf.na().replace(self._jseq(subset), self._jmap(rep_dict)), self.sql_ctx)"} {"instruction": "Make a summary of the following Python 3 code\ndef approxQuantile(self, col, probabilities, relativeError):\n\n if not isinstance(col, (basestring, list, tuple)):\n raise ValueError(\"col should be a string, list or tuple, but got %r\" % type(col))\n\n isStr = isinstance(col, basestring)\n\n if isinstance(col, tuple):\n col = list(col)\n elif isStr:\n col = [col]\n\n for c in col:\n if not isinstance(c, basestring):\n raise ValueError(\"columns should be strings, but got %r\" % type(c))\n col = _to_list(self._sc, col)\n\n if not isinstance(probabilities, (list, tuple)):\n raise ValueError(\"probabilities should be a list or tuple\")\n if isinstance(probabilities, tuple):\n probabilities = list(probabilities)\n for p in probabilities:\n if not isinstance(p, (float, int, long)) or p < 0 or p > 1:\n raise ValueError(\"probabilities should be numerical (float, int, long) in [0,1].\")\n probabilities = 
_to_list(self._sc, probabilities)\n\n if not isinstance(relativeError, (float, int, long)) or relativeError < 0:\n raise ValueError(\"relativeError should be numerical (float, int, long) >= 0.\")\n relativeError = float(relativeError)\n\n jaq = self._jdf.stat().approxQuantile(col, probabilities, relativeError)\n jaq_list = [list(j) for j in jaq]\n return jaq_list[0] if isStr else jaq_list", "response": "This method calculates the approximate quantiles of the given columns."} {"instruction": "Write a Python 3 function that can\ncalculate the correlation coefficient of two columns of a DataFrame.", "response": "def corr(self, col1, col2, method=None):\n \"\"\"\n Calculates the correlation of two columns of a DataFrame as a double value.\n Currently only supports the Pearson Correlation Coefficient.\n :func:`DataFrame.corr` and :func:`DataFrameStatFunctions.corr` are aliases of each other.\n\n :param col1: The name of the first column\n :param col2: The name of the second column\n :param method: The correlation method. Currently only supports \"pearson\"\n \"\"\"\n if not isinstance(col1, basestring):\n raise ValueError(\"col1 should be a string.\")\n if not isinstance(col2, basestring):\n raise ValueError(\"col2 should be a string.\")\n if not method:\n method = \"pearson\"\n if not method == \"pearson\":\n raise ValueError(\"Currently only the calculation of the Pearson Correlation \" +\n \"coefficient is supported.\")\n return self._jdf.stat().corr(col1, col2, method)"} {"instruction": "Implement a function in Python 3 to\ncalculate the sample covariance for the given columns specified by their names as a double value.", "response": "def cov(self, col1, col2):\n \"\"\"\n Calculate the sample covariance for the given columns, specified by their names, as a\n double value. 
:func:`DataFrame.cov` and :func:`DataFrameStatFunctions.cov` are aliases.\n\n :param col1: The name of the first column\n :param col2: The name of the second column\n \"\"\"\n if not isinstance(col1, basestring):\n raise ValueError(\"col1 should be a string.\")\n if not isinstance(col2, basestring):\n raise ValueError(\"col2 should be a string.\")\n return self._jdf.stat().cov(col1, col2)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef crosstab(self, col1, col2):\n if not isinstance(col1, basestring):\n raise ValueError(\"col1 should be a string.\")\n if not isinstance(col2, basestring):\n raise ValueError(\"col2 should be a string.\")\n return DataFrame(self._jdf.stat().crosstab(col1, col2), self.sql_ctx)", "response": "Computes a pair - wise frequency table of the given columns."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef freqItems(self, cols, support=None):\n if isinstance(cols, tuple):\n cols = list(cols)\n if not isinstance(cols, list):\n raise ValueError(\"cols must be a list or tuple of column names as strings.\")\n if not support:\n support = 0.01\n return DataFrame(self._jdf.stat().freqItems(_to_seq(self._sc, cols), support), self.sql_ctx)", "response": "Returns a new DataFrame with the items that have frequent elements for the given columns."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef withColumn(self, colName, col):\n assert isinstance(col, Column), \"col should be Column\"\n return DataFrame(self._jdf.withColumn(colName, col._jc), self.sql_ctx)", "response": "Returns a new DataFrame by adding a column or replacing the existing column with the same name."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef withColumnRenamed(self, existing, new):\n return DataFrame(self._jdf.withColumnRenamed(existing, new), self.sql_ctx)", "response": "Returns a new DataFrame by renaming an existing 
column."} {"instruction": "Write a Python 3 function that can\nreturn a new DataFrame that contains the specified column.", "response": "def drop(self, *cols):\n \"\"\"Returns a new :class:`DataFrame` that drops the specified column.\n This is a no-op if schema doesn't contain the given column name(s).\n\n :param cols: a string name of the column to drop, or a\n :class:`Column` to drop, or a list of string name of the columns to drop.\n\n >>> df.drop('age').collect()\n [Row(name=u'Alice'), Row(name=u'Bob')]\n\n >>> df.drop(df.age).collect()\n [Row(name=u'Alice'), Row(name=u'Bob')]\n\n >>> df.join(df2, df.name == df2.name, 'inner').drop(df.name).collect()\n [Row(age=5, height=85, name=u'Bob')]\n\n >>> df.join(df2, df.name == df2.name, 'inner').drop(df2.name).collect()\n [Row(age=5, name=u'Bob', height=85)]\n\n >>> df.join(df2, 'name', 'inner').drop('age', 'height').collect()\n [Row(name=u'Bob')]\n \"\"\"\n if len(cols) == 1:\n col = cols[0]\n if isinstance(col, basestring):\n jdf = self._jdf.drop(col)\n elif isinstance(col, Column):\n jdf = self._jdf.drop(col._jc)\n else:\n raise TypeError(\"col should be a string or a Column\")\n else:\n for col in cols:\n if not isinstance(col, basestring):\n raise TypeError(\"each col in the param list should be a string\")\n jdf = self._jdf.drop(self._jseq(cols))\n\n return DataFrame(jdf, self.sql_ctx)"} {"instruction": "Can you write a function in Python 3 where it\nreturns a new class : DataFrame that with new specified column names", "response": "def toDF(self, *cols):\n \"\"\"Returns a new class:`DataFrame` that with new specified column names\n\n :param cols: list of new column names (string)\n\n >>> df.toDF('f1', 'f2').collect()\n [Row(f1=2, f2=u'Alice'), Row(f1=5, f2=u'Bob')]\n \"\"\"\n jdf = self._jdf.toDF(self._jseq(cols))\n return DataFrame(jdf, self.sql_ctx)"} {"instruction": "Can you implement a function in Python 3 that\nreturns a new class : DataFrame. 
Concise syntax for chaining custom transformations.", "response": "def transform(self, func):\n \"\"\"Returns a new class:`DataFrame`. Concise syntax for chaining custom transformations.\n\n :param func: a function that takes and returns a class:`DataFrame`.\n\n >>> from pyspark.sql.functions import col\n >>> df = spark.createDataFrame([(1, 1.0), (2, 2.0)], [\"int\", \"float\"])\n >>> def cast_all_to_int(input_df):\n ... return input_df.select([col(col_name).cast(\"int\") for col_name in input_df.columns])\n >>> def sort_columns_asc(input_df):\n ... return input_df.select(*sorted(input_df.columns))\n >>> df.transform(cast_all_to_int).transform(sort_columns_asc).show()\n +-----+---+\n |float|int|\n +-----+---+\n | 1| 1|\n | 2| 2|\n +-----+---+\n \"\"\"\n result = func(self)\n assert isinstance(result, DataFrame), \"Func returned an instance of type [%s], \" \\\n \"should have been DataFrame.\" % type(result)\n return result"} {"instruction": "Explain what the following Python 3 code does\ndef toPandas(self):\n from pyspark.sql.utils import require_minimum_pandas_version\n require_minimum_pandas_version()\n\n import pandas as pd\n\n if self.sql_ctx._conf.pandasRespectSessionTimeZone():\n timezone = self.sql_ctx._conf.sessionLocalTimeZone()\n else:\n timezone = None\n\n if self.sql_ctx._conf.arrowEnabled():\n use_arrow = True\n try:\n from pyspark.sql.types import to_arrow_schema\n from pyspark.sql.utils import require_minimum_pyarrow_version\n\n require_minimum_pyarrow_version()\n to_arrow_schema(self.schema)\n except Exception as e:\n\n if self.sql_ctx._conf.arrowFallbackEnabled():\n msg = (\n \"toPandas attempted Arrow optimization because \"\n \"'spark.sql.execution.arrow.enabled' is set to true; however, \"\n \"failed by the reason below:\\n %s\\n\"\n \"Attempting non-optimization as \"\n \"'spark.sql.execution.arrow.fallback.enabled' is set to \"\n \"true.\" % _exception_message(e))\n warnings.warn(msg)\n use_arrow = False\n else:\n msg = (\n \"toPandas 
attempted Arrow optimization because \"\n \"'spark.sql.execution.arrow.enabled' is set to true, but has reached \"\n \"the error below and will not continue because automatic fallback \"\n \"with 'spark.sql.execution.arrow.fallback.enabled' has been set to \"\n \"false.\\n %s\" % _exception_message(e))\n warnings.warn(msg)\n raise\n\n # Try to use Arrow optimization when the schema is supported and the required version\n # of PyArrow is found, if 'spark.sql.execution.arrow.enabled' is enabled.\n if use_arrow:\n try:\n from pyspark.sql.types import _check_dataframe_localize_timestamps\n import pyarrow\n batches = self._collectAsArrow()\n if len(batches) > 0:\n table = pyarrow.Table.from_batches(batches)\n # Pandas DataFrame created from PyArrow uses datetime64[ns] for date type\n # values, but we should use datetime.date to match the behavior with when\n # Arrow optimization is disabled.\n pdf = table.to_pandas(date_as_object=True)\n return _check_dataframe_localize_timestamps(pdf, timezone)\n else:\n return pd.DataFrame.from_records([], columns=self.columns)\n except Exception as e:\n # We might have to allow fallback here as well but multiple Spark jobs can\n # be executed. So, simply fail in this case for now.\n msg = (\n \"toPandas attempted Arrow optimization because \"\n \"'spark.sql.execution.arrow.enabled' is set to true, but has reached \"\n \"the error below and can not continue. Note that \"\n \"'spark.sql.execution.arrow.fallback.enabled' does not have an effect \"\n \"on failures in the middle of computation.\\n %s\" % _exception_message(e))\n warnings.warn(msg)\n raise\n\n # Below is toPandas without Arrow optimization.\n pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)\n\n dtype = {}\n for field in self.schema:\n pandas_type = _to_corrected_pandas_type(field.dataType)\n # SPARK-21766: if an integer field is nullable and has null values, it can be\n # inferred by pandas as float column. 
Once we convert the column with NaN back\n # to integer type e.g., np.int16, we will hit exception. So we use the inferred\n # float type, not the corrected type from the schema in this case.\n if pandas_type is not None and \\\n not(isinstance(field.dataType, IntegralType) and field.nullable and\n pdf[field.name].isnull().any()):\n dtype[field.name] = pandas_type\n\n for f, t in dtype.items():\n pdf[f] = pdf[f].astype(t, copy=False)\n\n if timezone is None:\n return pdf\n else:\n from pyspark.sql.types import _check_series_convert_timestamps_local_tz\n for field in self.schema:\n # TODO: handle nested timestamps, such as ArrayType(TimestampType())?\n if isinstance(field.dataType, TimestampType):\n pdf[field.name] = \\\n _check_series_convert_timestamps_local_tz(pdf[field.name], timezone)\n return pdf", "response": "Returns the contents of this DataFrame as Pandas. DataFrame."} {"instruction": "Implement a Python 3 function for\nreturning all records as a list of ArrowRecordBatches", "response": "def _collectAsArrow(self):\n \"\"\"\n Returns all records as a list of ArrowRecordBatches, pyarrow must be installed\n and available on driver and worker Python environments.\n\n .. 
note:: Experimental.\n \"\"\"\n with SCCallSiteSync(self._sc) as css:\n sock_info = self._jdf.collectAsArrowToPython()\n\n # Collect list of un-ordered batches where last element is a list of correct order indices\n results = list(_load_from_socket(sock_info, ArrowCollectSerializer()))\n batches = results[:-1]\n batch_order = results[-1]\n\n # Re-order the batch list using the correct order\n return [batches[i] for i in batch_order]"} {"instruction": "Can you implement a function in Python 3 that\nreturns the statCounter members as a dict.", "response": "def asDict(self, sample=False):\n \"\"\"Returns the :class:`StatCounter` members as a ``dict``.\n\n >>> sc.parallelize([1., 2., 3., 4.]).stats().asDict()\n {'count': 4L,\n 'max': 4.0,\n 'mean': 2.5,\n 'min': 1.0,\n 'stdev': 1.2909944487358056,\n 'sum': 10.0,\n 'variance': 1.6666666666666667}\n \"\"\"\n return {\n 'count': self.count(),\n 'mean': self.mean(),\n 'sum': self.sum(),\n 'min': self.min(),\n 'max': self.max(),\n 'stdev': self.stdev() if sample else self.sampleStdev(),\n 'variance': self.variance() if sample else self.sampleVariance()\n }"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _list_function_infos(jvm):\n\n jinfos = jvm.org.apache.spark.sql.api.python.PythonSQLUtils.listBuiltinFunctionInfos()\n infos = []\n for jinfo in jinfos:\n name = jinfo.getName()\n usage = jinfo.getUsage()\n usage = usage.replace(\"_FUNC_\", name) if usage is not None else usage\n infos.append(ExpressionInfo(\n className=jinfo.getClassName(),\n name=name,\n usage=usage,\n arguments=jinfo.getArguments().replace(\"_FUNC_\", name),\n examples=jinfo.getExamples().replace(\"_FUNC_\", name),\n note=jinfo.getNote(),\n since=jinfo.getSince(),\n deprecated=jinfo.getDeprecated()))\n return sorted(infos, key=lambda i: i.name)", "response": "Returns a list of function information via JVM."} {"instruction": "How would you implement a function in Python 3 that\nmakes the usage description pretty and 
returns a formatted string.", "response": "def _make_pretty_usage(usage):\n \"\"\"\n Makes the usage description pretty and returns a formatted string if `usage`\n is not an empty string. Otherwise, returns None.\n \"\"\"\n\n if usage is not None and usage.strip() != \"\":\n usage = \"\\n\".join(map(lambda u: u.strip(), usage.split(\"\\n\")))\n return \"%s\\n\\n\" % usage"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _make_pretty_arguments(arguments):\n\n if arguments.startswith(\"\\n Arguments:\"):\n arguments = \"\\n\".join(map(lambda u: u[6:], arguments.strip().split(\"\\n\")[1:]))\n return \"**Arguments:**\\n\\n%s\\n\\n\" % arguments", "response": "Makes the arguments description pretty and returns a formatted string if arguments starts with the argument prefix. Otherwise returns None."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _make_pretty_examples(examples):\n\n if examples.startswith(\"\\n Examples:\"):\n examples = \"\\n\".join(map(lambda u: u[6:], examples.strip().split(\"\\n\")[1:]))\n return \"**Examples:**\\n\\n```\\n%s\\n```\\n\\n\" % examples", "response": "Makes the examples description pretty and returns a formatted string if examples starts with the example prefix. Otherwise returns None."} {"instruction": "Write a Python 3 function for\nmaking the note description pretty and returns a formatted string.", "response": "def _make_pretty_note(note):\n \"\"\"\n Makes the note description pretty and returns a formatted string if `note` is not\n an empty string. 
Otherwise, returns None.\n\n Expected input:\n\n ...\n\n Expected output:\n **Note:**\n\n ...\n\n \"\"\"\n\n if note != \"\":\n note = \"\\n\".join(map(lambda n: n[4:], note.split(\"\\n\")))\n return \"**Note:**\\n%s\\n\" % note"} {"instruction": "Write a Python 3 function that can\nmake the deprecated description pretty and returns a formatted string.", "response": "def _make_pretty_deprecated(deprecated):\n \"\"\"\n Makes the deprecated description pretty and returns a formatted string if `deprecated`\n is not an empty string. Otherwise, returns None.\n\n Expected input:\n\n ...\n\n Expected output:\n **Deprecated:**\n\n ...\n\n \"\"\"\n\n if deprecated != \"\":\n deprecated = \"\\n\".join(map(lambda n: n[4:], deprecated.split(\"\\n\")))\n return \"**Deprecated:**\\n%s\\n\" % deprecated"} {"instruction": "Given the following Python 3 function, write the documentation\ndef generate_sql_markdown(jvm, path):\n\n with open(path, 'w') as mdfile:\n for info in _list_function_infos(jvm):\n name = info.name\n usage = _make_pretty_usage(info.usage)\n arguments = _make_pretty_arguments(info.arguments)\n examples = _make_pretty_examples(info.examples)\n note = _make_pretty_note(info.note)\n since = info.since\n deprecated = _make_pretty_deprecated(info.deprecated)\n\n mdfile.write(\"### %s\\n\\n\" % name)\n if usage is not None:\n mdfile.write(\"%s\\n\\n\" % usage.strip())\n if arguments is not None:\n mdfile.write(arguments)\n if examples is not None:\n mdfile.write(examples)\n if note is not None:\n mdfile.write(note)\n if since is not None and since != \"\":\n mdfile.write(\"**Since:** %s\\n\\n\" % since.strip())\n if deprecated is not None:\n mdfile.write(deprecated)\n mdfile.write(\"
\\n\\n\")", "response": "Generates a markdown file for the specified applet."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef predict(self, x):\n if isinstance(x, RDD):\n return x.map(lambda v: self.predict(v))\n\n x = _convert_to_vector(x)\n if self.numClasses == 2:\n margin = self.weights.dot(x) + self._intercept\n if margin > 0:\n prob = 1 / (1 + exp(-margin))\n else:\n exp_margin = exp(margin)\n prob = exp_margin / (1 + exp_margin)\n if self._threshold is None:\n return prob\n else:\n return 1 if prob > self._threshold else 0\n else:\n best_class = 0\n max_margin = 0.0\n if x.size + 1 == self._dataWithBiasSize:\n for i in range(0, self._numClasses - 1):\n margin = x.dot(self._weightsMatrix[i][0:x.size]) + \\\n self._weightsMatrix[i][x.size]\n if margin > max_margin:\n max_margin = margin\n best_class = i + 1\n else:\n for i in range(0, self._numClasses - 1):\n margin = x.dot(self._weightsMatrix[i])\n if margin > max_margin:\n max_margin = margin\n best_class = i + 1\n return best_class", "response": "Predict the class of a single data point or an RDD of points."} {"instruction": "How would you explain what the following Python 3 function does\ndef save(self, sc, path):\n java_model = sc._jvm.org.apache.spark.mllib.classification.LogisticRegressionModel(\n _py2java(sc, self._coeff), self.intercept, self.numFeatures, self.numClasses)\n java_model.save(sc._jsc.sc(), path)", "response": "Save this logistic regression model to the given path."} {"instruction": "Explain what the following Python 3 code does\ndef train(cls, data, iterations=100, initialWeights=None, regParam=0.0, regType=\"l2\",\n intercept=False, corrections=10, tolerance=1e-6, validateData=True, numClasses=2):\n \"\"\"\n Train a logistic regression model on the given data.\n\n :param data:\n The training data, an RDD of LabeledPoint.\n :param iterations:\n The number of iterations.\n (default: 100)\n :param initialWeights:\n The initial weights.\n (default: None)\n 
:param regParam:\n The regularizer parameter.\n (default: 0.0)\n :param regType:\n The type of regularizer used for training our model.\n Supported values:\n\n - \"l1\" for using L1 regularization\n - \"l2\" for using L2 regularization (default)\n - None for no regularization\n :param intercept:\n Boolean parameter which indicates the use or not of the\n augmented representation for training data (i.e., whether bias\n features are activated or not).\n (default: False)\n :param corrections:\n The number of corrections used in the LBFGS update.\n If a known updater is used for binary classification,\n it calls the ml implementation and this parameter will\n have no effect. (default: 10)\n :param tolerance:\n The convergence tolerance of iterations for L-BFGS.\n (default: 1e-6)\n :param validateData:\n Boolean parameter which indicates if the algorithm should\n validate data before training.\n (default: True)\n :param numClasses:\n The number of classes (i.e., outcomes) a label can take in\n Multinomial Logistic Regression.\n (default: 2)\n\n >>> data = [\n ... LabeledPoint(0.0, [0.0, 1.0]),\n ... LabeledPoint(1.0, [1.0, 0.0]),\n ... 
]\n >>> lrm = LogisticRegressionWithLBFGS.train(sc.parallelize(data), iterations=10)\n >>> lrm.predict([1.0, 0.0])\n 1\n >>> lrm.predict([0.0, 1.0])\n 0\n \"\"\"\n def train(rdd, i):\n return callMLlibFunc(\"trainLogisticRegressionModelWithLBFGS\", rdd, int(iterations), i,\n float(regParam), regType, bool(intercept), int(corrections),\n float(tolerance), bool(validateData), int(numClasses))\n\n if initialWeights is None:\n if numClasses == 2:\n initialWeights = [0.0] * len(data.first().features)\n else:\n if intercept:\n initialWeights = [0.0] * (len(data.first().features) + 1) * (numClasses - 1)\n else:\n initialWeights = [0.0] * len(data.first().features) * (numClasses - 1)\n return _regression_train_wrapper(train, LogisticRegressionModel, data, initialWeights)", "response": "Train a logistic regression model on the given data."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef predict(self, x):\n if isinstance(x, RDD):\n return x.map(lambda v: self.predict(v))\n\n x = _convert_to_vector(x)\n margin = self.weights.dot(x) + self.intercept\n if self._threshold is None:\n return margin\n else:\n return 1 if margin > self._threshold else 0", "response": "Predict values for a single data point or an RDD of points\n "} {"instruction": "Given the following Python 3 function, write the documentation\ndef save(self, sc, path):\n java_model = sc._jvm.org.apache.spark.mllib.classification.SVMModel(\n _py2java(sc, self._coeff), self.intercept)\n java_model.save(sc._jsc.sc(), path)", "response": "Save this model to the given path."} {"instruction": "Implement a function in Python 3 to\nload a SVM model from the given path.", "response": "def load(cls, sc, path):\n \"\"\"\n Load a model from the given path.\n \"\"\"\n java_model = sc._jvm.org.apache.spark.mllib.classification.SVMModel.load(\n sc._jsc.sc(), path)\n weights = _java2py(sc, java_model.weights())\n intercept = java_model.intercept()\n threshold = 
java_model.getThreshold().get()\n model = SVMModel(weights, intercept)\n model.setThreshold(threshold)\n return model"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef train(cls, data, lambda_=1.0):\n first = data.first()\n if not isinstance(first, LabeledPoint):\n raise ValueError(\"`data` should be an RDD of LabeledPoint\")\n labels, pi, theta = callMLlibFunc(\"trainNaiveBayesModel\", data, lambda_)\n return NaiveBayesModel(labels.toArray(), pi.toArray(), numpy.array(theta))", "response": "Train a Naive Bayes model given an RDD of LabeledPoint vectors."} {"instruction": "Make a summary of the following Python 3 code\ndef heappush(heap, item):\n heap.append(item)\n _siftdown(heap, 0, len(heap)-1)", "response": "Push item onto heap maintaining the heap invariant."} {"instruction": "How would you explain what the following Python 3 function does\ndef heappop(heap):\n lastelt = heap.pop() # raises appropriate IndexError if heap is empty\n if heap:\n returnitem = heap[0]\n heap[0] = lastelt\n _siftup(heap, 0)\n return returnitem\n return lastelt", "response": "Pop the smallest item off the heap maintaining the heap invariant."} {"instruction": "Create a Python 3 function to\npop and return the current smallest value and add the new item.", "response": "def heapreplace(heap, item):\n \"\"\"Pop and return the current smallest value, and add the new item.\n\n This is more efficient than heappop() followed by heappush(), and can be\n more appropriate when using a fixed-size heap. Note that the value\n returned may be larger than item! 
That constrains reasonable uses of\n this routine unless written as part of a conditional replacement:\n\n if item > heap[0]:\n item = heapreplace(heap, item)\n \"\"\"\n returnitem = heap[0] # raises appropriate IndexError if heap is empty\n heap[0] = item\n _siftup(heap, 0)\n return returnitem"} {"instruction": "Write a Python 3 script to\nfast version of a heappush followed by a heappop.", "response": "def heappushpop(heap, item):\n \"\"\"Fast version of a heappush followed by a heappop.\"\"\"\n if heap and heap[0] < item:\n item, heap[0] = heap[0], item\n _siftup(heap, 0)\n return item"} {"instruction": "Write a Python 3 script to\ntransform list into a heap in - place in O ( len ( x ) time.", "response": "def heapify(x):\n \"\"\"Transform list into a heap, in-place, in O(len(x)) time.\"\"\"\n n = len(x)\n # Transform bottom-up. The largest index there's any point to looking at\n # is the largest with a child index in-range, so must have 2*i + 1 < n,\n # or i < (n-1)/2. If n is even = 2*j, this is (2*j-1)/2 = j-1/2 so\n # j-1 is the largest, which is n//2 - 1. 
If n is odd = 2*j+1, this is\n # (2*j+1-1)/2 = j so j-1 is the largest, and that's again n//2-1.\n for i in reversed(range(n//2)):\n _siftup(x, i)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _heappop_max(heap):\n lastelt = heap.pop() # raises appropriate IndexError if heap is empty\n if heap:\n returnitem = heap[0]\n heap[0] = lastelt\n _siftup_max(heap, 0)\n return returnitem\n return lastelt", "response": "Maxheap version of a heappop."} {"instruction": "Given the following Python 3 function, write the documentation\ndef _heapreplace_max(heap, item):\n returnitem = heap[0] # raises appropriate IndexError if heap is empty\n heap[0] = item\n _siftup_max(heap, 0)\n return returnitem", "response": "Maxheap version of a heappop followed by a heappush."} {"instruction": "Can you implement a function in Python 3 that\ntransforms list into a maxheap in - place in O ( len ( x ) time.", "response": "def _heapify_max(x):\n \"\"\"Transform list into a maxheap, in-place, in O(len(x)) time.\"\"\"\n n = len(x)\n for i in reversed(range(n//2)):\n _siftup_max(x, i)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _siftup_max(heap, pos):\n 'Maxheap variant of _siftup'\n endpos = len(heap)\n startpos = pos\n newitem = heap[pos]\n # Bubble up the larger child until hitting a leaf.\n childpos = 2*pos + 1 # leftmost child position\n while childpos < endpos:\n # Set childpos to index of larger child.\n rightpos = childpos + 1\n if rightpos < endpos and not heap[rightpos] < heap[childpos]:\n childpos = rightpos\n # Move the larger child up.\n heap[pos] = heap[childpos]\n pos = childpos\n childpos = 2*pos + 1\n # The leaf at pos is empty now. 
Put newitem there, and bubble it up\n # to its final resting place (by sifting its parents down).\n heap[pos] = newitem\n _siftdown_max(heap, startpos, pos)", "response": "Maxheap variant of _siftup"} {"instruction": "Explain what the following Python 3 code does\ndef merge(iterables, key=None, reverse=False):\n '''Merge multiple sorted inputs into a single sorted output.\n\n Similar to sorted(itertools.chain(*iterables)) but returns a generator,\n does not pull the data into memory all at once, and assumes that each of\n the input streams is already sorted (smallest to largest).\n\n >>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25]))\n [0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25]\n\n If *key* is not None, applies a key function to each element to determine\n its sort order.\n\n >>> list(merge(['dog', 'horse'], ['cat', 'fish', 'kangaroo'], key=len))\n ['dog', 'cat', 'fish', 'horse', 'kangaroo']\n\n '''\n\n h = []\n h_append = h.append\n\n if reverse:\n _heapify = _heapify_max\n _heappop = _heappop_max\n _heapreplace = _heapreplace_max\n direction = -1\n else:\n _heapify = heapify\n _heappop = heappop\n _heapreplace = heapreplace\n direction = 1\n\n if key is None:\n for order, it in enumerate(map(iter, iterables)):\n try:\n h_append([next(it), order * direction, it])\n except StopIteration:\n pass\n _heapify(h)\n while len(h) > 1:\n try:\n while True:\n value, order, it = s = h[0]\n yield value\n s[0] = next(it) # raises StopIteration when exhausted\n _heapreplace(h, s) # restore heap condition\n except StopIteration:\n _heappop(h) # remove empty iterator\n if h:\n # fast case when only a single iterator remains\n value, order, it = h[0]\n yield value\n for value in it:\n yield value\n return\n\n for order, it in enumerate(map(iter, iterables)):\n try:\n value = next(it)\n h_append([key(value), order * direction, value, it])\n except StopIteration:\n pass\n _heapify(h)\n while len(h) > 1:\n try:\n while True:\n key_value, order, value, it = s = h[0]\n 
yield value\n value = next(it)\n s[0] = key(value)\n s[2] = value\n _heapreplace(h, s)\n except StopIteration:\n _heappop(h)\n if h:\n key_value, order, value, it = h[0]\n yield value\n for value in it:\n yield value", "response": "Merge multiple sorted inputs into a single sorted output."} {"instruction": "Can you implement a function in Python 3 that\nfinds the n smallest elements in a dataset.", "response": "def nsmallest(n, iterable, key=None):\n \"\"\"Find the n smallest elements in a dataset.\n\n Equivalent to: sorted(iterable, key=key)[:n]\n \"\"\"\n\n # Short-cut for n==1 is to use min()\n if n == 1:\n it = iter(iterable)\n sentinel = object()\n if key is None:\n result = min(it, default=sentinel)\n else:\n result = min(it, default=sentinel, key=key)\n return [] if result is sentinel else [result]\n\n # When n>=size, it's faster to use sorted()\n try:\n size = len(iterable)\n except (TypeError, AttributeError):\n pass\n else:\n if n >= size:\n return sorted(iterable, key=key)[:n]\n\n # When key is none, use simpler decoration\n if key is None:\n it = iter(iterable)\n # put the range(n) first so that zip() doesn't\n # consume one too many elements from the iterator\n result = [(elem, i) for i, elem in zip(range(n), it)]\n if not result:\n return result\n _heapify_max(result)\n top = result[0][0]\n order = n\n _heapreplace = _heapreplace_max\n for elem in it:\n if elem < top:\n _heapreplace(result, (elem, order))\n top = result[0][0]\n order += 1\n result.sort()\n return [r[0] for r in result]\n\n # General case, slowest method\n it = iter(iterable)\n result = [(key(elem), i, elem) for i, elem in zip(range(n), it)]\n if not result:\n return result\n _heapify_max(result)\n top = result[0][0]\n order = n\n _heapreplace = _heapreplace_max\n for elem in it:\n k = key(elem)\n if k < top:\n _heapreplace(result, (k, order, elem))\n top = result[0][0]\n order += 1\n result.sort()\n return [r[2] for r in result]"} {"instruction": "Make a summary of the following 
Python 3 code\ndef nlargest(n, iterable, key=None):\n\n # Short-cut for n==1 is to use max()\n if n == 1:\n it = iter(iterable)\n sentinel = object()\n if key is None:\n result = max(it, default=sentinel)\n else:\n result = max(it, default=sentinel, key=key)\n return [] if result is sentinel else [result]\n\n # When n>=size, it's faster to use sorted()\n try:\n size = len(iterable)\n except (TypeError, AttributeError):\n pass\n else:\n if n >= size:\n return sorted(iterable, key=key, reverse=True)[:n]\n\n # When key is none, use simpler decoration\n if key is None:\n it = iter(iterable)\n result = [(elem, i) for i, elem in zip(range(0, -n, -1), it)]\n if not result:\n return result\n heapify(result)\n top = result[0][0]\n order = -n\n _heapreplace = heapreplace\n for elem in it:\n if top < elem:\n _heapreplace(result, (elem, order))\n top = result[0][0]\n order -= 1\n result.sort(reverse=True)\n return [r[0] for r in result]\n\n # General case, slowest method\n it = iter(iterable)\n result = [(key(elem), i, elem) for i, elem in zip(range(0, -n, -1), it)]\n if not result:\n return result\n heapify(result)\n top = result[0][0]\n order = -n\n _heapreplace = heapreplace\n for elem in it:\n k = key(elem)\n if top < k:\n _heapreplace(result, (k, order, elem))\n top = result[0][0]\n order -= 1\n result.sort(reverse=True)\n return [r[2] for r in result]", "response": "Find the n largest elements in a dataset."} {"instruction": "How would you explain what the following Python 3 function does\ndef corr(dataset, column, method=\"pearson\"):\n sc = SparkContext._active_spark_context\n javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation\n args = [_py2java(sc, arg) for arg in (dataset, column, method)]\n return _java2py(sc, javaCorrObj.corr(*args))", "response": "Compute the correlation matrix of a column of vectors using a dataset."} {"instruction": "Given the following Python 3 function, write the documentation\ndef metrics(*metrics):\n sc = 
SparkContext._active_spark_context\n js = JavaWrapper._new_java_obj(\"org.apache.spark.ml.stat.Summarizer.metrics\",\n _to_seq(sc, metrics))\n return SummaryBuilder(js)", "response": "This function takes a list of metrics and returns a builder that it turns computes metrics from a column."} {"instruction": "How would you explain what the following Python 3 function does\ndef summary(self, featuresCol, weightCol=None):\n featuresCol, weightCol = Summarizer._check_param(featuresCol, weightCol)\n return Column(self._java_obj.summary(featuresCol._jc, weightCol._jc))", "response": "Returns an aggregate object that contains the summary of the requested features Vector object and weight value."} {"instruction": "How would you explain what the following Python 3 function does\ndef corr(x, y=None, method=None):\n # Check inputs to determine whether a single value or a matrix is needed for output.\n # Since it's legal for users to use the method name as the second argument, we need to\n # check if y is used to specify the method name instead.\n if type(y) == str:\n raise TypeError(\"Use 'method=' to specify method name.\")\n\n if not y:\n return callMLlibFunc(\"corr\", x.map(_convert_to_vector), method).toArray()\n else:\n return callMLlibFunc(\"corr\", x.map(float), y.map(float), method)", "response": "Compute the correlation matrix for the input RDDs x and y."} {"instruction": "Write a Python 3 script to\nfit an estimator in parallel and evaluate the next set of training and validation data sets.", "response": "def _parallelFitTasks(est, train, eva, validation, epm, collectSubModel):\n \"\"\"\n Creates a list of callables which can be called from different threads to fit and evaluate\n an estimator in parallel. 
Each callable returns an `(index, metric)` pair.\n\n :param est: Estimator, the estimator to be fit.\n :param train: DataFrame, training data set, used for fitting.\n :param eva: Evaluator, used to compute `metric`\n :param validation: DataFrame, validation data set, used for evaluation.\n :param epm: Sequence of ParamMap, params maps to be used during fitting & evaluation.\n :param collectSubModel: Whether to collect sub model.\n :return: (int, float, subModel), an index into `epm` and the associated metric value.\n \"\"\"\n modelIter = est.fitMultiple(train, epm)\n\n def singleTask():\n index, model = next(modelIter)\n metric = eva.evaluate(model.transform(validation, epm[index]))\n return index, metric, model if collectSubModel else None\n\n return [singleTask] * len(epm)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef baseOn(self, *args):\n if isinstance(args[0], dict):\n self.baseOn(*args[0].items())\n else:\n for (param, value) in args:\n self.addGrid(param, [value])\n\n return self", "response": "Sets the given parameters in this grid to fixed values."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef build(self):\n keys = self._param_grid.keys()\n grid_values = self._param_grid.values()\n\n def to_key_value_pairs(keys, values):\n return [(key, key.typeConverter(value)) for key, value in zip(keys, values)]\n\n return [dict(to_key_value_pairs(keys, prod)) for prod in itertools.product(*grid_values)]", "response": "Builds and returns all combinations of parameters specified\n by the param grid."} {"instruction": "Write a Python 3 function that can\nreturn Python estimator estimatorParamMaps and evaluator from a Java ValidatorParams.", "response": "def _from_java_impl(cls, java_stage):\n \"\"\"\n Return Python estimator, estimatorParamMaps, and evaluator from a Java ValidatorParams.\n \"\"\"\n\n # Load information from java_stage to the instance.\n estimator = 
JavaParams._from_java(java_stage.getEstimator())\n evaluator = JavaParams._from_java(java_stage.getEvaluator())\n epms = [estimator._transfer_param_map_from_java(epm)\n for epm in java_stage.getEstimatorParamMaps()]\n return estimator, epms, evaluator"} {"instruction": "Can you implement a function in Python 3 that\nreturns Java estimator estimatorParamMaps and evaluator from this Python instance.", "response": "def _to_java_impl(self):\n \"\"\"\n Return Java estimator, estimatorParamMaps, and evaluator from this Python instance.\n \"\"\"\n\n gateway = SparkContext._gateway\n cls = SparkContext._jvm.org.apache.spark.ml.param.ParamMap\n\n java_epms = gateway.new_array(cls, len(self.getEstimatorParamMaps()))\n for idx, epm in enumerate(self.getEstimatorParamMaps()):\n java_epms[idx] = self.getEstimator()._transfer_param_map_to_java(epm)\n\n java_estimator = self.getEstimator()._to_java()\n java_evaluator = self.getEvaluator()._to_java()\n return java_estimator, java_epms, java_evaluator"} {"instruction": "How would you explain what the following Python 3 function does\ndef _from_java(cls, java_stage):\n\n estimator, epms, evaluator = super(CrossValidator, cls)._from_java_impl(java_stage)\n numFolds = java_stage.getNumFolds()\n seed = java_stage.getSeed()\n parallelism = java_stage.getParallelism()\n collectSubModels = java_stage.getCollectSubModels()\n # Create a new instance of this stage.\n py_stage = cls(estimator=estimator, estimatorParamMaps=epms, evaluator=evaluator,\n numFolds=numFolds, seed=seed, parallelism=parallelism,\n collectSubModels=collectSubModels)\n py_stage._resetUid(java_stage.uid())\n return py_stage", "response": "Given a Java CrossValidator create and return a Python wrapper of it."} {"instruction": "Write a Python 3 script for\ntransferring this instance to a Java CrossValidator. Used for ML persistence.", "response": "def _to_java(self):\n \"\"\"\n Transfer this instance to a Java CrossValidator. 
Used for ML persistence.\n\n :return: Java object equivalent to this instance.\n \"\"\"\n\n estimator, epms, evaluator = super(CrossValidator, self)._to_java_impl()\n\n _java_obj = JavaParams._new_java_obj(\"org.apache.spark.ml.tuning.CrossValidator\", self.uid)\n _java_obj.setEstimatorParamMaps(epms)\n _java_obj.setEvaluator(evaluator)\n _java_obj.setEstimator(estimator)\n _java_obj.setSeed(self.getSeed())\n _java_obj.setNumFolds(self.getNumFolds())\n _java_obj.setParallelism(self.getParallelism())\n _java_obj.setCollectSubModels(self.getCollectSubModels())\n\n return _java_obj"} {"instruction": "Explain what the following Python 3 code does\ndef copy(self, extra=None):\n if extra is None:\n extra = dict()\n bestModel = self.bestModel.copy(extra)\n avgMetrics = self.avgMetrics\n subModels = self.subModels\n return CrossValidatorModel(bestModel, avgMetrics, subModels)", "response": "Creates a deep copy of the current instance with a randomly generated uid\n and extra parameters."} {"instruction": "Write a Python 3 script for\nsetting the parameters for the train validation split.", "response": "def setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, trainRatio=0.75,\n parallelism=1, collectSubModels=False, seed=None):\n \"\"\"\n setParams(self, estimator=None, estimatorParamMaps=None, evaluator=None, trainRatio=0.75,\\\n parallelism=1, collectSubModels=False, seed=None):\n Sets params for the train validation split.\n \"\"\"\n kwargs = self._input_kwargs\n return self._set(**kwargs)"} {"instruction": "Explain what the following Python 3 code does\ndef copy(self, extra=None):\n if extra is None:\n extra = dict()\n newTVS = Params.copy(self, extra)\n if self.isSet(self.estimator):\n newTVS.setEstimator(self.getEstimator().copy(extra))\n # estimatorParamMaps remain the same\n if self.isSet(self.evaluator):\n newTVS.setEvaluator(self.getEvaluator().copy(extra))\n return newTVS", "response": "Creates a deep copy of this instance with a randomly 
generated uid\n and some extra parameters."} {"instruction": "Write a Python 3 script for\ncreating and return a Python wrapper of the given Java TrainValidationSplit.", "response": "def _from_java(cls, java_stage):\n \"\"\"\n Given a Java TrainValidationSplit, create and return a Python wrapper of it.\n Used for ML persistence.\n \"\"\"\n\n estimator, epms, evaluator = super(TrainValidationSplit, cls)._from_java_impl(java_stage)\n trainRatio = java_stage.getTrainRatio()\n seed = java_stage.getSeed()\n parallelism = java_stage.getParallelism()\n collectSubModels = java_stage.getCollectSubModels()\n # Create a new instance of this stage.\n py_stage = cls(estimator=estimator, estimatorParamMaps=epms, evaluator=evaluator,\n trainRatio=trainRatio, seed=seed, parallelism=parallelism,\n collectSubModels=collectSubModels)\n py_stage._resetUid(java_stage.uid())\n return py_stage"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _to_java(self):\n\n estimator, epms, evaluator = super(TrainValidationSplit, self)._to_java_impl()\n\n _java_obj = JavaParams._new_java_obj(\"org.apache.spark.ml.tuning.TrainValidationSplit\",\n self.uid)\n _java_obj.setEstimatorParamMaps(epms)\n _java_obj.setEvaluator(evaluator)\n _java_obj.setEstimator(estimator)\n _java_obj.setTrainRatio(self.getTrainRatio())\n _java_obj.setSeed(self.getSeed())\n _java_obj.setParallelism(self.getParallelism())\n _java_obj.setCollectSubModels(self.getCollectSubModels())\n return _java_obj", "response": "Transfer this instance to a Java TrainValidationSplit. 
Used for ML persistence."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef copy(self, extra=None):\n if extra is None:\n extra = dict()\n bestModel = self.bestModel.copy(extra)\n validationMetrics = list(self.validationMetrics)\n subModels = self.subModels\n return TrainValidationSplitModel(bestModel, validationMetrics, subModels)", "response": "Creates a deep copy of the current instance with a randomly generated uid\n and extra parameters."} {"instruction": "How would you explain what the following Python 3 function does\ndef _from_java(cls, java_stage):\n\n # Load information from java_stage to the instance.\n bestModel = JavaParams._from_java(java_stage.bestModel())\n estimator, epms, evaluator = super(TrainValidationSplitModel,\n cls)._from_java_impl(java_stage)\n # Create a new instance of this stage.\n py_stage = cls(bestModel=bestModel).setEstimator(estimator)\n py_stage = py_stage.setEstimatorParamMaps(epms).setEvaluator(evaluator)\n\n if java_stage.hasSubModels():\n py_stage.subModels = [JavaParams._from_java(sub_model)\n for sub_model in java_stage.subModels()]\n\n py_stage._resetUid(java_stage.uid())\n return py_stage", "response": "Given a Java TrainValidationSplitModel create and return a Python wrapper of it."} {"instruction": "Create a Python 3 function to\ntransfer this instance to a Java TrainValidationSplitModel. Used for ML persistence.", "response": "def _to_java(self):\n \"\"\"\n Transfer this instance to a Java TrainValidationSplitModel. 
Used for ML persistence.\n :return: Java object equivalent to this instance.\n \"\"\"\n\n sc = SparkContext._active_spark_context\n # TODO: persst validation metrics as well\n _java_obj = JavaParams._new_java_obj(\n \"org.apache.spark.ml.tuning.TrainValidationSplitModel\",\n self.uid,\n self.bestModel._to_java(),\n _py2java(sc, []))\n estimator, epms, evaluator = super(TrainValidationSplitModel, self)._to_java_impl()\n\n _java_obj.set(\"evaluator\", evaluator)\n _java_obj.set(\"estimator\", estimator)\n _java_obj.set(\"estimatorParamMaps\", epms)\n\n if self.subModels is not None:\n java_sub_models = [sub_model._to_java() for sub_model in self.subModels]\n _java_obj.setSubModels(java_sub_models)\n\n return _java_obj"} {"instruction": "Make a summary of the following Python 3 code\ndef get(self, key, default=_NoValue):\n self._checkType(key, \"key\")\n if default is _NoValue:\n return self._jconf.get(key)\n else:\n if default is not None:\n self._checkType(default, \"default\")\n return self._jconf.get(key, default)", "response": "Returns the value of Spark runtime configuration property for the given key assuming it is set."} {"instruction": "Write a Python 3 script for\nasserting that an object is of type str.", "response": "def _checkType(self, obj, identifier):\n \"\"\"Assert that an object is of type str.\"\"\"\n if not isinstance(obj, basestring):\n raise TypeError(\"expected %s '%s' to be a string (was '%s')\" %\n (identifier, obj, type(obj).__name__))"} {"instruction": "Implement a Python 3 function for\ncreating a PySpark function by its name and doc", "response": "def _create_function(name, doc=\"\"):\n \"\"\"Create a PySpark function by its name\"\"\"\n def _(col):\n sc = SparkContext._active_spark_context\n jc = getattr(sc._jvm.functions, name)(col._jc if isinstance(col, Column) else col)\n return Column(jc)\n _.__name__ = name\n _.__doc__ = doc\n return _"} {"instruction": "Create a Python 3 function to\nwrap the deprecated function to print out 
deprecation warnings", "response": "def _wrap_deprecated_function(func, message):\n \"\"\" Wrap the deprecated function to print out deprecation warnings\"\"\"\n def _(col):\n warnings.warn(message, DeprecationWarning)\n return func(col)\n return functools.wraps(func)(_)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _create_binary_mathfunction(name, doc=\"\"):\n def _(col1, col2):\n sc = SparkContext._active_spark_context\n # For legacy reasons, the arguments here can be implicitly converted into floats,\n # if they are not columns or strings.\n if isinstance(col1, Column):\n arg1 = col1._jc\n elif isinstance(col1, basestring):\n arg1 = _create_column_from_name(col1)\n else:\n arg1 = float(col1)\n\n if isinstance(col2, Column):\n arg2 = col2._jc\n elif isinstance(col2, basestring):\n arg2 = _create_column_from_name(col2)\n else:\n arg2 = float(col2)\n\n jc = getattr(sc._jvm.functions, name)(arg1, arg2)\n return Column(jc)\n _.__name__ = name\n _.__doc__ = doc\n return _", "response": "Create a binary math function by name"} {"instruction": "How would you code a function in Python 3 to\ncreate a window function by name.", "response": "def _create_window_function(name, doc=''):\n \"\"\" Create a window function by name \"\"\"\n def _():\n sc = SparkContext._active_spark_context\n jc = getattr(sc._jvm.functions, name)()\n return Column(jc)\n _.__name__ = name\n _.__doc__ = 'Window function: ' + doc\n return _"} {"instruction": "How would you implement a function in Python 3 that\naggregates function that returns a new column for approximate distinct count of column col.", "response": "def approx_count_distinct(col, rsd=None):\n \"\"\"Aggregate function: returns a new :class:`Column` for approximate distinct count of\n column `col`.\n\n :param rsd: maximum estimation error allowed (default = 0.05). 
For rsd < 0.01, it is more\n efficient to use :func:`countDistinct`\n\n >>> df.agg(approx_count_distinct(df.age).alias('distinct_ages')).collect()\n [Row(distinct_ages=2)]\n \"\"\"\n sc = SparkContext._active_spark_context\n if rsd is None:\n jc = sc._jvm.functions.approx_count_distinct(_to_java_column(col))\n else:\n jc = sc._jvm.functions.approx_count_distinct(_to_java_column(col), rsd)\n return Column(jc)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef broadcast(df):\n\n sc = SparkContext._active_spark_context\n return DataFrame(sc._jvm.functions.broadcast(df._jdf), df.sql_ctx)", "response": "Marks a DataFrame as small enough for use in broadcast joins."} {"instruction": "Create a Python 3 function to\nreturn a new column for distinct count of col or cols.", "response": "def countDistinct(col, *cols):\n \"\"\"Returns a new :class:`Column` for distinct count of ``col`` or ``cols``.\n\n >>> df.agg(countDistinct(df.age, df.name).alias('c')).collect()\n [Row(c=2)]\n\n >>> df.agg(countDistinct(\"age\", \"name\").alias('c')).collect()\n [Row(c=2)]\n \"\"\"\n sc = SparkContext._active_spark_context\n jc = sc._jvm.functions.countDistinct(_to_java_column(col), _to_seq(sc, cols, _to_java_column))\n return Column(jc)"} {"instruction": "Make a summary of the following Python 3 code\ndef last(col, ignorenulls=False):\n sc = SparkContext._active_spark_context\n jc = sc._jvm.functions.last(_to_java_column(col), ignorenulls)\n return Column(jc)", "response": "Aggregate function that returns the last value in a group."} {"instruction": "Create a Python 3 function for\nreturning col1 if it is not NaN or col2 if col1 is NaN.", "response": "def nanvl(col1, col2):\n \"\"\"Returns col1 if it is not NaN, or col2 if col1 is NaN.\n\n Both inputs should be floating point columns (:class:`DoubleType` or :class:`FloatType`).\n\n >>> df = spark.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], (\"a\", \"b\"))\n >>> df.select(nanvl(\"a\", 
\"b\").alias(\"r1\"), nanvl(df.a, df.b).alias(\"r2\")).collect()\n [Row(r1=1.0, r2=1.0), Row(r1=2.0, r2=2.0)]\n \"\"\"\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.nanvl(_to_java_column(col1), _to_java_column(col2)))"} {"instruction": "Can you create a Python 3 function that\ngenerates a random column with independent and identically distributed from U [ 0 1 ).", "response": "def rand(seed=None):\n \"\"\"Generates a random column with independent and identically distributed (i.i.d.) samples\n from U[0.0, 1.0].\n\n .. note:: The function is non-deterministic in general case.\n\n >>> df.withColumn('rand', rand(seed=42) * 3).collect()\n [Row(age=2, name=u'Alice', rand=2.4052597283576684),\n Row(age=5, name=u'Bob', rand=2.3913904055683974)]\n \"\"\"\n sc = SparkContext._active_spark_context\n if seed is not None:\n jc = sc._jvm.functions.rand(seed)\n else:\n jc = sc._jvm.functions.rand()\n return Column(jc)"} {"instruction": "Can you write a function in Python 3 where it\nrounds the given value to a given scale decimal places using HALF_UP rounding mode.", "response": "def round(col, scale=0):\n \"\"\"\n Round the given value to `scale` decimal places using HALF_UP rounding mode if `scale` >= 0\n or at integral part when `scale` < 0.\n\n >>> spark.createDataFrame([(2.5,)], ['a']).select(round('a', 0).alias('r')).collect()\n [Row(r=3.0)]\n \"\"\"\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.round(_to_java_column(col), scale))"} {"instruction": "Can you create a Python 3 function that\nshifts the given value numBits left.", "response": "def shiftLeft(col, numBits):\n \"\"\"Shift the given value numBits left.\n\n >>> spark.createDataFrame([(21,)], ['a']).select(shiftLeft('a', 1).alias('r')).collect()\n [Row(r=42)]\n \"\"\"\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.shiftLeft(_to_java_column(col), numBits))"} {"instruction": "Implement a Python 3 function for\nshifting the given 
value numBits right.", "response": "def shiftRight(col, numBits):\n \"\"\"(Signed) shift the given value numBits right.\n\n >>> spark.createDataFrame([(42,)], ['a']).select(shiftRight('a', 1).alias('r')).collect()\n [Row(r=21)]\n \"\"\"\n sc = SparkContext._active_spark_context\n jc = sc._jvm.functions.shiftRight(_to_java_column(col), numBits)\n return Column(jc)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef expr(str):\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.expr(str))", "response": "Parses the expression string into the column that it represents\n "} {"instruction": "Write a Python 3 function that can\nevaluate a list of conditions and returns one of multiple possible result expressions.", "response": "def when(condition, value):\n \"\"\"Evaluates a list of conditions and returns one of multiple possible result expressions.\n If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.\n\n :param condition: a boolean :class:`Column` expression.\n :param value: a literal value, or a :class:`Column` expression.\n\n >>> df.select(when(df['age'] == 2, 3).otherwise(4).alias(\"age\")).collect()\n [Row(age=3), Row(age=4)]\n\n >>> df.select(when(df.age == 2, df.age + 1).alias(\"age\")).collect()\n [Row(age=3), Row(age=None)]\n \"\"\"\n sc = SparkContext._active_spark_context\n if not isinstance(condition, Column):\n raise TypeError(\"condition should be a Column\")\n v = value._jc if isinstance(value, Column) else value\n jc = sc._jvm.functions.when(condition._jc, v)\n return Column(jc)"} {"instruction": "How would you code a function in Python 3 to\nreturn the first argument - based logarithm of the second argument.", "response": "def log(arg1, arg2=None):\n \"\"\"Returns the first argument-based logarithm of the second argument.\n\n If there is only one argument, then this takes the natural logarithm of the argument.\n\n >>> df.select(log(10.0, 
df.age).alias('ten')).rdd.map(lambda l: str(l.ten)[:7]).collect()\n ['0.30102', '0.69897']\n\n >>> df.select(log(df.age).alias('e')).rdd.map(lambda l: str(l.e)[:7]).collect()\n ['0.69314', '1.60943']\n \"\"\"\n sc = SparkContext._active_spark_context\n if arg2 is None:\n jc = sc._jvm.functions.log(_to_java_column(arg1))\n else:\n jc = sc._jvm.functions.log(arg1, _to_java_column(arg2))\n return Column(jc)"} {"instruction": "Create a Python 3 function for\nconverting a number in a string column from one base to another.", "response": "def conv(col, fromBase, toBase):\n \"\"\"\n Convert a number in a string column from one base to another.\n\n >>> df = spark.createDataFrame([(\"010101\",)], ['n'])\n >>> df.select(conv(df.n, 2, 16).alias('hex')).collect()\n [Row(hex=u'15')]\n \"\"\"\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.conv(_to_java_column(col), fromBase, toBase))"} {"instruction": "Make a summary of the following Python 3 code\ndef lag(col, offset=1, default=None):\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.lag(_to_java_column(col), offset, default))", "response": "Returns the value that is less than or equal to the current row."} {"instruction": "Can you implement a function in Python 3 that\nwindows function: returns the ntile group id (from 1 to `n` inclusive) in an ordered window partition. For example, if `n` is 4, the first quarter of the rows will get value 1, the second quarter will get 2, the third quarter will get 3, and the last quarter will get 4. This is equivalent to the NTILE function in SQL. :param n: an integer", "response": "def ntile(n):\n \"\"\"\n Window function: returns the ntile group id (from 1 to `n` inclusive)\n in an ordered window partition. 
For example, if `n` is 4, the first\n quarter of the rows will get value 1, the second quarter will get 2,\n the third quarter will get 3, and the last quarter will get 4.\n\n This is equivalent to the NTILE function in SQL.\n\n :param n: an integer\n \"\"\"\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.ntile(int(n)))"} {"instruction": "Create a Python 3 function to\nconvert a date to a value of string in the format specified by the date tuple format given by the second argument.", "response": "def date_format(date, format):\n \"\"\"\n Converts a date/timestamp/string to a value of string in the format specified by the date\n format given by the second argument.\n\n A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All\n pattern letters of the Java class `java.time.format.DateTimeFormatter` can be used.\n\n .. note:: Use when ever possible specialized functions like `year`. These benefit from a\n specialized implementation.\n\n >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])\n >>> df.select(date_format('dt', 'MM/dd/yyy').alias('date')).collect()\n [Row(date=u'04/08/2015')]\n \"\"\"\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.date_format(_to_java_column(date), format))"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef date_add(start, days):\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.date_add(_to_java_column(start), days))", "response": "Returns the date that is days after start"} {"instruction": "Explain what the following Python 3 code does\ndef datediff(end, start):\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.datediff(_to_java_column(end), _to_java_column(start)))", "response": "Returns the number of days from start to end."} {"instruction": "Write a Python 3 script for\nreturning the date that is months after start", "response": "def 
add_months(start, months):\n \"\"\"\n Returns the date that is `months` months after `start`\n\n >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])\n >>> df.select(add_months(df.dt, 1).alias('next_month')).collect()\n [Row(next_month=datetime.date(2015, 5, 8))]\n \"\"\"\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.add_months(_to_java_column(start), months))"} {"instruction": "How would you explain what the following Python 3 function does\ndef months_between(date1, date2, roundOff=True):\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.months_between(\n _to_java_column(date1), _to_java_column(date2), roundOff))", "response": "Returns the number of months between dates date1 and date2."} {"instruction": "Given the following Python 3 function, write the documentation\ndef to_date(col, format=None):\n sc = SparkContext._active_spark_context\n if format is None:\n jc = sc._jvm.functions.to_date(_to_java_column(col))\n else:\n jc = sc._jvm.functions.to_date(_to_java_column(col), format)\n return Column(jc)", "response": "Converts a column of type datetime. 
date into a Spark date."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef date_trunc(format, timestamp):\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.date_trunc(format, _to_java_column(timestamp)))", "response": "Returns timestamp truncated to the unit specified by the format."} {"instruction": "Implement a Python 3 function for\nreturning the first date which is later than the value of the date column.", "response": "def next_day(date, dayOfWeek):\n \"\"\"\n Returns the first date which is later than the value of the date column.\n\n Day of the week parameter is case insensitive, and accepts:\n \"Mon\", \"Tue\", \"Wed\", \"Thu\", \"Fri\", \"Sat\", \"Sun\".\n\n >>> df = spark.createDataFrame([('2015-07-27',)], ['d'])\n >>> df.select(next_day(df.d, 'Sun').alias('date')).collect()\n [Row(date=datetime.date(2015, 8, 2))]\n \"\"\"\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.next_day(_to_java_column(date), dayOfWeek))"} {"instruction": "Explain what the following Python 3 code does\ndef last_day(date):\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.last_day(_to_java_column(date)))", "response": "Returns the last day of the month which the given date belongs to."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef unix_timestamp(timestamp=None, format='yyyy-MM-dd HH:mm:ss'):\n sc = SparkContext._active_spark_context\n if timestamp is None:\n return Column(sc._jvm.functions.unix_timestamp())\n return Column(sc._jvm.functions.unix_timestamp(_to_java_column(timestamp), format))", "response": "Convert time string with given pattern ( YYYY - MM - dd HH : mm : ss to Unix time stamp."} {"instruction": "Explain what the following Python 3 code does\ndef hash(*cols):\n sc = SparkContext._active_spark_context\n jc = sc._jvm.functions.hash(_to_seq(sc, cols, _to_java_column))\n return Column(jc)", "response": 
"Calculates the hash code of given columns and returns the result as an int column."} {"instruction": "Make a summary of the following Python 3 code\ndef concat_ws(sep, *cols):\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.concat_ws(sep, _to_seq(sc, cols, _to_java_column)))", "response": "Concatenates multiple input string columns together into a single string column using the given separator."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef decode(col, charset):\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.decode(_to_java_column(col), charset))", "response": "Decodes a binary column into a string using the provided character set."} {"instruction": "Can you write a function in Python 3 where it\nformats the number X to a format like '# -. -..", "response": "def format_number(col, d):\n \"\"\"\n Formats the number X to a format like '#,--#,--#.--', rounded to d decimal places\n with HALF_EVEN round mode, and returns the result as a string.\n\n :param col: the column name of the numeric value to be formatted\n :param d: the N decimal places\n\n >>> spark.createDataFrame([(5,)], ['a']).select(format_number('a', 4).alias('v')).collect()\n [Row(v=u'5.0000')]\n \"\"\"\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.format_number(_to_java_column(col), d))"} {"instruction": "How would you implement a function in Python 3 that\nformats the arguments in printf - style and returns the result as a string column.", "response": "def format_string(format, *cols):\n \"\"\"\n Formats the arguments in printf-style and returns the result as a string column.\n\n :param col: the column name of the numeric value to be formatted\n :param d: the N decimal places\n\n >>> df = spark.createDataFrame([(5, \"hello\")], ['a', 'b'])\n >>> df.select(format_string('%d %s', df.a, df.b).alias('v')).collect()\n [Row(v=u'5 hello')]\n \"\"\"\n sc = 
SparkContext._active_spark_context\n return Column(sc._jvm.functions.format_string(format, _to_seq(sc, cols, _to_java_column)))"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef instr(str, substr):\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.instr(_to_java_column(str), substr))", "response": "Locate the position of the first occurrence of substr in the given string. Returns null if either of the arguments are null."} {"instruction": "Given the following Python 3 function, write the documentation\ndef substring_index(str, delim, count):\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.substring_index(_to_java_column(str), delim, count))", "response": "Returns the substring from string str before count occurrences of the delimiter delim."} {"instruction": "Write a Python 3 function for\ncomputing the Levenshtein distance of the two given strings.", "response": "def levenshtein(left, right):\n \"\"\"Computes the Levenshtein distance of the two given strings.\n\n >>> df0 = spark.createDataFrame([('kitten', 'sitting',)], ['l', 'r'])\n >>> df0.select(levenshtein('l', 'r').alias('d')).collect()\n [Row(d=3)]\n \"\"\"\n sc = SparkContext._active_spark_context\n jc = sc._jvm.functions.levenshtein(_to_java_column(left), _to_java_column(right))\n return Column(jc)"} {"instruction": "Can you write a function in Python 3 where it\nlocates the first occurrence of substr in a string column after position pos. Returns 0 if substr could not be found in str.", "response": "def locate(substr, str, pos=1):\n \"\"\"\n Locate the position of the first occurrence of substr in a string column, after position pos.\n\n .. note:: The position is not zero based, but 1 based index. 
Returns 0 if substr\n could not be found in str.\n\n :param substr: a string\n :param str: a Column of :class:`pyspark.sql.types.StringType`\n :param pos: start position (zero based)\n\n >>> df = spark.createDataFrame([('abcd',)], ['s',])\n >>> df.select(locate('b', df.s, 1).alias('s')).collect()\n [Row(s=2)]\n \"\"\"\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.locate(substr, _to_java_column(str), pos))"} {"instruction": "Create a Python 3 function for\nleaving - pad the string column to width len with pad.", "response": "def lpad(col, len, pad):\n \"\"\"\n Left-pad the string column to width `len` with `pad`.\n\n >>> df = spark.createDataFrame([('abcd',)], ['s',])\n >>> df.select(lpad(df.s, 6, '#').alias('s')).collect()\n [Row(s=u'##abcd')]\n \"\"\"\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.lpad(_to_java_column(col), len, pad))"} {"instruction": "Create a Python 3 function for\nrepeating a string column n times and returns it as a new string column.", "response": "def repeat(col, n):\n \"\"\"\n Repeats a string column n times, and returns it as a new string column.\n\n >>> df = spark.createDataFrame([('ab',)], ['s',])\n >>> df.select(repeat(df.s, 3).alias('s')).collect()\n [Row(s=u'ababab')]\n \"\"\"\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.repeat(_to_java_column(col), n))"} {"instruction": "Given the following Python 3 function, write the documentation\ndef split(str, pattern, limit=-1):\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.split(_to_java_column(str), pattern, limit))", "response": "Splits a string into a sequence of words with the given pattern."} {"instruction": "How would you implement a function in Python 3 that\nreplaces all substrings of the specified string that match regexp with replacement.", "response": "def regexp_replace(str, pattern, replacement):\n r\"\"\"Replace all substrings of the specified string value that 
match regexp with rep.\n\n >>> df = spark.createDataFrame([('100-200',)], ['str'])\n >>> df.select(regexp_replace('str', r'(\\d+)', '--').alias('d')).collect()\n [Row(d=u'-----')]\n \"\"\"\n sc = SparkContext._active_spark_context\n jc = sc._jvm.functions.regexp_replace(_to_java_column(str), pattern, replacement)\n return Column(jc)"} {"instruction": "How would you explain what the following Python 3 function does\ndef translate(srcCol, matching, replace):\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.translate(_to_java_column(srcCol), matching, replace))", "response": "A function translate any character in srcCol by a character in matching."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef arrays_overlap(a1, a2):\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.arrays_overlap(_to_java_column(a1), _to_java_column(a2)))", "response": "Returns true if the two arrays contain any common non - null element ; if not returns false otherwise."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef slice(x, start, length):\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.slice(_to_java_column(x), start, length))", "response": "Collection function that returns all the elements in x from index start and length."} {"instruction": "How would you explain what the following Python 3 function does\ndef array_join(col, delimiter, null_replacement=None):\n sc = SparkContext._active_spark_context\n if null_replacement is None:\n return Column(sc._jvm.functions.array_join(_to_java_column(col), delimiter))\n else:\n return Column(sc._jvm.functions.array_join(\n _to_java_column(col), delimiter, null_replacement))", "response": "Concatenates the elements of column using the delimiter. 
Null values are replaced with\n null_replacement."} {"instruction": "Create a Python 3 function to\nreturn a new column that consists of multiple input columns together into a single column.", "response": "def concat(*cols):\n \"\"\"\n Concatenates multiple input columns together into a single column.\n The function works with strings, binary and compatible array columns.\n\n >>> df = spark.createDataFrame([('abcd','123')], ['s', 'd'])\n >>> df.select(concat(df.s, df.d).alias('s')).collect()\n [Row(s=u'abcd123')]\n\n >>> df = spark.createDataFrame([([1, 2], [3, 4], [5]), ([1, 2], None, [3])], ['a', 'b', 'c'])\n >>> df.select(concat(df.a, df.b, df.c).alias(\"arr\")).collect()\n [Row(arr=[1, 2, 3, 4, 5]), Row(arr=None)]\n \"\"\"\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.concat(_to_seq(sc, cols, _to_java_column)))"} {"instruction": "Write a Python 3 script for\nreturning the position of the first occurrence of the given value in the given column in the given array.", "response": "def array_position(col, value):\n \"\"\"\n Collection function: Locates the position of the first occurrence of the given value\n in the given array. Returns null if either of the arguments are null.\n\n .. note:: The position is not zero based, but 1 based index. 
Returns 0 if the given\n value could not be found in the array.\n\n >>> df = spark.createDataFrame([([\"c\", \"b\", \"a\"],), ([],)], ['data'])\n >>> df.select(array_position(df.data, \"a\")).collect()\n [Row(array_position(data, a)=3), Row(array_position(data, a)=0)]\n \"\"\"\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.array_position(_to_java_column(col), value))"} {"instruction": "Make a summary of the following Python 3 code\ndef element_at(col, extraction):\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.element_at(_to_java_column(col), extraction))", "response": "Collection function that returns element of array at given index in extraction if col is array or map."} {"instruction": "How would you explain what the following Python 3 function does\ndef array_remove(col, element):\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.array_remove(_to_java_column(col), element))", "response": "Collection function that removes all elements that equal to element from the given column."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef explode(col):\n sc = SparkContext._active_spark_context\n jc = sc._jvm.functions.explode(_to_java_column(col))\n return Column(jc)", "response": "Returns a new row for each element in the given array or map."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef get_json_object(col, path):\n sc = SparkContext._active_spark_context\n jc = sc._jvm.functions.get_json_object(_to_java_column(col), path)\n return Column(jc)", "response": "Extracts json object from a json string based on json path specified and returns json string of the extracted json object."} {"instruction": "Implement a Python 3 function for\ncreating a new row for a json column according to the given field names.", "response": "def json_tuple(col, *fields):\n \"\"\"Creates a new row for a json column according to the 
given field names.\n\n :param col: string column in json format\n :param fields: list of fields to extract\n\n >>> data = [(\"1\", '''{\"f1\": \"value1\", \"f2\": \"value2\"}'''), (\"2\", '''{\"f1\": \"value12\"}''')]\n >>> df = spark.createDataFrame(data, (\"key\", \"jstring\"))\n >>> df.select(df.key, json_tuple(df.jstring, 'f1', 'f2')).collect()\n [Row(key=u'1', c0=u'value1', c1=u'value2'), Row(key=u'2', c0=u'value12', c1=None)]\n \"\"\"\n sc = SparkContext._active_spark_context\n jc = sc._jvm.functions.json_tuple(_to_java_column(col), _to_seq(sc, fields))\n return Column(jc)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef schema_of_json(json, options={}):\n if isinstance(json, basestring):\n col = _create_column_from_literal(json)\n elif isinstance(json, Column):\n col = _to_java_column(json)\n else:\n raise TypeError(\"schema argument should be a column or string\")\n\n sc = SparkContext._active_spark_context\n jc = sc._jvm.functions.schema_of_json(col, options)\n return Column(jc)", "response": "Parses a JSON string and infers its schema in DDL format."} {"instruction": "Can you create a Python 3 function that\nparses a CSV string and infers its schema in DDL format.", "response": "def schema_of_csv(csv, options={}):\n \"\"\"\n Parses a CSV string and infers its schema in DDL format.\n\n :param col: a CSV string or a string literal containing a CSV string.\n :param options: options to control parsing. 
accepts the same options as the CSV datasource\n\n >>> df = spark.range(1)\n >>> df.select(schema_of_csv(lit('1|a'), {'sep':'|'}).alias(\"csv\")).collect()\n [Row(csv=u'struct<_c0:int,_c1:string>')]\n >>> df.select(schema_of_csv('1|a', {'sep':'|'}).alias(\"csv\")).collect()\n [Row(csv=u'struct<_c0:int,_c1:string>')]\n \"\"\"\n if isinstance(csv, basestring):\n col = _create_column_from_literal(csv)\n elif isinstance(csv, Column):\n col = _to_java_column(csv)\n else:\n raise TypeError(\"schema argument should be a column or string\")\n\n sc = SparkContext._active_spark_context\n jc = sc._jvm.functions.schema_of_csv(col, options)\n return Column(jc)"} {"instruction": "Can you create a Python 3 function that\nconverts a column containing a StructType into a CSV string.", "response": "def to_csv(col, options={}):\n \"\"\"\n Converts a column containing a :class:`StructType` into a CSV string.\n Throws an exception, in the case of an unsupported type.\n\n :param col: name of column containing a struct.\n :param options: options to control converting. 
accepts the same options as the CSV datasource.\n\n >>> from pyspark.sql import Row\n >>> data = [(1, Row(name='Alice', age=2))]\n >>> df = spark.createDataFrame(data, (\"key\", \"value\"))\n >>> df.select(to_csv(df.value).alias(\"csv\")).collect()\n [Row(csv=u'2,Alice')]\n \"\"\"\n\n sc = SparkContext._active_spark_context\n jc = sc._jvm.functions.to_csv(_to_java_column(col), options)\n return Column(jc)"} {"instruction": "Explain what the following Python 3 code does\ndef size(col):\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.size(_to_java_column(col)))", "response": "Returns the length of the array or map stored in the column."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef sort_array(col, asc=True):\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.sort_array(_to_java_column(col), asc))", "response": "Collection function that sorts the input array in ascending or descending order according to natural ordering of the array elements."} {"instruction": "Can you implement a function in Python 3 that\nreturns an array containing a column repeated count times.", "response": "def array_repeat(col, count):\n \"\"\"\n Collection function: creates an array containing a column repeated count times.\n\n >>> df = spark.createDataFrame([('ab',)], ['data'])\n >>> df.select(array_repeat(df.data, 3).alias('r')).collect()\n [Row(r=[u'ab', u'ab', u'ab'])]\n \"\"\"\n sc = SparkContext._active_spark_context\n return Column(sc._jvm.functions.array_repeat(_to_java_column(col), count))"} {"instruction": "Can you create a Python 3 function that\nreturns the union of all the given maps.", "response": "def map_concat(*cols):\n \"\"\"Returns the union of all the given maps.\n\n :param cols: list of column names (string) or list of :class:`Column` expressions\n\n >>> from pyspark.sql.functions import map_concat\n >>> df = spark.sql(\"SELECT map(1, 'a', 2, 'b') as map1, map(3, 'c', 1, 
'd') as map2\")\n >>> df.select(map_concat(\"map1\", \"map2\").alias(\"map3\")).show(truncate=False)\n +------------------------+\n |map3 |\n +------------------------+\n |[1 -> d, 2 -> b, 3 -> c]|\n +------------------------+\n \"\"\"\n sc = SparkContext._active_spark_context\n if len(cols) == 1 and isinstance(cols[0], (list, set)):\n cols = cols[0]\n jc = sc._jvm.functions.map_concat(_to_seq(sc, cols, _to_java_column))\n return Column(jc)"} {"instruction": "Explain what the following Python 3 code does\ndef sequence(start, stop, step=None):\n sc = SparkContext._active_spark_context\n if step is None:\n return Column(sc._jvm.functions.sequence(_to_java_column(start), _to_java_column(stop)))\n else:\n return Column(sc._jvm.functions.sequence(\n _to_java_column(start), _to_java_column(stop), _to_java_column(step)))", "response": "Generates a sequence of integers from start to stop incrementing by step."} {"instruction": "Given the following Python 3 function, write the documentation\ndef from_csv(col, schema, options={}):\n\n sc = SparkContext._active_spark_context\n if isinstance(schema, basestring):\n schema = _create_column_from_literal(schema)\n elif isinstance(schema, Column):\n schema = _to_java_column(schema)\n else:\n raise TypeError(\"schema argument should be a column or string\")\n\n jc = sc._jvm.functions.from_csv(_to_java_column(col), schema, options)\n return Column(jc)", "response": "Parses a column containing a CSV string to a row with the specified schema."} {"instruction": "Make a summary of the following Python 3 code\ndef udf(f=None, returnType=StringType()):\n\n # The following table shows most of Python data and SQL type conversions in normal UDFs that\n # are not yet visible to the user. Some of behaviors are buggy and might be changed in the near\n # future. 
The table might have to be eventually documented externally.\n # Please see SPARK-25666's PR to see the codes in order to generate the table below.\n #\n # +-----------------------------+--------------+----------+------+-------+---------------+---------------+--------------------+-----------------------------+----------+----------------------+---------+--------------------+-----------------+------------+--------------+------------------+----------------------+ # noqa\n # |SQL Type \\ Python Value(Type)|None(NoneType)|True(bool)|1(int)|1(long)| a(str)| a(unicode)| 1970-01-01(date)|1970-01-01 00:00:00(datetime)|1.0(float)|array('i', [1])(array)|[1](list)| (1,)(tuple)| ABC(bytearray)| 1(Decimal)|{'a': 1}(dict)|Row(kwargs=1)(Row)|Row(namedtuple=1)(Row)| # noqa\n # +-----------------------------+--------------+----------+------+-------+---------------+---------------+--------------------+-----------------------------+----------+----------------------+---------+--------------------+-----------------+------------+--------------+------------------+----------------------+ # noqa\n # | boolean| None| True| None| None| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa\n # | tinyint| None| None| 1| 1| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa\n # | smallint| None| None| 1| 1| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa\n # | int| None| None| 1| 1| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa\n # | bigint| None| None| 1| 1| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa\n # | string| None| u'true'| u'1'| u'1'| u'a'| u'a'|u'java.util.Grego...| u'java.util.Grego...| u'1.0'| u'[I@24a83055'| u'[1]'|u'[Ljava.lang.Obj...| u'[B@49093632'| u'1'| u'{a=1}'| X| X| # noqa\n # | date| None| X| X| X| X| X|datetime.date(197...| datetime.date(197...| X| X| X| X| X| X| X| X| X| # noqa\n # | timestamp| None| X| X| X| X| X| X| 
datetime.datetime...| X| X| X| X| X| X| X| X| X| # noqa\n # | float| None| None| None| None| None| None| None| None| 1.0| None| None| None| None| None| None| X| X| # noqa\n # | double| None| None| None| None| None| None| None| None| 1.0| None| None| None| None| None| None| X| X| # noqa\n # | array| None| None| None| None| None| None| None| None| None| [1]| [1]| [1]| [65, 66, 67]| None| None| X| X| # noqa\n # | binary| None| None| None| None|bytearray(b'a')|bytearray(b'a')| None| None| None| None| None| None|bytearray(b'ABC')| None| None| X| X| # noqa\n # | decimal(10,0)| None| None| None| None| None| None| None| None| None| None| None| None| None|Decimal('1')| None| X| X| # noqa\n # | map| None| None| None| None| None| None| None| None| None| None| None| None| None| None| {u'a': 1}| X| X| # noqa\n # | struct<_1:int>| None| X| X| X| X| X| X| X| X| X|Row(_1=1)| Row(_1=1)| X| X| Row(_1=None)| Row(_1=1)| Row(_1=1)| # noqa\n # +-----------------------------+--------------+----------+------+-------+---------------+---------------+--------------------+-----------------------------+----------+----------------------+---------+--------------------+-----------------+------------+--------------+------------------+----------------------+ # noqa\n #\n # Note: DDL formatted string is used for 'SQL Type' for simplicity. 
This string can be\n # used in `returnType`.\n # Note: The values inside of the table are generated by `repr`.\n # Note: Python 2 is used to generate this table since it is used to check the backward\n # compatibility often in practice.\n # Note: 'X' means it throws an exception during the conversion.\n\n # decorator @udf, @udf(), @udf(dataType())\n if f is None or isinstance(f, (str, DataType)):\n # If DataType has been passed as a positional argument\n # for decorator use it as a returnType\n return_type = f or returnType\n return functools.partial(_create_udf, returnType=return_type,\n evalType=PythonEvalType.SQL_BATCHED_UDF)\n else:\n return _create_udf(f=f, returnType=returnType,\n evalType=PythonEvalType.SQL_BATCHED_UDF)", "response": "Creates a user - defined function that returns a random number of words."} {"instruction": "Create a Python 3 function to\ncreate a vectorized user defined function (UDF). :param f: user-defined function. A python function if used as a standalone function :param returnType: the return type of the user-defined function. The value can be either a :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string. :param functionType: an enum value in :class:`pyspark.sql.functions.PandasUDFType`. Default: SCALAR. .. note:: Experimental The function type of the UDF can be one of the following: 1. SCALAR A scalar UDF defines a transformation: One or more `pandas.Series` -> A `pandas.Series`. The length of the returned `pandas.Series` must be of the same as the input `pandas.Series`. If the return type is :class:`StructType`, the returned value should be a `pandas.DataFrame`. :class:`MapType`, nested :class:`StructType` are currently not supported as output types. Scalar UDFs are used with :meth:`pyspark.sql.DataFrame.withColumn` and :meth:`pyspark.sql.DataFrame.select`. 
>>> from pyspark.sql.functions import pandas_udf, PandasUDFType >>> from pyspark.sql.types import IntegerType, StringType >>> slen = pandas_udf(lambda s: s.str.len(), IntegerType()) # doctest: +SKIP >>> @pandas_udf(StringType()) # doctest: +SKIP ... def to_upper(s): ... return s.str.upper() ... >>> @pandas_udf(\"integer\", PandasUDFType.SCALAR) # doctest: +SKIP ... def add_one(x): ... return x + 1 ... >>> df = spark.createDataFrame([(1, \"John Doe\", 21)], ... (\"id\", \"name\", \"age\")) # doctest: +SKIP >>> df.select(slen(\"name\").alias(\"slen(name)\"), to_upper(\"name\"), add_one(\"age\")) \\\\ ... .show() # doctest: +SKIP +----------+--------------+------------+ |slen(name)|to_upper(name)|add_one(age)| +----------+--------------+------------+ | 8| JOHN DOE| 22| +----------+--------------+------------+ >>> @pandas_udf(\"first string, last string\") # doctest: +SKIP ... def split_expand(n): ... return n.str.split(expand=True) >>> df.select(split_expand(\"name\")).show() # doctest: +SKIP +------------------+ |split_expand(name)| +------------------+ | [John, Doe]| +------------------+ .. note:: The length of `pandas.Series` within a scalar UDF is not that of the whole input column, but is the length of an internal batch used for each call to the function. Therefore, this can be used, for example, to ensure the length of each returned `pandas.Series`, and can not be used as the column length. 2. GROUPED_MAP A grouped map UDF defines transformation: A `pandas.DataFrame` -> A `pandas.DataFrame` The returnType should be a :class:`StructType` describing the schema of the returned `pandas.DataFrame`. The column labels of the returned `pandas.DataFrame` must either match the field names in the defined returnType schema if specified as strings, or match the field data types by position if not strings, e.g. integer indices. The length of the returned `pandas.DataFrame` can be arbitrary. Grouped map UDFs are used with :meth:`pyspark.sql.GroupedData.apply`. 
>>> from pyspark.sql.functions import pandas_udf, PandasUDFType >>> df = spark.createDataFrame( ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ... (\"id\", \"v\")) # doctest: +SKIP >>> @pandas_udf(\"id long, v double\", PandasUDFType.GROUPED_MAP) # doctest: +SKIP ... def normalize(pdf): ... v = pdf.v ... return pdf.assign(v=(v - v.mean()) / v.std()) >>> df.groupby(\"id\").apply(normalize).show() # doctest: +SKIP +---+-------------------+ | id| v| +---+-------------------+ | 1|-0.7071067811865475| | 1| 0.7071067811865475| | 2|-0.8320502943378437| | 2|-0.2773500981126146| | 2| 1.1094003924504583| +---+-------------------+ Alternatively, the user can define a function that takes two arguments. In this case, the grouping key(s) will be passed as the first argument and the data will be passed as the second argument. The grouping key(s) will be passed as a tuple of numpy data types, e.g., `numpy.int32` and `numpy.float64`. The data will still be passed in as a `pandas.DataFrame` containing all columns from the original Spark DataFrame. This is useful when the user does not want to hardcode grouping key(s) in the function. >>> import pandas as pd # doctest: +SKIP >>> from pyspark.sql.functions import pandas_udf, PandasUDFType >>> df = spark.createDataFrame( ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ... (\"id\", \"v\")) # doctest: +SKIP >>> @pandas_udf(\"id long, v double\", PandasUDFType.GROUPED_MAP) # doctest: +SKIP ... def mean_udf(key, pdf): ... # key is a tuple of one numpy.int64, which is the value ... # of 'id' for the current group ... return pd.DataFrame([key + (pdf.v.mean(),)]) >>> df.groupby('id').apply(mean_udf).show() # doctest: +SKIP +---+---+ | id| v| +---+---+ | 1|1.5| | 2|6.0| +---+---+ >>> @pandas_udf( ... \"id long, `ceil(v / 2)` long, v double\", ... PandasUDFType.GROUPED_MAP) # doctest: +SKIP >>> def sum_udf(key, pdf): ... # key is a tuple of two numpy.int64s, which is the values ... 
# of 'id' and 'ceil(df.v / 2)' for the current group ... return pd.DataFrame([key + (pdf.v.sum(),)]) >>> df.groupby(df.id, ceil(df.v / 2)).apply(sum_udf).show() # doctest: +SKIP +---+-----------+----+ | id|ceil(v / 2)| v| +---+-----------+----+ | 2| 5|10.0| | 1| 1| 3.0| | 2| 3| 5.0| | 2| 2| 3.0| +---+-----------+----+ .. note:: If returning a new `pandas.DataFrame` constructed with a dictionary, it is recommended to explicitly index the columns by name to ensure the positions are correct, or alternatively use an `OrderedDict`. For example, `pd.DataFrame({'id': ids, 'a': data}, columns=['id', 'a'])` or `pd.DataFrame(OrderedDict([('id', ids), ('a', data)]))`. .. seealso:: :meth:`pyspark.sql.GroupedData.apply` 3. GROUPED_AGG A grouped aggregate UDF defines a transformation: One or more `pandas.Series` -> A scalar The `returnType` should be a primitive data type, e.g., :class:`DoubleType`. The returned scalar can be either a python primitive type, e.g., `int` or `float` or a numpy data type, e.g., `numpy.int64` or `numpy.float64`. :class:`MapType` and :class:`StructType` are currently not supported as output types. Group aggregate UDFs are used with :meth:`pyspark.sql.GroupedData.agg` and :class:`pyspark.sql.Window` This example shows using grouped aggregated UDFs with groupby: >>> from pyspark.sql.functions import pandas_udf, PandasUDFType >>> df = spark.createDataFrame( ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ... (\"id\", \"v\")) >>> @pandas_udf(\"double\", PandasUDFType.GROUPED_AGG) # doctest: +SKIP ... def mean_udf(v): ... return v.mean() >>> df.groupby(\"id\").agg(mean_udf(df['v'])).show() # doctest: +SKIP +---+-----------+ | id|mean_udf(v)| +---+-----------+ | 1| 1.5| | 2| 6.0| +---+-----------+ This example shows using grouped aggregated UDFs as window functions. >>> from pyspark.sql.functions import pandas_udf, PandasUDFType >>> from pyspark.sql import Window >>> df = spark.createDataFrame( ... 
[(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ... (\"id\", \"v\")) >>> @pandas_udf(\"double\", PandasUDFType.GROUPED_AGG) # doctest: +SKIP ... def mean_udf(v): ... return v.mean() >>> w = (Window.partitionBy('id') ... .orderBy('v') ... .rowsBetween(-1, 0)) >>> df.withColumn('mean_v', mean_udf(df['v']).over(w)).show() # doctest: +SKIP +---+----+------+ | id| v|mean_v| +---+----+------+ | 1| 1.0| 1.0| | 1| 2.0| 1.5| | 2| 3.0| 3.0| | 2| 5.0| 4.0| | 2|10.0| 7.5| +---+----+------+ .. note:: For performance reasons, the input series to window functions are not copied. Therefore, mutating the input series is not allowed and will cause incorrect results. For the same reason, users should also not rely on the index of the input series. .. seealso:: :meth:`pyspark.sql.GroupedData.agg` and :class:`pyspark.sql.Window` .. note:: The user-defined functions are considered deterministic by default. Due to optimization, duplicate invocations may be eliminated or the function may even be invoked more times than it is present in the query. If your function is not deterministic, call `asNondeterministic` on the user defined function. E.g.: >>> @pandas_udf('double', PandasUDFType.SCALAR) # doctest: +SKIP ... def random(v): ... import numpy as np ... import pandas as pd ... return pd.Series(np.random.randn(len(v)) >>> random = random.asNondeterministic() # doctest: +SKIP .. note:: The user-defined functions do not support conditional expressions or short circuiting in boolean expressions and it ends up with being executed all internally. If the functions can fail on special rows, the workaround is to incorporate the condition into the functions. .. note:: The user-defined functions do not take keyword arguments on the calling side. .. note:: The data type of returned `pandas.Series` from the user-defined functions should be matched with defined returnType (see :meth:`types.to_arrow_type` and :meth:`types.from_arrow_type`). 
When there is mismatch between them, Spark might do conversion on returned data. The conversion is not guaranteed to be correct and results should be checked for accuracy by users.", "response": "def pandas_udf(f=None, returnType=None, functionType=None):\n \"\"\"\n Creates a vectorized user defined function (UDF).\n\n :param f: user-defined function. A python function if used as a standalone function\n :param returnType: the return type of the user-defined function. The value can be either a\n :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.\n :param functionType: an enum value in :class:`pyspark.sql.functions.PandasUDFType`.\n Default: SCALAR.\n\n .. note:: Experimental\n\n The function type of the UDF can be one of the following:\n\n 1. SCALAR\n\n A scalar UDF defines a transformation: One or more `pandas.Series` -> A `pandas.Series`.\n The length of the returned `pandas.Series` must be of the same as the input `pandas.Series`.\n If the return type is :class:`StructType`, the returned value should be a `pandas.DataFrame`.\n\n :class:`MapType`, nested :class:`StructType` are currently not supported as output types.\n\n Scalar UDFs are used with :meth:`pyspark.sql.DataFrame.withColumn` and\n :meth:`pyspark.sql.DataFrame.select`.\n\n >>> from pyspark.sql.functions import pandas_udf, PandasUDFType\n >>> from pyspark.sql.types import IntegerType, StringType\n >>> slen = pandas_udf(lambda s: s.str.len(), IntegerType()) # doctest: +SKIP\n >>> @pandas_udf(StringType()) # doctest: +SKIP\n ... def to_upper(s):\n ... return s.str.upper()\n ...\n >>> @pandas_udf(\"integer\", PandasUDFType.SCALAR) # doctest: +SKIP\n ... def add_one(x):\n ... return x + 1\n ...\n >>> df = spark.createDataFrame([(1, \"John Doe\", 21)],\n ... (\"id\", \"name\", \"age\")) # doctest: +SKIP\n >>> df.select(slen(\"name\").alias(\"slen(name)\"), to_upper(\"name\"), add_one(\"age\")) \\\\\n ... 
.show() # doctest: +SKIP\n +----------+--------------+------------+\n |slen(name)|to_upper(name)|add_one(age)|\n +----------+--------------+------------+\n | 8| JOHN DOE| 22|\n +----------+--------------+------------+\n >>> @pandas_udf(\"first string, last string\") # doctest: +SKIP\n ... def split_expand(n):\n ... return n.str.split(expand=True)\n >>> df.select(split_expand(\"name\")).show() # doctest: +SKIP\n +------------------+\n |split_expand(name)|\n +------------------+\n | [John, Doe]|\n +------------------+\n\n .. note:: The length of `pandas.Series` within a scalar UDF is not that of the whole input\n column, but is the length of an internal batch used for each call to the function.\n Therefore, this can be used, for example, to ensure the length of each returned\n `pandas.Series`, and can not be used as the column length.\n\n 2. GROUPED_MAP\n\n A grouped map UDF defines transformation: A `pandas.DataFrame` -> A `pandas.DataFrame`\n The returnType should be a :class:`StructType` describing the schema of the returned\n `pandas.DataFrame`. The column labels of the returned `pandas.DataFrame` must either match\n the field names in the defined returnType schema if specified as strings, or match the\n field data types by position if not strings, e.g. integer indices.\n The length of the returned `pandas.DataFrame` can be arbitrary.\n\n Grouped map UDFs are used with :meth:`pyspark.sql.GroupedData.apply`.\n\n >>> from pyspark.sql.functions import pandas_udf, PandasUDFType\n >>> df = spark.createDataFrame(\n ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],\n ... (\"id\", \"v\")) # doctest: +SKIP\n >>> @pandas_udf(\"id long, v double\", PandasUDFType.GROUPED_MAP) # doctest: +SKIP\n ... def normalize(pdf):\n ... v = pdf.v\n ... 
return pdf.assign(v=(v - v.mean()) / v.std())\n >>> df.groupby(\"id\").apply(normalize).show() # doctest: +SKIP\n +---+-------------------+\n | id| v|\n +---+-------------------+\n | 1|-0.7071067811865475|\n | 1| 0.7071067811865475|\n | 2|-0.8320502943378437|\n | 2|-0.2773500981126146|\n | 2| 1.1094003924504583|\n +---+-------------------+\n\n Alternatively, the user can define a function that takes two arguments.\n In this case, the grouping key(s) will be passed as the first argument and the data will\n be passed as the second argument. The grouping key(s) will be passed as a tuple of numpy\n data types, e.g., `numpy.int32` and `numpy.float64`. The data will still be passed in\n as a `pandas.DataFrame` containing all columns from the original Spark DataFrame.\n This is useful when the user does not want to hardcode grouping key(s) in the function.\n\n >>> import pandas as pd # doctest: +SKIP\n >>> from pyspark.sql.functions import pandas_udf, PandasUDFType\n >>> df = spark.createDataFrame(\n ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],\n ... (\"id\", \"v\")) # doctest: +SKIP\n >>> @pandas_udf(\"id long, v double\", PandasUDFType.GROUPED_MAP) # doctest: +SKIP\n ... def mean_udf(key, pdf):\n ... # key is a tuple of one numpy.int64, which is the value\n ... # of 'id' for the current group\n ... return pd.DataFrame([key + (pdf.v.mean(),)])\n >>> df.groupby('id').apply(mean_udf).show() # doctest: +SKIP\n +---+---+\n | id| v|\n +---+---+\n | 1|1.5|\n | 2|6.0|\n +---+---+\n >>> @pandas_udf(\n ... \"id long, `ceil(v / 2)` long, v double\",\n ... PandasUDFType.GROUPED_MAP) # doctest: +SKIP\n >>> def sum_udf(key, pdf):\n ... # key is a tuple of two numpy.int64s, which is the values\n ... # of 'id' and 'ceil(df.v / 2)' for the current group\n ... 
return pd.DataFrame([key + (pdf.v.sum(),)])\n >>> df.groupby(df.id, ceil(df.v / 2)).apply(sum_udf).show() # doctest: +SKIP\n +---+-----------+----+\n | id|ceil(v / 2)| v|\n +---+-----------+----+\n | 2| 5|10.0|\n | 1| 1| 3.0|\n | 2| 3| 5.0|\n | 2| 2| 3.0|\n +---+-----------+----+\n\n .. note:: If returning a new `pandas.DataFrame` constructed with a dictionary, it is\n recommended to explicitly index the columns by name to ensure the positions are correct,\n or alternatively use an `OrderedDict`.\n For example, `pd.DataFrame({'id': ids, 'a': data}, columns=['id', 'a'])` or\n `pd.DataFrame(OrderedDict([('id', ids), ('a', data)]))`.\n\n .. seealso:: :meth:`pyspark.sql.GroupedData.apply`\n\n 3. GROUPED_AGG\n\n A grouped aggregate UDF defines a transformation: One or more `pandas.Series` -> A scalar\n The `returnType` should be a primitive data type, e.g., :class:`DoubleType`.\n The returned scalar can be either a python primitive type, e.g., `int` or `float`\n or a numpy data type, e.g., `numpy.int64` or `numpy.float64`.\n\n :class:`MapType` and :class:`StructType` are currently not supported as output types.\n\n Group aggregate UDFs are used with :meth:`pyspark.sql.GroupedData.agg` and\n :class:`pyspark.sql.Window`\n\n This example shows using grouped aggregated UDFs with groupby:\n\n >>> from pyspark.sql.functions import pandas_udf, PandasUDFType\n >>> df = spark.createDataFrame(\n ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],\n ... (\"id\", \"v\"))\n >>> @pandas_udf(\"double\", PandasUDFType.GROUPED_AGG) # doctest: +SKIP\n ... def mean_udf(v):\n ... return v.mean()\n >>> df.groupby(\"id\").agg(mean_udf(df['v'])).show() # doctest: +SKIP\n +---+-----------+\n | id|mean_udf(v)|\n +---+-----------+\n | 1| 1.5|\n | 2| 6.0|\n +---+-----------+\n\n This example shows using grouped aggregated UDFs as window functions.\n\n >>> from pyspark.sql.functions import pandas_udf, PandasUDFType\n >>> from pyspark.sql import Window\n >>> df = spark.createDataFrame(\n ... 
[(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],\n ... (\"id\", \"v\"))\n >>> @pandas_udf(\"double\", PandasUDFType.GROUPED_AGG) # doctest: +SKIP\n ... def mean_udf(v):\n ... return v.mean()\n >>> w = (Window.partitionBy('id')\n ... .orderBy('v')\n ... .rowsBetween(-1, 0))\n >>> df.withColumn('mean_v', mean_udf(df['v']).over(w)).show() # doctest: +SKIP\n +---+----+------+\n | id| v|mean_v|\n +---+----+------+\n | 1| 1.0| 1.0|\n | 1| 2.0| 1.5|\n | 2| 3.0| 3.0|\n | 2| 5.0| 4.0|\n | 2|10.0| 7.5|\n +---+----+------+\n\n .. note:: For performance reasons, the input series to window functions are not copied.\n Therefore, mutating the input series is not allowed and will cause incorrect results.\n For the same reason, users should also not rely on the index of the input series.\n\n .. seealso:: :meth:`pyspark.sql.GroupedData.agg` and :class:`pyspark.sql.Window`\n\n .. note:: The user-defined functions are considered deterministic by default. Due to\n optimization, duplicate invocations may be eliminated or the function may even be invoked\n more times than it is present in the query. If your function is not deterministic, call\n `asNondeterministic` on the user defined function. E.g.:\n\n >>> @pandas_udf('double', PandasUDFType.SCALAR) # doctest: +SKIP\n ... def random(v):\n ... import numpy as np\n ... import pandas as pd\n ... return pd.Series(np.random.randn(len(v))\n >>> random = random.asNondeterministic() # doctest: +SKIP\n\n .. note:: The user-defined functions do not support conditional expressions or short circuiting\n in boolean expressions and it ends up with being executed all internally. If the functions\n can fail on special rows, the workaround is to incorporate the condition into the functions.\n\n .. note:: The user-defined functions do not take keyword arguments on the calling side.\n\n .. 
note:: The data type of returned `pandas.Series` from the user-defined functions should be\n matched with defined returnType (see :meth:`types.to_arrow_type` and\n :meth:`types.from_arrow_type`). When there is mismatch between them, Spark might do\n conversion on returned data. The conversion is not guaranteed to be correct and results\n should be checked for accuracy by users.\n \"\"\"\n\n # The following table shows most of Pandas data and SQL type conversions in Pandas UDFs that\n # are not yet visible to the user. Some of behaviors are buggy and might be changed in the near\n # future. The table might have to be eventually documented externally.\n # Please see SPARK-25798's PR to see the codes in order to generate the table below.\n #\n # +-----------------------------+----------------------+----------+-------+--------+--------------------+--------------------+--------+---------+---------+---------+------------+------------+------------+-----------------------------------+-----------------------------------------------------+-----------------+--------------------+-----------------------------+-------------+-----------------+------------------+-----------+--------------------------------+ # noqa\n # |SQL Type \\ Pandas Value(Type)|None(object(NoneType))|True(bool)|1(int8)|1(int16)| 1(int32)| 1(int64)|1(uint8)|1(uint16)|1(uint32)|1(uint64)|1.0(float16)|1.0(float32)|1.0(float64)|1970-01-01 00:00:00(datetime64[ns])|1970-01-01 00:00:00-05:00(datetime64[ns, US/Eastern])|a(object(string))| 1(object(Decimal))|[1 2 3](object(array[int32]))|1.0(float128)|(1+0j)(complex64)|(1+0j)(complex128)|A(category)|1 days 00:00:00(timedelta64[ns])| # noqa\n # 
+-----------------------------+----------------------+----------+-------+--------+--------------------+--------------------+--------+---------+---------+---------+------------+------------+------------+-----------------------------------+-----------------------------------------------------+-----------------+--------------------+-----------------------------+-------------+-----------------+------------------+-----------+--------------------------------+ # noqa\n # | boolean| None| True| True| True| True| True| True| True| True| True| False| False| False| False| False| X| X| X| False| False| False| X| False| # noqa\n # | tinyint| None| 1| 1| 1| 1| 1| X| X| X| X| 1| 1| 1| X| X| X| X| X| X| X| X| 0| X| # noqa\n # | smallint| None| 1| 1| 1| 1| 1| 1| X| X| X| 1| 1| 1| X| X| X| X| X| X| X| X| X| X| # noqa\n # | int| None| 1| 1| 1| 1| 1| 1| 1| X| X| 1| 1| 1| X| X| X| X| X| X| X| X| X| X| # noqa\n # | bigint| None| 1| 1| 1| 1| 1| 1| 1| 1| X| 1| 1| 1| 0| 18000000000000| X| X| X| X| X| X| X| X| # noqa\n # | float| None| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| X| X| X|1.401298464324817...| X| X| X| X| X| X| # noqa\n # | double| None| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| 1.0| X| X| X| X| X| X| X| X| X| X| # noqa\n # | date| None| X| X| X|datetime.date(197...| X| X| X| X| X| X| X| X| datetime.date(197...| X| X| X| X| X| X| X| X| X| # noqa\n # | timestamp| None| X| X| X| X|datetime.datetime...| X| X| X| X| X| X| X| datetime.datetime...| datetime.datetime...| X| X| X| X| X| X| X| X| # noqa\n # | string| None| u''|u'\\x01'| u'\\x01'| u'\\x01'| u'\\x01'| u'\\x01'| u'\\x01'| u'\\x01'| u'\\x01'| u''| u''| u''| X| X| u'a'| X| X| u''| u''| u''| X| X| # noqa\n # | decimal(10,0)| None| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| Decimal('1')| X| X| X| X| X| X| # noqa\n # | array| None| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| [1, 2, 3]| X| X| X| X| X| # noqa\n # | map| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| # 
noqa\n # | struct<_1:int>| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| # noqa\n # | binary| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| X| # noqa\n # +-----------------------------+----------------------+----------+-------+--------+--------------------+--------------------+--------+---------+---------+---------+------------+------------+------------+-----------------------------------+-----------------------------------------------------+-----------------+--------------------+-----------------------------+-------------+-----------------+------------------+-----------+--------------------------------+ # noqa\n #\n # Note: DDL formatted string is used for 'SQL Type' for simplicity. This string can be\n # used in `returnType`.\n # Note: The values inside of the table are generated by `repr`.\n # Note: Python 2 is used to generate this table since it is used to check the backward\n # compatibility often in practice.\n # Note: Pandas 0.19.2 and PyArrow 0.9.0 are used.\n # Note: Timezone is Singapore timezone.\n # Note: 'X' means it throws an exception during the conversion.\n # Note: 'binary' type is only supported with PyArrow 0.10.0+ (SPARK-23555).\n\n # decorator @pandas_udf(returnType, functionType)\n is_decorator = f is None or isinstance(f, (str, DataType))\n\n if is_decorator:\n # If DataType has been passed as a positional argument\n # for decorator use it as a returnType\n return_type = f or returnType\n\n if functionType is not None:\n # @pandas_udf(dataType, functionType=functionType)\n # @pandas_udf(returnType=dataType, functionType=functionType)\n eval_type = functionType\n elif returnType is not None and isinstance(returnType, int):\n # @pandas_udf(dataType, functionType)\n eval_type = returnType\n else:\n # @pandas_udf(dataType) or @pandas_udf(returnType=dataType)\n eval_type = PythonEvalType.SQL_SCALAR_PANDAS_UDF\n else:\n return_type = returnType\n\n if functionType is not None:\n eval_type = functionType\n 
else:\n eval_type = PythonEvalType.SQL_SCALAR_PANDAS_UDF\n\n if return_type is None:\n raise ValueError(\"Invalid returnType: returnType can not be None\")\n\n if eval_type not in [PythonEvalType.SQL_SCALAR_PANDAS_UDF,\n PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,\n PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF]:\n raise ValueError(\"Invalid functionType: \"\n \"functionType must be one the values from PandasUDFType\")\n\n if is_decorator:\n return functools.partial(_create_udf, returnType=return_type, evalType=eval_type)\n else:\n return _create_udf(f=f, returnType=return_type, evalType=eval_type)"} {"instruction": "Make a summary of the following Python 3 code\ndef to_str(value):\n if isinstance(value, bool):\n return str(value).lower()\n elif value is None:\n return value\n else:\n return str(value)", "response": "A wrapper over str that converts bool values to lower case strings."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _set_opts(self, schema=None, **options):\n if schema is not None:\n self.schema(schema)\n for k, v in options.items():\n if v is not None:\n self.option(k, v)", "response": "Set named options for the log entry."} {"instruction": "Explain what the following Python 3 code does\ndef format(self, source):\n self._jreader = self._jreader.format(source)\n return self", "response": "Specifies the input data source format."} {"instruction": "Can you write a function in Python 3 where it\nspecifies the input schema.", "response": "def schema(self, schema):\n \"\"\"Specifies the input schema.\n\n Some data sources (e.g. 
JSON) can infer the input schema automatically from data.\n By specifying the schema here, the underlying data source can skip the schema\n inference step, and thus speed up data loading.\n\n :param schema: a :class:`pyspark.sql.types.StructType` object or a DDL-formatted string\n (For example ``col0 INT, col1 DOUBLE``).\n\n >>> s = spark.read.schema(\"col0 INT, col1 DOUBLE\")\n \"\"\"\n from pyspark.sql import SparkSession\n spark = SparkSession.builder.getOrCreate()\n if isinstance(schema, StructType):\n jschema = spark._jsparkSession.parseDataType(schema.json())\n self._jreader = self._jreader.schema(jschema)\n elif isinstance(schema, basestring):\n self._jreader = self._jreader.schema(schema)\n else:\n raise TypeError(\"schema should be StructType or string\")\n return self"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef option(self, key, value):\n self._jreader = self._jreader.option(key, to_str(value))\n return self", "response": "Adds an option to the underlying data source."} {"instruction": "How would you code a function in Python 3 to\nadd input options for the underlying data source.", "response": "def options(self, **options):\n \"\"\"Adds input options for the underlying data source.\n\n You can set the following option(s) for reading files:\n * ``timeZone``: sets the string that indicates a timezone to be used to parse timestamps\n in the JSON/CSV datasources or partition values.\n If it isn't set, it uses the default value, session local timezone.\n \"\"\"\n for k in options:\n self._jreader = self._jreader.option(k, to_str(options[k]))\n return self"} {"instruction": "Make a summary of the following Python 3 code\ndef load(self, path=None, format=None, schema=None, **options):\n if format is not None:\n self.format(format)\n if schema is not None:\n self.schema(schema)\n self.options(**options)\n if isinstance(path, basestring):\n return self._df(self._jreader.load(path))\n elif path is not None:\n if type(path) != 
list:\n path = [path]\n return self._df(self._jreader.load(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n else:\n return self._df(self._jreader.load())", "response": "Loads data from a file - system backed data source and returns it as a : class : DataFrame."} {"instruction": "Can you write a function in Python 3 where it\nloads the JSON file and return the result as a DataFrame.", "response": "def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,\n allowComments=None, allowUnquotedFieldNames=None, allowSingleQuotes=None,\n allowNumericLeadingZero=None, allowBackslashEscapingAnyCharacter=None,\n mode=None, columnNameOfCorruptRecord=None, dateFormat=None, timestampFormat=None,\n multiLine=None, allowUnquotedControlChars=None, lineSep=None, samplingRatio=None,\n dropFieldIfAllNull=None, encoding=None, locale=None):\n \"\"\"\n Loads JSON files and returns the results as a :class:`DataFrame`.\n\n `JSON Lines `_ (newline-delimited JSON) is supported by default.\n For JSON (one record per file), set the ``multiLine`` parameter to ``true``.\n\n If the ``schema`` parameter is not specified, this function goes\n through the input once to determine the input schema.\n\n :param path: string represents path to the JSON dataset, or a list of paths,\n or RDD of Strings storing JSON objects.\n :param schema: an optional :class:`pyspark.sql.types.StructType` for the input schema or\n a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``).\n :param primitivesAsString: infers all primitive values as a string type. If None is set,\n it uses the default value, ``false``.\n :param prefersDecimal: infers all floating-point values as a decimal type. If the values\n do not fit in decimal, then it infers them as doubles. If None is\n set, it uses the default value, ``false``.\n :param allowComments: ignores Java/C++ style comment in JSON records. 
If None is set,\n it uses the default value, ``false``.\n :param allowUnquotedFieldNames: allows unquoted JSON field names. If None is set,\n it uses the default value, ``false``.\n :param allowSingleQuotes: allows single quotes in addition to double quotes. If None is\n set, it uses the default value, ``true``.\n :param allowNumericLeadingZero: allows leading zeros in numbers (e.g. 00012). If None is\n set, it uses the default value, ``false``.\n :param allowBackslashEscapingAnyCharacter: allows accepting quoting of all character\n using backslash quoting mechanism. If None is\n set, it uses the default value, ``false``.\n :param mode: allows a mode for dealing with corrupt records during parsing. If None is\n set, it uses the default value, ``PERMISSIVE``.\n\n * ``PERMISSIVE`` : when it meets a corrupted record, puts the malformed string \\\n into a field configured by ``columnNameOfCorruptRecord``, and sets malformed \\\n fields to ``null``. To keep corrupt records, an user can set a string type \\\n field named ``columnNameOfCorruptRecord`` in an user-defined schema. If a \\\n schema does not have the field, it drops corrupt records during parsing. \\\n When inferring a schema, it implicitly adds a ``columnNameOfCorruptRecord`` \\\n field in an output schema.\n * ``DROPMALFORMED`` : ignores the whole corrupted records.\n * ``FAILFAST`` : throws an exception when it meets corrupted records.\n\n :param columnNameOfCorruptRecord: allows renaming the new field having malformed string\n created by ``PERMISSIVE`` mode. This overrides\n ``spark.sql.columnNameOfCorruptRecord``. If None is set,\n it uses the value specified in\n ``spark.sql.columnNameOfCorruptRecord``.\n :param dateFormat: sets the string that indicates a date format. Custom date formats\n follow the formats at ``java.time.format.DateTimeFormatter``. This\n applies to date type. 
If None is set, it uses the\n default value, ``yyyy-MM-dd``.\n :param timestampFormat: sets the string that indicates a timestamp format.\n Custom date formats follow the formats at\n ``java.time.format.DateTimeFormatter``.\n This applies to timestamp type. If None is set, it uses the\n default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSXXX``.\n :param multiLine: parse one record, which may span multiple lines, per file. If None is\n set, it uses the default value, ``false``.\n :param allowUnquotedControlChars: allows JSON Strings to contain unquoted control\n characters (ASCII characters with value less than 32,\n including tab and line feed characters) or not.\n :param encoding: allows to forcibly set one of standard basic or extended encoding for\n the JSON files. For example UTF-16BE, UTF-32LE. If None is set,\n the encoding of input JSON will be detected automatically\n when the multiLine option is set to ``true``.\n :param lineSep: defines the line separator that should be used for parsing. If None is\n set, it covers all ``\\\\r``, ``\\\\r\\\\n`` and ``\\\\n``.\n :param samplingRatio: defines fraction of input JSON objects used for schema inferring.\n If None is set, it uses the default value, ``1.0``.\n :param dropFieldIfAllNull: whether to ignore column of all null values or empty\n array/struct during schema inference. If None is set, it\n uses the default value, ``false``.\n :param locale: sets a locale as language tag in IETF BCP 47 format. If None is set,\n it uses the default value, ``en-US``. 
For instance, ``locale`` is used while\n parsing dates and timestamps.\n\n >>> df1 = spark.read.json('python/test_support/sql/people.json')\n >>> df1.dtypes\n [('age', 'bigint'), ('name', 'string')]\n >>> rdd = sc.textFile('python/test_support/sql/people.json')\n >>> df2 = spark.read.json(rdd)\n >>> df2.dtypes\n [('age', 'bigint'), ('name', 'string')]\n\n \"\"\"\n self._set_opts(\n schema=schema, primitivesAsString=primitivesAsString, prefersDecimal=prefersDecimal,\n allowComments=allowComments, allowUnquotedFieldNames=allowUnquotedFieldNames,\n allowSingleQuotes=allowSingleQuotes, allowNumericLeadingZero=allowNumericLeadingZero,\n allowBackslashEscapingAnyCharacter=allowBackslashEscapingAnyCharacter,\n mode=mode, columnNameOfCorruptRecord=columnNameOfCorruptRecord, dateFormat=dateFormat,\n timestampFormat=timestampFormat, multiLine=multiLine,\n allowUnquotedControlChars=allowUnquotedControlChars, lineSep=lineSep,\n samplingRatio=samplingRatio, dropFieldIfAllNull=dropFieldIfAllNull, encoding=encoding,\n locale=locale)\n if isinstance(path, basestring):\n path = [path]\n if type(path) == list:\n return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n elif isinstance(path, RDD):\n def func(iterator):\n for x in iterator:\n if not isinstance(x, basestring):\n x = unicode(x)\n if isinstance(x, unicode):\n x = x.encode(\"utf-8\")\n yield x\n keyed = path.mapPartitions(func)\n keyed._bypass_serializer = True\n jrdd = keyed._jrdd.map(self._spark._jvm.BytesToString())\n return self._df(self._jreader.json(jrdd))\n else:\n raise TypeError(\"path can be only string, list or RDD\")"} {"instruction": "Given the following Python 3 function, write the documentation\ndef parquet(self, *paths):\n return self._df(self._jreader.parquet(_to_seq(self._spark._sc, paths)))", "response": "Loads Parquet files returning the result as a DataFrame."} {"instruction": "Explain what the following Python 3 code does\ndef text(self, paths, wholetext=False, 
lineSep=None):\n self._set_opts(wholetext=wholetext, lineSep=lineSep)\n if isinstance(paths, basestring):\n paths = [paths]\n return self._df(self._jreader.text(self._spark._sc._jvm.PythonUtils.toSeq(paths)))", "response": "Loads text files and returns a DataFrame containing the schema starts with a\n string column named value followed by partitioned columns if there are any."} {"instruction": "Write a Python 3 function that can\nload a CSV file and returns a DataFrame.", "response": "def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=None,\n comment=None, header=None, inferSchema=None, ignoreLeadingWhiteSpace=None,\n ignoreTrailingWhiteSpace=None, nullValue=None, nanValue=None, positiveInf=None,\n negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None,\n maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None,\n columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None,\n samplingRatio=None, enforceSchema=None, emptyValue=None, locale=None, lineSep=None):\n r\"\"\"Loads a CSV file and returns the result as a :class:`DataFrame`.\n\n This function will go through the input once to determine the input schema if\n ``inferSchema`` is enabled. To avoid going through the entire data once, disable\n ``inferSchema`` option or specify the schema explicitly using ``schema``.\n\n :param path: string, or list of strings, for input path(s),\n or RDD of Strings storing CSV rows.\n :param schema: an optional :class:`pyspark.sql.types.StructType` for the input schema\n or a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``).\n :param sep: sets a single character as a separator for each field and value.\n If None is set, it uses the default value, ``,``.\n :param encoding: decodes the CSV files by the given encoding type. If None is set,\n it uses the default value, ``UTF-8``.\n :param quote: sets a single character used for escaping quoted values where the\n separator can be part of the value. 
If None is set, it uses the default\n value, ``\"``. If you would like to turn off quotations, you need to set an\n empty string.\n :param escape: sets a single character used for escaping quotes inside an already\n quoted value. If None is set, it uses the default value, ``\\``.\n :param comment: sets a single character used for skipping lines beginning with this\n character. By default (None), it is disabled.\n :param header: uses the first line as names of columns. If None is set, it uses the\n default value, ``false``.\n :param inferSchema: infers the input schema automatically from data. It requires one extra\n pass over the data. If None is set, it uses the default value, ``false``.\n :param enforceSchema: If it is set to ``true``, the specified or inferred schema will be\n forcibly applied to datasource files, and headers in CSV files will be\n ignored. If the option is set to ``false``, the schema will be\n validated against all headers in CSV files or the first header in RDD\n if the ``header`` option is set to ``true``. Field names in the schema\n and column names in CSV headers are checked by their positions\n taking into account ``spark.sql.caseSensitive``. If None is set,\n ``true`` is used by default. Though the default value is ``true``,\n it is recommended to disable the ``enforceSchema`` option\n to avoid incorrect results.\n :param ignoreLeadingWhiteSpace: A flag indicating whether or not leading whitespaces from\n values being read should be skipped. If None is set, it\n uses the default value, ``false``.\n :param ignoreTrailingWhiteSpace: A flag indicating whether or not trailing whitespaces from\n values being read should be skipped. If None is set, it\n uses the default value, ``false``.\n :param nullValue: sets the string representation of a null value. If None is set, it uses\n the default value, empty string. 
Since 2.0.1, this ``nullValue`` param\n applies to all supported types including the string type.\n :param nanValue: sets the string representation of a non-number value. If None is set, it\n uses the default value, ``NaN``.\n :param positiveInf: sets the string representation of a positive infinity value. If None\n is set, it uses the default value, ``Inf``.\n :param negativeInf: sets the string representation of a negative infinity value. If None\n is set, it uses the default value, ``Inf``.\n :param dateFormat: sets the string that indicates a date format. Custom date formats\n follow the formats at ``java.time.format.DateTimeFormatter``. This\n applies to date type. If None is set, it uses the\n default value, ``yyyy-MM-dd``.\n :param timestampFormat: sets the string that indicates a timestamp format.\n Custom date formats follow the formats at\n ``java.time.format.DateTimeFormatter``.\n This applies to timestamp type. If None is set, it uses the\n default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSXXX``.\n :param maxColumns: defines a hard limit of how many columns a record can have. If None is\n set, it uses the default value, ``20480``.\n :param maxCharsPerColumn: defines the maximum number of characters allowed for any given\n value being read. If None is set, it uses the default value,\n ``-1`` meaning unlimited length.\n :param maxMalformedLogPerPartition: this parameter is no longer used since Spark 2.2.0.\n If specified, it is ignored.\n :param mode: allows a mode for dealing with corrupt records during parsing. If None is\n set, it uses the default value, ``PERMISSIVE``.\n\n * ``PERMISSIVE`` : when it meets a corrupted record, puts the malformed string \\\n into a field configured by ``columnNameOfCorruptRecord``, and sets malformed \\\n fields to ``null``. To keep corrupt records, an user can set a string type \\\n field named ``columnNameOfCorruptRecord`` in an user-defined schema. 
If a \\\n schema does not have the field, it drops corrupt records during parsing. \\\n A record with less/more tokens than schema is not a corrupted record to CSV. \\\n When it meets a record having fewer tokens than the length of the schema, \\\n sets ``null`` to extra fields. When the record has more tokens than the \\\n length of the schema, it drops extra tokens.\n * ``DROPMALFORMED`` : ignores the whole corrupted records.\n * ``FAILFAST`` : throws an exception when it meets corrupted records.\n\n :param columnNameOfCorruptRecord: allows renaming the new field having malformed string\n created by ``PERMISSIVE`` mode. This overrides\n ``spark.sql.columnNameOfCorruptRecord``. If None is set,\n it uses the value specified in\n ``spark.sql.columnNameOfCorruptRecord``.\n :param multiLine: parse records, which may span multiple lines. If None is\n set, it uses the default value, ``false``.\n :param charToEscapeQuoteEscaping: sets a single character used for escaping the escape for\n the quote character. If None is set, the default value is\n escape character when escape and quote characters are\n different, ``\\0`` otherwise.\n :param samplingRatio: defines fraction of rows used for schema inferring.\n If None is set, it uses the default value, ``1.0``.\n :param emptyValue: sets the string representation of an empty value. If None is set, it uses\n the default value, empty string.\n :param locale: sets a locale as language tag in IETF BCP 47 format. If None is set,\n it uses the default value, ``en-US``. For instance, ``locale`` is used while\n parsing dates and timestamps.\n :param lineSep: defines the line separator that should be used for parsing. 
If None is\n set, it covers all ``\\\\r``, ``\\\\r\\\\n`` and ``\\\\n``.\n Maximum length is 1 character.\n\n >>> df = spark.read.csv('python/test_support/sql/ages.csv')\n >>> df.dtypes\n [('_c0', 'string'), ('_c1', 'string')]\n >>> rdd = sc.textFile('python/test_support/sql/ages.csv')\n >>> df2 = spark.read.csv(rdd)\n >>> df2.dtypes\n [('_c0', 'string'), ('_c1', 'string')]\n \"\"\"\n self._set_opts(\n schema=schema, sep=sep, encoding=encoding, quote=quote, escape=escape, comment=comment,\n header=header, inferSchema=inferSchema, ignoreLeadingWhiteSpace=ignoreLeadingWhiteSpace,\n ignoreTrailingWhiteSpace=ignoreTrailingWhiteSpace, nullValue=nullValue,\n nanValue=nanValue, positiveInf=positiveInf, negativeInf=negativeInf,\n dateFormat=dateFormat, timestampFormat=timestampFormat, maxColumns=maxColumns,\n maxCharsPerColumn=maxCharsPerColumn,\n maxMalformedLogPerPartition=maxMalformedLogPerPartition, mode=mode,\n columnNameOfCorruptRecord=columnNameOfCorruptRecord, multiLine=multiLine,\n charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, samplingRatio=samplingRatio,\n enforceSchema=enforceSchema, emptyValue=emptyValue, locale=locale, lineSep=lineSep)\n if isinstance(path, basestring):\n path = [path]\n if type(path) == list:\n return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))\n elif isinstance(path, RDD):\n def func(iterator):\n for x in iterator:\n if not isinstance(x, basestring):\n x = unicode(x)\n if isinstance(x, unicode):\n x = x.encode(\"utf-8\")\n yield x\n keyed = path.mapPartitions(func)\n keyed._bypass_serializer = True\n jrdd = keyed._jrdd.map(self._spark._jvm.BytesToString())\n # see SPARK-22112\n # There aren't any jvm api for creating a dataframe from rdd storing csv.\n # We can do it through creating a jvm dataset firstly and using the jvm api\n # for creating a dataframe from dataset storing csv.\n jdataset = self._spark._ssql_ctx.createDataset(\n jrdd.rdd(),\n self._spark._jvm.Encoders.STRING())\n return 
self._df(self._jreader.csv(jdataset))\n else:\n raise TypeError(\"path can be only string, list or RDD\")"} {"instruction": "Explain what the following Python 3 code does\ndef orc(self, path):\n if isinstance(path, basestring):\n path = [path]\n return self._df(self._jreader.orc(_to_seq(self._spark._sc, path)))", "response": "Loads ORC files returning the result as a DataFrame."} {"instruction": "Create a Python 3 function for\nconstructing a :class:`DataFrame` representing the database table named ``table`` accessible via JDBC URL ``url`` and connection ``properties``. Partitions of the table will be retrieved in parallel if either ``column`` or ``predicates`` is specified. ``lowerBound`, ``upperBound`` and ``numPartitions`` is needed when ``column`` is specified. If both ``column`` and ``predicates`` are specified, ``column`` will be used. .. note:: Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems. :param url: a JDBC URL of the form ``jdbc:subprotocol:subname`` :param table: the name of the table :param column: the name of an integer column that will be used for partitioning; if this parameter is specified, then ``numPartitions``, ``lowerBound`` (inclusive), and ``upperBound`` (exclusive) will form partition strides for generated WHERE clause expressions used to split the column ``column`` evenly :param lowerBound: the minimum value of ``column`` used to decide partition stride :param upperBound: the maximum value of ``column`` used to decide partition stride :param numPartitions: the number of partitions :param predicates: a list of expressions suitable for inclusion in WHERE clauses; each one defines one partition of the :class:`DataFrame` :param properties: a dictionary of JDBC database connection arguments. Normally at least properties \"user\" and \"password\" with their corresponding values. 
For example { 'user' : 'SYSTEM', 'password' : 'mypassword' } :return: a DataFrame", "response": "def jdbc(self, url, table, column=None, lowerBound=None, upperBound=None, numPartitions=None,\n predicates=None, properties=None):\n \"\"\"\n Construct a :class:`DataFrame` representing the database table named ``table``\n accessible via JDBC URL ``url`` and connection ``properties``.\n\n Partitions of the table will be retrieved in parallel if either ``column`` or\n ``predicates`` is specified. ``lowerBound`, ``upperBound`` and ``numPartitions``\n is needed when ``column`` is specified.\n\n If both ``column`` and ``predicates`` are specified, ``column`` will be used.\n\n .. note:: Don't create too many partitions in parallel on a large cluster;\n otherwise Spark might crash your external database systems.\n\n :param url: a JDBC URL of the form ``jdbc:subprotocol:subname``\n :param table: the name of the table\n :param column: the name of an integer column that will be used for partitioning;\n if this parameter is specified, then ``numPartitions``, ``lowerBound``\n (inclusive), and ``upperBound`` (exclusive) will form partition strides\n for generated WHERE clause expressions used to split the column\n ``column`` evenly\n :param lowerBound: the minimum value of ``column`` used to decide partition stride\n :param upperBound: the maximum value of ``column`` used to decide partition stride\n :param numPartitions: the number of partitions\n :param predicates: a list of expressions suitable for inclusion in WHERE clauses;\n each one defines one partition of the :class:`DataFrame`\n :param properties: a dictionary of JDBC database connection arguments. 
Normally at\n least properties \"user\" and \"password\" with their corresponding values.\n For example { 'user' : 'SYSTEM', 'password' : 'mypassword' }\n :return: a DataFrame\n \"\"\"\n if properties is None:\n properties = dict()\n jprop = JavaClass(\"java.util.Properties\", self._spark._sc._gateway._gateway_client)()\n for k in properties:\n jprop.setProperty(k, properties[k])\n if column is not None:\n assert lowerBound is not None, \"lowerBound can not be None when ``column`` is specified\"\n assert upperBound is not None, \"upperBound can not be None when ``column`` is specified\"\n assert numPartitions is not None, \\\n \"numPartitions can not be None when ``column`` is specified\"\n return self._df(self._jreader.jdbc(url, table, column, int(lowerBound), int(upperBound),\n int(numPartitions), jprop))\n if predicates is not None:\n gateway = self._spark._sc._gateway\n jpredicates = utils.toJArray(gateway, gateway.jvm.java.lang.String, predicates)\n return self._df(self._jreader.jdbc(url, table, jpredicates, jprop))\n return self._df(self._jreader.jdbc(url, table, jprop))"} {"instruction": "Can you implement a function in Python 3 that\nspecifies the behavior when data or table already exists.", "response": "def mode(self, saveMode):\n \"\"\"Specifies the behavior when data or table already exists.\n\n Options include:\n\n * `append`: Append contents of this :class:`DataFrame` to existing data.\n * `overwrite`: Overwrite existing data.\n * `error` or `errorifexists`: Throw an exception if data already exists.\n * `ignore`: Silently ignore this operation if data already exists.\n\n >>> df.write.mode('append').parquet(os.path.join(tempfile.mkdtemp(), 'data'))\n \"\"\"\n # At the JVM side, the default value of mode is already set to \"error\".\n # So, if the given saveMode is None, we will not call JVM-side's mode method.\n if saveMode is not None:\n self._jwrite = self._jwrite.mode(saveMode)\n return self"} {"instruction": "How would you explain what the 
following Python 3 function does\ndef format(self, source):\n self._jwrite = self._jwrite.format(source)\n return self", "response": "Specifies the underlying output data source."} {"instruction": "Write a Python 3 script for\nadding an option to the underlying data source.", "response": "def option(self, key, value):\n \"\"\"Adds an output option for the underlying data source.\n\n You can set the following option(s) for writing files:\n * ``timeZone``: sets the string that indicates a timezone to be used to format\n timestamps in the JSON/CSV datasources or partition values.\n If it isn't set, it uses the default value, session local timezone.\n \"\"\"\n self._jwrite = self._jwrite.option(key, to_str(value))\n return self"} {"instruction": "How would you implement a function in Python 3 that\nadds output options for the underlying data source.", "response": "def options(self, **options):\n \"\"\"Adds output options for the underlying data source.\n\n You can set the following option(s) for writing files:\n * ``timeZone``: sets the string that indicates a timezone to be used to format\n timestamps in the JSON/CSV datasources or partition values.\n If it isn't set, it uses the default value, session local timezone.\n \"\"\"\n for k in options:\n self._jwrite = self._jwrite.option(k, to_str(options[k]))\n return self"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef partitionBy(self, *cols):\n if len(cols) == 1 and isinstance(cols[0], (list, tuple)):\n cols = cols[0]\n self._jwrite = self._jwrite.partitionBy(_to_seq(self._spark._sc, cols))\n return self", "response": "Partitions the output by the given columns on the file system."} {"instruction": "Explain what the following Python 3 code does\ndef sortBy(self, col, *cols):\n if isinstance(col, (list, tuple)):\n if cols:\n raise ValueError(\"col is a {0} but cols are not empty\".format(type(col)))\n\n col, cols = col[0], col[1:]\n\n if not all(isinstance(c, basestring) 
for c in cols) or not(isinstance(col, basestring)):\n raise TypeError(\"all names should be `str`\")\n\n self._jwrite = self._jwrite.sortBy(col, _to_seq(self._spark._sc, cols))\n return self", "response": "Sorts the output in each bucket by the given columns on the file system."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef save(self, path=None, format=None, mode=None, partitionBy=None, **options):\n self.mode(mode).options(**options)\n if partitionBy is not None:\n self.partitionBy(partitionBy)\n if format is not None:\n self.format(format)\n if path is None:\n self._jwrite.save()\n else:\n self._jwrite.save(path)", "response": "Saves the contents of the current DataFrame to a HDF5 file system."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef insertInto(self, tableName, overwrite=False):\n self._jwrite.mode(\"overwrite\" if overwrite else \"append\").insertInto(tableName)", "response": "Inserts the content of the class:`DataFrame` into the specified table."} {"instruction": "Create a Python 3 function to\nsave the contents of the current DataFrame to the specified table.", "response": "def saveAsTable(self, name, format=None, mode=None, partitionBy=None, **options):\n \"\"\"Saves the content of the :class:`DataFrame` as the specified table.\n\n In the case the table already exists, behavior of this function depends on the\n save mode, specified by the `mode` function (default to throwing an exception).\n When `mode` is `Overwrite`, the schema of the :class:`DataFrame` does not need to be\n the same as that of the existing table.\n\n * `append`: Append contents of this :class:`DataFrame` to existing data.\n * `overwrite`: Overwrite existing data.\n * `error` or `errorifexists`: Throw an exception if data already exists.\n * `ignore`: Silently ignore this operation if data already exists.\n\n :param name: the table name\n :param format: the format used to save\n :param mode: 
one of `append`, `overwrite`, `error`, `errorifexists`, `ignore` \\\n (default: error)\n :param partitionBy: names of partitioning columns\n :param options: all other string options\n \"\"\"\n self.mode(mode).options(**options)\n if partitionBy is not None:\n self.partitionBy(partitionBy)\n if format is not None:\n self.format(format)\n self._jwrite.saveAsTable(name)"} {"instruction": "Write a Python 3 script to\nsave the contents of the DataFrame in JSON format at the specified path.", "response": "def json(self, path, mode=None, compression=None, dateFormat=None, timestampFormat=None,\n lineSep=None, encoding=None):\n \"\"\"Saves the content of the :class:`DataFrame` in JSON format\n (`JSON Lines text format or newline-delimited JSON `_) at the\n specified path.\n\n :param path: the path in any Hadoop supported file system\n :param mode: specifies the behavior of the save operation when data already exists.\n\n * ``append``: Append contents of this :class:`DataFrame` to existing data.\n * ``overwrite``: Overwrite existing data.\n * ``ignore``: Silently ignore this operation if data already exists.\n * ``error`` or ``errorifexists`` (default case): Throw an exception if data already \\\n exists.\n :param compression: compression codec to use when saving to file. This can be one of the\n known case-insensitive shorten names (none, bzip2, gzip, lz4,\n snappy and deflate).\n :param dateFormat: sets the string that indicates a date format. Custom date formats\n follow the formats at ``java.time.format.DateTimeFormatter``. This\n applies to date type. If None is set, it uses the\n default value, ``yyyy-MM-dd``.\n :param timestampFormat: sets the string that indicates a timestamp format.\n Custom date formats follow the formats at\n ``java.time.format.DateTimeFormatter``.\n This applies to timestamp type. If None is set, it uses the\n default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSXXX``.\n :param encoding: specifies encoding (charset) of saved json files. 
If None is set,\n the default UTF-8 charset will be used.\n :param lineSep: defines the line separator that should be used for writing. If None is\n set, it uses the default value, ``\\\\n``.\n\n >>> df.write.json(os.path.join(tempfile.mkdtemp(), 'data'))\n \"\"\"\n self.mode(mode)\n self._set_opts(\n compression=compression, dateFormat=dateFormat, timestampFormat=timestampFormat,\n lineSep=lineSep, encoding=encoding)\n self._jwrite.json(path)"} {"instruction": "Can you create a Python 3 function that\nsaves the contents of the DataFrame in Parquet format at the specified path.", "response": "def parquet(self, path, mode=None, partitionBy=None, compression=None):\n \"\"\"Saves the content of the :class:`DataFrame` in Parquet format at the specified path.\n\n :param path: the path in any Hadoop supported file system\n :param mode: specifies the behavior of the save operation when data already exists.\n\n * ``append``: Append contents of this :class:`DataFrame` to existing data.\n * ``overwrite``: Overwrite existing data.\n * ``ignore``: Silently ignore this operation if data already exists.\n * ``error`` or ``errorifexists`` (default case): Throw an exception if data already \\\n exists.\n :param partitionBy: names of partitioning columns\n :param compression: compression codec to use when saving to file. This can be one of the\n known case-insensitive shorten names (none, uncompressed, snappy, gzip,\n lzo, brotli, lz4, and zstd). This will override\n ``spark.sql.parquet.compression.codec``. 
If None is set, it uses the\n value specified in ``spark.sql.parquet.compression.codec``.\n\n >>> df.write.parquet(os.path.join(tempfile.mkdtemp(), 'data'))\n \"\"\"\n self.mode(mode)\n if partitionBy is not None:\n self.partitionBy(partitionBy)\n self._set_opts(compression=compression)\n self._jwrite.parquet(path)"} {"instruction": "Implement a Python 3 function for\nsaving the content of the DataFrame in a text file at the specified path.", "response": "def text(self, path, compression=None, lineSep=None):\n \"\"\"Saves the content of the DataFrame in a text file at the specified path.\n The text files will be encoded as UTF-8.\n\n :param path: the path in any Hadoop supported file system\n :param compression: compression codec to use when saving to file. This can be one of the\n known case-insensitive shorten names (none, bzip2, gzip, lz4,\n snappy and deflate).\n :param lineSep: defines the line separator that should be used for writing. If None is\n set, it uses the default value, ``\\\\n``.\n\n The DataFrame must have only one column that is of string type.\n Each row becomes a new line in the output file.\n \"\"\"\n self._set_opts(compression=compression, lineSep=lineSep)\n self._jwrite.text(path)"} {"instruction": "Write a Python 3 script for\nsaving the contents of the DataFrame in ORC format at the specified path.", "response": "def orc(self, path, mode=None, partitionBy=None, compression=None):\n \"\"\"Saves the content of the :class:`DataFrame` in ORC format at the specified path.\n\n :param path: the path in any Hadoop supported file system\n :param mode: specifies the behavior of the save operation when data already exists.\n\n * ``append``: Append contents of this :class:`DataFrame` to existing data.\n * ``overwrite``: Overwrite existing data.\n * ``ignore``: Silently ignore this operation if data already exists.\n * ``error`` or ``errorifexists`` (default case): Throw an exception if data already \\\n exists.\n :param partitionBy: names of 
partitioning columns\n :param compression: compression codec to use when saving to file. This can be one of the\n known case-insensitive shorten names (none, snappy, zlib, and lzo).\n This will override ``orc.compress`` and\n ``spark.sql.orc.compression.codec``. If None is set, it uses the value\n specified in ``spark.sql.orc.compression.codec``.\n\n >>> orc_df = spark.read.orc('python/test_support/sql/orc_partitioned')\n >>> orc_df.write.orc(os.path.join(tempfile.mkdtemp(), 'data'))\n \"\"\"\n self.mode(mode)\n if partitionBy is not None:\n self.partitionBy(partitionBy)\n self._set_opts(compression=compression)\n self._jwrite.orc(path)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef jdbc(self, url, table, mode=None, properties=None):\n if properties is None:\n properties = dict()\n jprop = JavaClass(\"java.util.Properties\", self._spark._sc._gateway._gateway_client)()\n for k in properties:\n jprop.setProperty(k, properties[k])\n self.mode(mode)._jwrite.jdbc(url, table, jprop)", "response": "Saves the contents of the DataFrame to an external database table via JDBC."} {"instruction": "Implement a function in Python 3 to\ncreate an input stream that pulls messages from a Kinesis stream.", "response": "def createStream(ssc, kinesisAppName, streamName, endpointUrl, regionName,\n initialPositionInStream, checkpointInterval,\n storageLevel=StorageLevel.MEMORY_AND_DISK_2,\n awsAccessKeyId=None, awsSecretKey=None, decoder=utf8_decoder,\n stsAssumeRoleArn=None, stsSessionName=None, stsExternalId=None):\n \"\"\"\n Create an input stream that pulls messages from a Kinesis stream. This uses the\n Kinesis Client Library (KCL) to pull messages from Kinesis.\n\n .. note:: The given AWS credentials will get saved in DStream checkpoints if checkpointing\n is enabled. 
Make sure that your checkpoint directory is secure.\n\n :param ssc: StreamingContext object\n :param kinesisAppName: Kinesis application name used by the Kinesis Client Library (KCL) to\n update DynamoDB\n :param streamName: Kinesis stream name\n :param endpointUrl: Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com)\n :param regionName: Name of region used by the Kinesis Client Library (KCL) to update\n DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics)\n :param initialPositionInStream: In the absence of Kinesis checkpoint info, this is the\n worker's initial starting position in the stream. The\n values are either the beginning of the stream per Kinesis'\n limit of 24 hours (InitialPositionInStream.TRIM_HORIZON) or\n the tip of the stream (InitialPositionInStream.LATEST).\n :param checkpointInterval: Checkpoint interval for Kinesis checkpointing. See the Kinesis\n Spark Streaming documentation for more details on the different\n types of checkpoints.\n :param storageLevel: Storage level to use for storing the received objects (default is\n StorageLevel.MEMORY_AND_DISK_2)\n :param awsAccessKeyId: AWS AccessKeyId (default is None. If None, will use\n DefaultAWSCredentialsProviderChain)\n :param awsSecretKey: AWS SecretKey (default is None. 
If None, will use\n DefaultAWSCredentialsProviderChain)\n :param decoder: A function used to decode value (default is utf8_decoder)\n :param stsAssumeRoleArn: ARN of IAM role to assume when using STS sessions to read from\n the Kinesis stream (default is None).\n :param stsSessionName: Name to uniquely identify STS sessions used to read from Kinesis\n stream, if STS is being used (default is None).\n :param stsExternalId: External ID that can be used to validate against the assumed IAM\n role's trust policy, if STS is being used (default is None).\n :return: A DStream object\n \"\"\"\n jlevel = ssc._sc._getJavaStorageLevel(storageLevel)\n jduration = ssc._jduration(checkpointInterval)\n\n try:\n # Use KinesisUtilsPythonHelper to access Scala's KinesisUtils\n helper = ssc._jvm.org.apache.spark.streaming.kinesis.KinesisUtilsPythonHelper()\n except TypeError as e:\n if str(e) == \"'JavaPackage' object is not callable\":\n _print_missing_jar(\n \"Streaming's Kinesis\",\n \"streaming-kinesis-asl\",\n \"streaming-kinesis-asl-assembly\",\n ssc.sparkContext.version)\n raise\n jstream = helper.createStream(ssc._jssc, kinesisAppName, streamName, endpointUrl,\n regionName, initialPositionInStream, jduration, jlevel,\n awsAccessKeyId, awsSecretKey, stsAssumeRoleArn,\n stsSessionName, stsExternalId)\n stream = DStream(jstream, ssc, NoOpSerializer())\n return stream.map(lambda v: decoder(v))"} {"instruction": "Write a Python 3 script to\nprompt the user to assign the issue to in jira.", "response": "def choose_jira_assignee(issue, asf_jira):\n \"\"\"\n Prompt the user to choose who to assign the issue to in jira, given a list of candidates,\n including the original reporter and all commentors\n \"\"\"\n while True:\n try:\n reporter = issue.fields.reporter\n commentors = map(lambda x: x.author, issue.fields.comment.comments)\n candidates = set(commentors)\n candidates.add(reporter)\n candidates = list(candidates)\n print(\"JIRA is unassigned, choose assignee\")\n for idx, author 
in enumerate(candidates):\n if author.key == \"apachespark\":\n continue\n annotations = [\"Reporter\"] if author == reporter else []\n if author in commentors:\n annotations.append(\"Commentor\")\n print(\"[%d] %s (%s)\" % (idx, author.displayName, \",\".join(annotations)))\n raw_assignee = input(\n \"Enter number of user, or userid, to assign to (blank to leave unassigned):\")\n if raw_assignee == \"\":\n return None\n else:\n try:\n id = int(raw_assignee)\n assignee = candidates[id]\n except:\n # assume it's a user id, and try to assign (might fail, we just prompt again)\n assignee = asf_jira.user(raw_assignee)\n asf_jira.assign_issue(issue.key, assignee.key)\n return assignee\n except KeyboardInterrupt:\n raise\n except:\n traceback.print_exc()\n print(\"Error assigning JIRA, try again (or leave blank and fix manually)\")"} {"instruction": "How would you code a function in Python 3 to\nstandardize the JIRA ref.", "response": "def standardize_jira_ref(text):\n \"\"\"\n Standardize the [SPARK-XXXXX] [MODULE] prefix\n Converts \"[SPARK-XXX][mllib] Issue\", \"[MLLib] SPARK-XXX. Issue\" or \"SPARK XXX [MLLIB]: Issue\" to\n \"[SPARK-XXX][MLLIB] Issue\"\n\n >>> standardize_jira_ref(\n ... \"[SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete is successful\")\n '[SPARK-5821][SQL] ParquetRelation2 CTAS should check if delete is successful'\n >>> standardize_jira_ref(\n ... \"[SPARK-4123][Project Infra][WIP]: Show new dependencies added in pull requests\")\n '[SPARK-4123][PROJECT INFRA][WIP] Show new dependencies added in pull requests'\n >>> standardize_jira_ref(\"[MLlib] Spark 5954: Top by key\")\n '[SPARK-5954][MLLIB] Top by key'\n >>> standardize_jira_ref(\"[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl\")\n '[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl'\n >>> standardize_jira_ref(\n ... 
\"SPARK-1094 Support MiMa for reporting binary compatibility across versions.\")\n '[SPARK-1094] Support MiMa for reporting binary compatibility across versions.'\n >>> standardize_jira_ref(\"[WIP] [SPARK-1146] Vagrant support for Spark\")\n '[SPARK-1146][WIP] Vagrant support for Spark'\n >>> standardize_jira_ref(\n ... \"SPARK-1032. If Yarn app fails before registering, app master stays aroun...\")\n '[SPARK-1032] If Yarn app fails before registering, app master stays aroun...'\n >>> standardize_jira_ref(\n ... \"[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in DDL parser.\")\n '[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in DDL parser.'\n >>> standardize_jira_ref(\"Additional information for users building from source code\")\n 'Additional information for users building from source code'\n \"\"\"\n jira_refs = []\n components = []\n\n # If the string is compliant, no need to process any further\n if (re.search(r'^\\[SPARK-[0-9]{3,6}\\](\\[[A-Z0-9_\\s,]+\\] )+\\S+', text)):\n return text\n\n # Extract JIRA ref(s):\n pattern = re.compile(r'(SPARK[-\\s]*[0-9]{3,6})+', re.IGNORECASE)\n for ref in pattern.findall(text):\n # Add brackets, replace spaces with a dash, & convert to uppercase\n jira_refs.append('[' + re.sub(r'\\s+', '-', ref.upper()) + ']')\n text = text.replace(ref, '')\n\n # Extract spark component(s):\n # Look for alphanumeric chars, spaces, dashes, periods, and/or commas\n pattern = re.compile(r'(\\[[\\w\\s,.-]+\\])', re.IGNORECASE)\n for component in pattern.findall(text):\n components.append(component.upper())\n text = text.replace(component, '')\n\n # Cleanup any remaining symbols:\n pattern = re.compile(r'^\\W+(.*)', re.IGNORECASE)\n if (pattern.search(text) is not None):\n text = pattern.search(text).groups()[0]\n\n # Assemble full text (JIRA ref(s), module(s), remaining text)\n clean_text = ''.join(jira_refs).strip() + ''.join(components).strip() + \" \" + text.strip()\n\n # Replace multiple 
spaces with a single space, e.g. if no jira refs and/or components were\n # included\n clean_text = re.sub(r'\\s+', ' ', clean_text.strip())\n\n return clean_text"} {"instruction": "Can you write a function in Python 3 where it\nparses a LIBSVM line into label indices and values.", "response": "def _parse_libsvm_line(line):\n \"\"\"\n Parses a line in LIBSVM format into (label, indices, values).\n \"\"\"\n items = line.split(None)\n label = float(items[0])\n nnz = len(items) - 1\n indices = np.zeros(nnz, dtype=np.int32)\n values = np.zeros(nnz)\n for i in xrange(nnz):\n index, value = items[1 + i].split(\":\")\n indices[i] = int(index) - 1\n values[i] = float(value)\n return label, indices, values"} {"instruction": "Make a summary of the following Python 3 code\ndef _convert_labeled_point_to_libsvm(p):\n from pyspark.mllib.regression import LabeledPoint\n assert isinstance(p, LabeledPoint)\n items = [str(p.label)]\n v = _convert_to_vector(p.features)\n if isinstance(v, SparseVector):\n nnz = len(v.indices)\n for i in xrange(nnz):\n items.append(str(v.indices[i] + 1) + \":\" + str(v.values[i]))\n else:\n for i in xrange(len(v)):\n items.append(str(i + 1) + \":\" + str(v[i]))\n return \" \".join(items)", "response": "Converts a LabeledPoint to a string in LIBSVM format."} {"instruction": "How would you explain what the following Python 3 function does\ndef loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None):\n from pyspark.mllib.regression import LabeledPoint\n\n lines = sc.textFile(path, minPartitions)\n parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l))\n if numFeatures <= 0:\n parsed.cache()\n numFeatures = parsed.map(lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1\n return parsed.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2])))", "response": "Loads labeled data from a LIBSVM file."} {"instruction": "Explain what the following Python 3 code does\ndef saveAsLibSVMFile(data, dir):\n lines = data.map(lambda p: 
MLUtils._convert_labeled_point_to_libsvm(p))\n lines.saveAsTextFile(dir)", "response": "Save labeled data in LIBSVM format."} {"instruction": "Write a Python 3 script for\nloading labeled points from a file or directory path.", "response": "def loadLabeledPoints(sc, path, minPartitions=None):\n \"\"\"\n Load labeled points saved using RDD.saveAsTextFile.\n\n :param sc: Spark context\n :param path: file or directory path in any Hadoop-supported file\n system URI\n :param minPartitions: min number of partitions\n @return: labeled data stored as an RDD of LabeledPoint\n\n >>> from tempfile import NamedTemporaryFile\n >>> from pyspark.mllib.util import MLUtils\n >>> from pyspark.mllib.regression import LabeledPoint\n >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])),\n ... LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]\n >>> tempFile = NamedTemporaryFile(delete=True)\n >>> tempFile.close()\n >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name)\n >>> MLUtils.loadLabeledPoints(sc, tempFile.name).collect()\n [LabeledPoint(1.1, (3,[0,2],[-1.23,4.56e-07])), LabeledPoint(0.0, [1.01,2.02,3.03])]\n \"\"\"\n minPartitions = minPartitions or min(sc.defaultParallelism, 2)\n return callMLlibFunc(\"loadLabeledPoints\", sc, path, minPartitions)"} {"instruction": "Implement a function in Python 3 to\nreturn a new vector with 1. 
0 ( bias ) appended to the end of the input vector.", "response": "def appendBias(data):\n \"\"\"\n Returns a new vector with `1.0` (bias) appended to\n the end of the input vector.\n \"\"\"\n vec = _convert_to_vector(data)\n if isinstance(vec, SparseVector):\n newIndices = np.append(vec.indices, len(vec))\n newValues = np.append(vec.values, 1.0)\n return SparseVector(len(vec) + 1, newIndices, newValues)\n else:\n return _convert_to_vector(np.append(vec.toArray(), 1.0))"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef convertVectorColumnsToML(dataset, *cols):\n if not isinstance(dataset, DataFrame):\n raise TypeError(\"Input dataset must be a DataFrame but got {}.\".format(type(dataset)))\n return callMLlibFunc(\"convertVectorColumnsToML\", dataset, list(cols))", "response": "Converts a list of vector columns in an input DataFrame from the pyspark. ml. linalg. Vector type to the newml."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef generateLinearInput(intercept, weights, xMean, xVariance,\n nPoints, seed, eps):\n \"\"\"\n :param: intercept bias factor, the term c in X'w + c\n :param: weights feature vector, the term w in X'w + c\n :param: xMean Point around which the data X is centered.\n :param: xVariance Variance of the given data\n :param: nPoints Number of points to be generated\n :param: seed Random Seed\n :param: eps Used to scale the noise. 
If eps is set high,\n the amount of gaussian noise added is more.\n\n Returns a list of LabeledPoints of length nPoints\n \"\"\"\n weights = [float(weight) for weight in weights]\n xMean = [float(mean) for mean in xMean]\n xVariance = [float(var) for var in xVariance]\n return list(callMLlibFunc(\n \"generateLinearInputWrapper\", float(intercept), weights, xMean,\n xVariance, int(nPoints), int(seed), float(eps)))", "response": "This function generates a linear input for the given data."} {"instruction": "Can you write a function in Python 3 where it\ngenerates an RDD of LabeledPoints.", "response": "def generateLinearRDD(sc, nexamples, nfeatures, eps,\n nParts=2, intercept=0.0):\n \"\"\"\n Generate an RDD of LabeledPoints.\n \"\"\"\n return callMLlibFunc(\n \"generateLinearRDDWrapper\", sc, int(nexamples), int(nfeatures),\n float(eps), int(nParts), float(intercept))"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef train(cls, data, iterations=100, step=1.0, miniBatchFraction=1.0,\n initialWeights=None, regParam=0.0, regType=None, intercept=False,\n validateData=True, convergenceTol=0.001):\n \"\"\"\n Train a linear regression model using Stochastic Gradient\n Descent (SGD). This solves the least squares regression\n formulation\n\n f(weights) = 1/(2n) ||A weights - y||^2\n\n which is the mean squared error. 
Here the data matrix has n rows,\n and the input RDD holds the set of rows of A, each with its\n corresponding right hand side label y.\n See also the documentation for the precise formulation.\n\n :param data:\n The training data, an RDD of LabeledPoint.\n :param iterations:\n The number of iterations.\n (default: 100)\n :param step:\n The step parameter used in SGD.\n (default: 1.0)\n :param miniBatchFraction:\n Fraction of data to be used for each SGD iteration.\n (default: 1.0)\n :param initialWeights:\n The initial weights.\n (default: None)\n :param regParam:\n The regularizer parameter.\n (default: 0.0)\n :param regType:\n The type of regularizer used for training our model.\n Supported values:\n\n - \"l1\" for using L1 regularization\n - \"l2\" for using L2 regularization\n - None for no regularization (default)\n :param intercept:\n Boolean parameter which indicates the use or not of the\n augmented representation for training data (i.e., whether bias\n features are activated or not).\n (default: False)\n :param validateData:\n Boolean parameter which indicates if the algorithm should\n validate data before training.\n (default: True)\n :param convergenceTol:\n A condition which decides iteration termination.\n (default: 0.001)\n \"\"\"\n warnings.warn(\n \"Deprecated in 2.0.0. 
Use ml.regression.LinearRegression.\", DeprecationWarning)\n\n def train(rdd, i):\n return callMLlibFunc(\"trainLinearRegressionModelWithSGD\", rdd, int(iterations),\n float(step), float(miniBatchFraction), i, float(regParam),\n regType, bool(intercept), bool(validateData),\n float(convergenceTol))\n\n return _regression_train_wrapper(train, LinearRegressionModel, data, initialWeights)", "response": "Train a linear regression model using Stochastic Gradient\n Descent SGD."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef predict(self, x):\n if isinstance(x, RDD):\n return x.map(lambda v: self.predict(v))\n return np.interp(x, self.boundaries, self.predictions)", "response": "Predict labels for provided features."} {"instruction": "Given the following Python 3 function, write the documentation\ndef save(self, sc, path):\n java_boundaries = _py2java(sc, self.boundaries.tolist())\n java_predictions = _py2java(sc, self.predictions.tolist())\n java_model = sc._jvm.org.apache.spark.mllib.regression.IsotonicRegressionModel(\n java_boundaries, java_predictions, self.isotonic)\n java_model.save(sc._jsc.sc(), path)", "response": "Save an IsotonicRegressionModel to a file."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef load(cls, sc, path):\n java_model = sc._jvm.org.apache.spark.mllib.regression.IsotonicRegressionModel.load(\n sc._jsc.sc(), path)\n py_boundaries = _java2py(sc, java_model.boundaryVector()).toArray()\n py_predictions = _java2py(sc, java_model.predictionVector()).toArray()\n return IsotonicRegressionModel(py_boundaries, py_predictions, java_model.isotonic)", "response": "Load an IsotonicRegressionModel from a file."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef train(cls, data, isotonic=True):\n boundaries, predictions = callMLlibFunc(\"trainIsotonicRegressionModel\",\n data.map(_convert_to_vector), bool(isotonic))\n return 
IsotonicRegressionModel(boundaries.toArray(), predictions.toArray(), isotonic)", "response": "Train an isotonic regression model on the given data."} {"instruction": "Create a Python 3 function for\ncomputing the column similarities between the given column.", "response": "def columnSimilarities(self, threshold=0.0):\n \"\"\"\n Compute similarities between columns of this matrix.\n\n The threshold parameter is a trade-off knob between estimate\n quality and computational cost.\n\n The default threshold setting of 0 guarantees deterministically\n correct results, but uses the brute-force approach of computing\n normalized dot products.\n\n Setting the threshold to positive values uses a sampling\n approach and incurs strictly less computational cost than the\n brute-force approach. However the similarities computed will\n be estimates.\n\n The sampling guarantees relative-error correctness for those\n pairs of columns that have similarity greater than the given\n similarity threshold.\n\n To describe the guarantee, we set some notation:\n * Let A be the smallest in magnitude non-zero element of\n this matrix.\n * Let B be the largest in magnitude non-zero element of\n this matrix.\n * Let L be the maximum number of non-zeros per row.\n\n For example, for {0,1} matrices: A=B=1.\n Another example, for the Netflix matrix: A=1, B=5\n\n For those column pairs that are above the threshold, the\n computed similarity is correct to within 20% relative error\n with probability at least 1 - (0.981)^10/B^\n\n The shuffle size is bounded by the *smaller* of the following\n two expressions:\n\n * O(n log(n) L / (threshold * A))\n * O(m L^2^)\n\n The latter is the cost of the brute-force approach, so for\n non-zero thresholds, the cost is always cheaper than the\n brute-force approach.\n\n :param: threshold: Set to 0 for deterministic guaranteed\n correctness. 
Similarities above this\n threshold are estimated with the cost vs\n estimate quality trade-off described above.\n :return: An n x n sparse upper-triangular CoordinateMatrix of\n cosine similarities between columns of this matrix.\n\n >>> rows = sc.parallelize([[1, 2], [1, 5]])\n >>> mat = RowMatrix(rows)\n\n >>> sims = mat.columnSimilarities()\n >>> sims.entries.first().value\n 0.91914503...\n \"\"\"\n java_sims_mat = self._java_matrix_wrapper.call(\"columnSimilarities\", float(threshold))\n return CoordinateMatrix(java_sims_mat)"} {"instruction": "Create a Python 3 function to\ncompute the QR decomposition of this RowMatrix.", "response": "def tallSkinnyQR(self, computeQ=False):\n \"\"\"\n Compute the QR decomposition of this RowMatrix.\n\n The implementation is designed to optimize the QR decomposition\n (factorization) for the RowMatrix of a tall and skinny shape.\n\n Reference:\n Paul G. Constantine, David F. Gleich. \"Tall and skinny QR\n factorizations in MapReduce architectures\"\n ([[https://doi.org/10.1145/1996092.1996103]])\n\n :param: computeQ: whether to computeQ\n :return: QRDecomposition(Q: RowMatrix, R: Matrix), where\n Q = None if computeQ = false.\n\n >>> rows = sc.parallelize([[3, -6], [4, -8], [0, 1]])\n >>> mat = RowMatrix(rows)\n >>> decomp = mat.tallSkinnyQR(True)\n >>> Q = decomp.Q\n >>> R = decomp.R\n\n >>> # Test with absolute values\n >>> absQRows = Q.rows.map(lambda row: abs(row.toArray()).tolist())\n >>> absQRows.collect()\n [[0.6..., 0.0], [0.8..., 0.0], [0.0, 1.0]]\n\n >>> # Test with absolute values\n >>> abs(R.toArray()).tolist()\n [[5.0, 10.0], [0.0, 1.0]]\n \"\"\"\n decomp = JavaModelWrapper(self._java_matrix_wrapper.call(\"tallSkinnyQR\", computeQ))\n if computeQ:\n java_Q = decomp.call(\"Q\")\n Q = RowMatrix(java_Q)\n else:\n Q = None\n R = decomp.call(\"R\")\n return QRDecomposition(Q, R)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef computeSVD(self, k, computeU=False, rCond=1e-9):\n 
j_model = self._java_matrix_wrapper.call(\n \"computeSVD\", int(k), bool(computeU), float(rCond))\n return SingularValueDecomposition(j_model)", "response": "Computes the singular value decomposition of the row matrix A."} {"instruction": "Given the following Python 3 function, write the documentation\ndef multiply(self, matrix):\n if not isinstance(matrix, DenseMatrix):\n raise ValueError(\"Only multiplication with DenseMatrix \"\n \"is supported.\")\n j_model = self._java_matrix_wrapper.call(\"multiply\", matrix)\n return RowMatrix(j_model)", "response": "Multiply this matrix by a dense matrix on the right."} {"instruction": "Can you write a function in Python 3 where it\nreturns a distributed matrix whose columns are the leftometry of the SingularValueDecomposition.", "response": "def U(self):\n \"\"\"\n Returns a distributed matrix whose columns are the left\n singular vectors of the SingularValueDecomposition if computeU was set to be True.\n \"\"\"\n u = self.call(\"U\")\n if u is not None:\n mat_name = u.getClass().getSimpleName()\n if mat_name == \"RowMatrix\":\n return RowMatrix(u)\n elif mat_name == \"IndexedRowMatrix\":\n return IndexedRowMatrix(u)\n else:\n raise TypeError(\"Expected RowMatrix/IndexedRowMatrix got %s\" % mat_name)"} {"instruction": "Implement a Python 3 function for\nreturning an RDD of IndexedRows.", "response": "def rows(self):\n \"\"\"\n Rows of the IndexedRowMatrix stored as an RDD of IndexedRows.\n\n >>> mat = IndexedRowMatrix(sc.parallelize([IndexedRow(0, [1, 2, 3]),\n ... IndexedRow(1, [4, 5, 6])]))\n >>> rows = mat.rows\n >>> rows.first()\n IndexedRow(0, [1.0,2.0,3.0])\n \"\"\"\n # We use DataFrames for serialization of IndexedRows from\n # Java, so we first convert the RDD of rows to a DataFrame\n # on the Scala/Java side. 
Then we map each Row in the\n # DataFrame back to an IndexedRow on this side.\n rows_df = callMLlibFunc(\"getIndexedRows\", self._java_matrix_wrapper._java_model)\n rows = rows_df.rdd.map(lambda row: IndexedRow(row[0], row[1]))\n return rows"} {"instruction": "Explain what the following Python 3 code does\ndef toBlockMatrix(self, rowsPerBlock=1024, colsPerBlock=1024):\n java_block_matrix = self._java_matrix_wrapper.call(\"toBlockMatrix\",\n rowsPerBlock,\n colsPerBlock)\n return BlockMatrix(java_block_matrix, rowsPerBlock, colsPerBlock)", "response": "Convert this matrix to a BlockMatrix."} {"instruction": "Write a Python 3 script to\nmultiply this matrix by a dense matrix on the right.", "response": "def multiply(self, matrix):\n \"\"\"\n Multiply this matrix by a local dense matrix on the right.\n\n :param matrix: a local dense matrix whose number of rows must match the number of columns\n of this matrix\n :returns: :py:class:`IndexedRowMatrix`\n\n >>> mat = IndexedRowMatrix(sc.parallelize([(0, (0, 1)), (1, (2, 3))]))\n >>> mat.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect()\n [IndexedRow(0, [2.0,3.0]), IndexedRow(1, [6.0,11.0])]\n \"\"\"\n if not isinstance(matrix, DenseMatrix):\n raise ValueError(\"Only multiplication with DenseMatrix \"\n \"is supported.\")\n return IndexedRowMatrix(self._java_matrix_wrapper.call(\"multiply\", matrix))"} {"instruction": "How would you explain what the following Python 3 function does\ndef entries(self):\n # We use DataFrames for serialization of MatrixEntry entries\n # from Java, so we first convert the RDD of entries to a\n # DataFrame on the Scala/Java side. 
Then we map each Row in\n # the DataFrame back to a MatrixEntry on this side.\n entries_df = callMLlibFunc(\"getMatrixEntries\", self._java_matrix_wrapper._java_model)\n entries = entries_df.rdd.map(lambda row: MatrixEntry(row[0], row[1], row[2]))\n return entries", "response": "Return the entries of the CoordinateMatrix stored as an RDD of MatrixEntries."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef blocks(self):\n # We use DataFrames for serialization of sub-matrix blocks\n # from Java, so we first convert the RDD of blocks to a\n # DataFrame on the Scala/Java side. Then we map each Row in\n # the DataFrame back to a sub-matrix block on this side.\n blocks_df = callMLlibFunc(\"getMatrixBlocks\", self._java_matrix_wrapper._java_model)\n blocks = blocks_df.rdd.map(lambda row: ((row[0][0], row[0][1]), row[1]))\n return blocks", "response": "Returns the RDD of blocks in this Scala matrix."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef persist(self, storageLevel):\n if not isinstance(storageLevel, StorageLevel):\n raise TypeError(\"`storageLevel` should be a StorageLevel, got %s\" % type(storageLevel))\n javaStorageLevel = self._java_matrix_wrapper._sc._getJavaStorageLevel(storageLevel)\n self._java_matrix_wrapper.call(\"persist\", javaStorageLevel)\n return self", "response": "Persist the underlying RDD with the specified storage level."} {"instruction": "Create a Python 3 function for\nadding two block matrices together and returns a new matrix with the result.", "response": "def add(self, other):\n \"\"\"\n Adds two block matrices together. The matrices must have the\n same size and matching `rowsPerBlock` and `colsPerBlock` values.\n If one of the sub matrix blocks that are being added is a\n SparseMatrix, the resulting sub matrix block will also be a\n SparseMatrix, even if it is being added to a DenseMatrix. 
If\n two dense sub matrix blocks are added, the output block will\n also be a DenseMatrix.\n\n >>> dm1 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])\n >>> dm2 = Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12])\n >>> sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 1, 2], [7, 11, 12])\n >>> blocks1 = sc.parallelize([((0, 0), dm1), ((1, 0), dm2)])\n >>> blocks2 = sc.parallelize([((0, 0), dm1), ((1, 0), dm2)])\n >>> blocks3 = sc.parallelize([((0, 0), sm), ((1, 0), dm2)])\n >>> mat1 = BlockMatrix(blocks1, 3, 2)\n >>> mat2 = BlockMatrix(blocks2, 3, 2)\n >>> mat3 = BlockMatrix(blocks3, 3, 2)\n\n >>> mat1.add(mat2).toLocalMatrix()\n DenseMatrix(6, 2, [2.0, 4.0, 6.0, 14.0, 16.0, 18.0, 8.0, 10.0, 12.0, 20.0, 22.0, 24.0], 0)\n\n >>> mat1.add(mat3).toLocalMatrix()\n DenseMatrix(6, 2, [8.0, 2.0, 3.0, 14.0, 16.0, 18.0, 4.0, 16.0, 18.0, 20.0, 22.0, 24.0], 0)\n \"\"\"\n if not isinstance(other, BlockMatrix):\n raise TypeError(\"Other should be a BlockMatrix, got %s\" % type(other))\n\n other_java_block_matrix = other._java_matrix_wrapper._java_model\n java_block_matrix = self._java_matrix_wrapper.call(\"add\", other_java_block_matrix)\n return BlockMatrix(java_block_matrix, self.rowsPerBlock, self.colsPerBlock)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef transpose(self):\n java_transposed_matrix = self._java_matrix_wrapper.call(\"transpose\")\n return BlockMatrix(java_transposed_matrix, self.colsPerBlock, self.rowsPerBlock)", "response": "Returns a new BlockMatrix with the same underlying data."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _vector_size(v):\n if isinstance(v, Vector):\n return len(v)\n elif type(v) in (array.array, list, tuple, xrange):\n return len(v)\n elif type(v) == np.ndarray:\n if v.ndim == 1 or (v.ndim == 2 and v.shape[1] == 1):\n return len(v)\n else:\n raise ValueError(\"Cannot treat an ndarray of shape %s as a vector\" % str(v.shape))\n elif _have_scipy and 
scipy.sparse.issparse(v):\n assert v.shape[1] == 1, \"Expected column vector\"\n return v.shape[0]\n else:\n raise TypeError(\"Cannot treat type %s as a vector\" % type(v))", "response": "Returns the size of the vector."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef parse(s):\n start = s.find('[')\n if start == -1:\n raise ValueError(\"Array should start with '['.\")\n end = s.find(']')\n if end == -1:\n raise ValueError(\"Array should end with ']'.\")\n s = s[start + 1: end]\n\n try:\n values = [float(val) for val in s.split(',') if val]\n except ValueError:\n raise ValueError(\"Unable to parse values from %s\" % s)\n return DenseVector(values)", "response": "Parse string representation back into the DenseVector."} {"instruction": "Given the following Python 3 function, write the documentation\ndef dot(self, other):\n if type(other) == np.ndarray:\n if other.ndim > 1:\n assert len(self) == other.shape[0], \"dimension mismatch\"\n return np.dot(self.array, other)\n elif _have_scipy and scipy.sparse.issparse(other):\n assert len(self) == other.shape[0], \"dimension mismatch\"\n return other.transpose().dot(self.toArray())\n else:\n assert len(self) == _vector_size(other), \"dimension mismatch\"\n if isinstance(other, SparseVector):\n return other.dot(self)\n elif isinstance(other, Vector):\n return np.dot(self.toArray(), other.toArray())\n else:\n return np.dot(self.toArray(), other)", "response": "Compute the dot product of two Vectors."} {"instruction": "Can you implement a function in Python 3 that\nreturns the squared distance between two Vectors.", "response": "def squared_distance(self, other):\n \"\"\"\n Squared distance of two Vectors.\n\n >>> dense1 = DenseVector(array.array('d', [1., 2.]))\n >>> dense1.squared_distance(dense1)\n 0.0\n >>> dense2 = np.array([2., 1.])\n >>> dense1.squared_distance(dense2)\n 2.0\n >>> dense3 = [2., 1.]\n >>> dense1.squared_distance(dense3)\n 2.0\n >>> sparse1 = SparseVector(2, [0, 
1], [2., 1.])\n >>> dense1.squared_distance(sparse1)\n 2.0\n >>> dense1.squared_distance([1.,])\n Traceback (most recent call last):\n ...\n AssertionError: dimension mismatch\n >>> dense1.squared_distance(SparseVector(1, [0,], [1.,]))\n Traceback (most recent call last):\n ...\n AssertionError: dimension mismatch\n \"\"\"\n assert len(self) == _vector_size(other), \"dimension mismatch\"\n if isinstance(other, SparseVector):\n return other.squared_distance(self)\n elif _have_scipy and scipy.sparse.issparse(other):\n return _convert_to_vector(other).squared_distance(self)\n\n if isinstance(other, Vector):\n other = other.toArray()\n elif not isinstance(other, np.ndarray):\n other = np.array(other)\n diff = self.toArray() - other\n return np.dot(diff, diff)"} {"instruction": "Implement a function in Python 3 to\nparse string representation back into the SparseVector.", "response": "def parse(s):\n \"\"\"\n Parse string representation back into the SparseVector.\n\n >>> SparseVector.parse(' (4, [0,1 ],[ 4.0,5.0] )')\n SparseVector(4, {0: 4.0, 1: 5.0})\n \"\"\"\n start = s.find('(')\n if start == -1:\n raise ValueError(\"Tuple should start with '('\")\n end = s.find(')')\n if end == -1:\n raise ValueError(\"Tuple should end with ')'\")\n s = s[start + 1: end].strip()\n\n size = s[: s.find(',')]\n try:\n size = int(size)\n except ValueError:\n raise ValueError(\"Cannot parse size %s.\" % size)\n\n ind_start = s.find('[')\n if ind_start == -1:\n raise ValueError(\"Indices array should start with '['.\")\n ind_end = s.find(']')\n if ind_end == -1:\n raise ValueError(\"Indices array should end with ']'\")\n new_s = s[ind_start + 1: ind_end]\n ind_list = new_s.split(',')\n try:\n indices = [int(ind) for ind in ind_list if ind]\n except ValueError:\n raise ValueError(\"Unable to parse indices from %s.\" % new_s)\n s = s[ind_end + 1:].strip()\n\n val_start = s.find('[')\n if val_start == -1:\n raise ValueError(\"Values array should start with '['.\")\n val_end = s.find(']')\n 
if val_end == -1:\n raise ValueError(\"Values array should end with ']'.\")\n val_list = s[val_start + 1: val_end].split(',')\n try:\n values = [float(val) for val in val_list if val]\n except ValueError:\n raise ValueError(\"Unable to parse values from %s.\" % s)\n return SparseVector(size, indices, values)"} {"instruction": "Can you implement a function in Python 3 that\ndots product of two SparseVector objects.", "response": "def dot(self, other):\n \"\"\"\n Dot product with a SparseVector or 1- or 2-dimensional Numpy array.\n\n >>> a = SparseVector(4, [1, 3], [3.0, 4.0])\n >>> a.dot(a)\n 25.0\n >>> a.dot(array.array('d', [1., 2., 3., 4.]))\n 22.0\n >>> b = SparseVector(4, [2], [1.0])\n >>> a.dot(b)\n 0.0\n >>> a.dot(np.array([[1, 1], [2, 2], [3, 3], [4, 4]]))\n array([ 22., 22.])\n >>> a.dot([1., 2., 3.])\n Traceback (most recent call last):\n ...\n AssertionError: dimension mismatch\n >>> a.dot(np.array([1., 2.]))\n Traceback (most recent call last):\n ...\n AssertionError: dimension mismatch\n >>> a.dot(DenseVector([1., 2.]))\n Traceback (most recent call last):\n ...\n AssertionError: dimension mismatch\n >>> a.dot(np.zeros((3, 2)))\n Traceback (most recent call last):\n ...\n AssertionError: dimension mismatch\n \"\"\"\n\n if isinstance(other, np.ndarray):\n if other.ndim not in [2, 1]:\n raise ValueError(\"Cannot call dot with %d-dimensional array\" % other.ndim)\n assert len(self) == other.shape[0], \"dimension mismatch\"\n return np.dot(self.values, other[self.indices])\n\n assert len(self) == _vector_size(other), \"dimension mismatch\"\n\n if isinstance(other, DenseVector):\n return np.dot(other.array[self.indices], self.values)\n\n elif isinstance(other, SparseVector):\n # Find out common indices.\n self_cmind = np.in1d(self.indices, other.indices, assume_unique=True)\n self_values = self.values[self_cmind]\n if self_values.size == 0:\n return 0.0\n else:\n other_cmind = np.in1d(other.indices, self.indices, assume_unique=True)\n return 
np.dot(self_values, other.values[other_cmind])\n\n else:\n return self.dot(_convert_to_vector(other))"} {"instruction": "Implement a function in Python 3 to\nreturn the squared distance between two SparseVectors or 1 - dimensional NumPy arrays.", "response": "def squared_distance(self, other):\n \"\"\"\n Squared distance from a SparseVector or 1-dimensional NumPy array.\n\n >>> a = SparseVector(4, [1, 3], [3.0, 4.0])\n >>> a.squared_distance(a)\n 0.0\n >>> a.squared_distance(array.array('d', [1., 2., 3., 4.]))\n 11.0\n >>> a.squared_distance(np.array([1., 2., 3., 4.]))\n 11.0\n >>> b = SparseVector(4, [2], [1.0])\n >>> a.squared_distance(b)\n 26.0\n >>> b.squared_distance(a)\n 26.0\n >>> b.squared_distance([1., 2.])\n Traceback (most recent call last):\n ...\n AssertionError: dimension mismatch\n >>> b.squared_distance(SparseVector(3, [1,], [1.0,]))\n Traceback (most recent call last):\n ...\n AssertionError: dimension mismatch\n \"\"\"\n assert len(self) == _vector_size(other), \"dimension mismatch\"\n\n if isinstance(other, np.ndarray) or isinstance(other, DenseVector):\n if isinstance(other, np.ndarray) and other.ndim != 1:\n raise Exception(\"Cannot call squared_distance with %d-dimensional array\" %\n other.ndim)\n if isinstance(other, DenseVector):\n other = other.array\n sparse_ind = np.zeros(other.size, dtype=bool)\n sparse_ind[self.indices] = True\n dist = other[sparse_ind] - self.values\n result = np.dot(dist, dist)\n\n other_ind = other[~sparse_ind]\n result += np.dot(other_ind, other_ind)\n return result\n\n elif isinstance(other, SparseVector):\n result = 0.0\n i, j = 0, 0\n while i < len(self.indices) and j < len(other.indices):\n if self.indices[i] == other.indices[j]:\n diff = self.values[i] - other.values[j]\n result += diff * diff\n i += 1\n j += 1\n elif self.indices[i] < other.indices[j]:\n result += self.values[i] * self.values[i]\n i += 1\n else:\n result += other.values[j] * other.values[j]\n j += 1\n while i < len(self.indices):\n result += 
self.values[i] * self.values[i]\n i += 1\n while j < len(other.indices):\n result += other.values[j] * other.values[j]\n j += 1\n return result\n else:\n return self.squared_distance(_convert_to_vector(other))"} {"instruction": "Can you write a function in Python 3 where it\nreturns a copy of this SparseVector as a 1 - dimensional NumPy array.", "response": "def toArray(self):\n \"\"\"\n Returns a copy of this SparseVector as a 1-dimensional NumPy array.\n \"\"\"\n arr = np.zeros((self.size,), dtype=np.float64)\n arr[self.indices] = self.values\n return arr"} {"instruction": "Implement a function in Python 3 to\nconvert this vector to the new mllib - local representation.", "response": "def asML(self):\n \"\"\"\n Convert this vector to the new mllib-local representation.\n This does NOT copy the data; it copies references.\n\n :return: :py:class:`pyspark.ml.linalg.SparseVector`\n\n .. versionadded:: 2.0.0\n \"\"\"\n return newlinalg.SparseVector(self.size, self.indices, self.values)"} {"instruction": "Can you implement a function in Python 3 that\ncreates a dense vector of 64 - bit floats from a Python list or numbers.", "response": "def dense(*elements):\n \"\"\"\n Create a dense vector of 64-bit floats from a Python list or numbers.\n\n >>> Vectors.dense([1, 2, 3])\n DenseVector([1.0, 2.0, 3.0])\n >>> Vectors.dense(1.0, 2.0)\n DenseVector([1.0, 2.0])\n \"\"\"\n if len(elements) == 1 and not isinstance(elements[0], (float, int, long)):\n # it's list, numpy.array or other iterable object.\n elements = elements[0]\n return DenseVector(elements)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef fromML(vec):\n if isinstance(vec, newlinalg.DenseVector):\n return DenseVector(vec.array)\n elif isinstance(vec, newlinalg.SparseVector):\n return SparseVector(vec.size, vec.indices, vec.values)\n else:\n raise TypeError(\"Unsupported vector type %s\" % type(vec))", "response": "Converts a vector from the new mllib - local representation."} 
{"instruction": "Create a Python 3 function for\nsquaring distance between two sets of vectors.", "response": "def squared_distance(v1, v2):\n \"\"\"\n Squared distance between two vectors.\n a and b can be of type SparseVector, DenseVector, np.ndarray\n or array.array.\n\n >>> a = Vectors.sparse(4, [(0, 1), (3, 4)])\n >>> b = Vectors.dense([2, 5, 4, 1])\n >>> a.squared_distance(b)\n 51.0\n \"\"\"\n v1, v2 = _convert_to_vector(v1), _convert_to_vector(v2)\n return v1.squared_distance(v2)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef parse(s):\n if s.find('(') == -1 and s.find('[') != -1:\n return DenseVector.parse(s)\n elif s.find('(') != -1:\n return SparseVector.parse(s)\n else:\n raise ValueError(\n \"Cannot find tokens '[' or '(' from the input string.\")", "response": "Parse a string representation back into the Vector."} {"instruction": "Explain what the following Python 3 code does\ndef _equals(v1_indices, v1_values, v2_indices, v2_values):\n v1_size = len(v1_values)\n v2_size = len(v2_values)\n k1 = 0\n k2 = 0\n all_equal = True\n while all_equal:\n while k1 < v1_size and v1_values[k1] == 0:\n k1 += 1\n while k2 < v2_size and v2_values[k2] == 0:\n k2 += 1\n\n if k1 >= v1_size or k2 >= v2_size:\n return k1 >= v1_size and k2 >= v2_size\n\n all_equal = v1_indices[k1] == v2_indices[k2] and v1_values[k1] == v2_values[k2]\n k1 += 1\n k2 += 1\n return all_equal", "response": "Check equality between sparse vectors v1_indices and v2_indices assume to be strictly increasing."} {"instruction": "How would you explain what the following Python 3 function does\ndef _convert_to_array(array_like, dtype):\n if isinstance(array_like, bytes):\n return np.frombuffer(array_like, dtype=dtype)\n return np.asarray(array_like, dtype=dtype)", "response": "Convert matrix attributes which are array - like or buffer to array."} {"instruction": "Can you write a function in Python 3 where it\nreturns an array of the ISO - 8601 related data.", 
"response": "def toArray(self):\n \"\"\"\n Return an numpy.ndarray\n\n >>> m = DenseMatrix(2, 2, range(4))\n >>> m.toArray()\n array([[ 0., 2.],\n [ 1., 3.]])\n \"\"\"\n if self.isTransposed:\n return np.asfortranarray(\n self.values.reshape((self.numRows, self.numCols)))\n else:\n return self.values.reshape((self.numRows, self.numCols), order='F')"} {"instruction": "Can you create a Python 3 function that\nconverts this matrix to the new mllib - local representation.", "response": "def asML(self):\n \"\"\"\n Convert this matrix to the new mllib-local representation.\n This does NOT copy the data; it copies references.\n\n :return: :py:class:`pyspark.ml.linalg.DenseMatrix`\n\n .. versionadded:: 2.0.0\n \"\"\"\n return newlinalg.DenseMatrix(self.numRows, self.numCols, self.values, self.isTransposed)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef toArray(self):\n A = np.zeros((self.numRows, self.numCols), dtype=np.float64, order='F')\n for k in xrange(self.colPtrs.size - 1):\n startptr = self.colPtrs[k]\n endptr = self.colPtrs[k + 1]\n if self.isTransposed:\n A[k, self.rowIndices[startptr:endptr]] = self.values[startptr:endptr]\n else:\n A[self.rowIndices[startptr:endptr], k] = self.values[startptr:endptr]\n return A", "response": "Return an array of the ISO - 8601 entries."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef asML(self):\n return newlinalg.SparseMatrix(self.numRows, self.numCols, self.colPtrs, self.rowIndices,\n self.values, self.isTransposed)", "response": "Convert this matrix to the new mllib - local representation."} {"instruction": "Given the following Python 3 function, write the documentation\ndef fromML(mat):\n if isinstance(mat, newlinalg.DenseMatrix):\n return DenseMatrix(mat.numRows, mat.numCols, mat.values, mat.isTransposed)\n elif isinstance(mat, newlinalg.SparseMatrix):\n return SparseMatrix(mat.numRows, mat.numCols, mat.colPtrs, mat.rowIndices,\n mat.values, 
mat.isTransposed)\n else:\n raise TypeError(\"Unsupported matrix type %s\" % type(mat))", "response": "Converts a matrix from the new mllib - local representation."} {"instruction": "How would you code a function in Python 3 to\ngive a large dataset and an item approximately find at most k items which have the nearest distance to the item.", "response": "def approxNearestNeighbors(self, dataset, key, numNearestNeighbors, distCol=\"distCol\"):\n \"\"\"\n Given a large dataset and an item, approximately find at most k items which have the\n closest distance to the item. If the :py:attr:`outputCol` is missing, the method will\n transform the data; if the :py:attr:`outputCol` exists, it will use that. This allows\n caching of the transformed data when necessary.\n\n .. note:: This method is experimental and will likely change behavior in the next release.\n\n :param dataset: The dataset to search for nearest neighbors of the key.\n :param key: Feature vector representing the item to search for.\n :param numNearestNeighbors: The maximum number of nearest neighbors.\n :param distCol: Output column for storing the distance between each result row and the key.\n Use \"distCol\" as default value if it's not specified.\n :return: A dataset containing at most k items closest to the key. 
A column \"distCol\" is\n added to show the distance between each row and the key.\n \"\"\"\n return self._call_java(\"approxNearestNeighbors\", dataset, key, numNearestNeighbors,\n distCol)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef approxSimilarityJoin(self, datasetA, datasetB, threshold, distCol=\"distCol\"):\n threshold = TypeConverters.toFloat(threshold)\n return self._call_java(\"approxSimilarityJoin\", datasetA, datasetB, threshold, distCol)", "response": "This method joins two datasets to approximately find all pairs of rows whose distance is smaller than threshold."} {"instruction": "Given the following Python 3 function, write the documentation\ndef from_labels(cls, labels, inputCol, outputCol=None, handleInvalid=None):\n sc = SparkContext._active_spark_context\n java_class = sc._gateway.jvm.java.lang.String\n jlabels = StringIndexerModel._new_java_array(labels, java_class)\n model = StringIndexerModel._create_from_java_class(\n \"org.apache.spark.ml.feature.StringIndexerModel\", jlabels)\n model.setInputCol(inputCol)\n if outputCol is not None:\n model.setOutputCol(outputCol)\n if handleInvalid is not None:\n model.setHandleInvalid(handleInvalid)\n return model", "response": "Construct the model directly from an array of label strings."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef from_arrays_of_labels(cls, arrayOfLabels, inputCols, outputCols=None,\n handleInvalid=None):\n \"\"\"\n Construct the model directly from an array of array of label strings,\n requires an active SparkContext.\n \"\"\"\n sc = SparkContext._active_spark_context\n java_class = sc._gateway.jvm.java.lang.String\n jlabels = StringIndexerModel._new_java_array(arrayOfLabels, java_class)\n model = StringIndexerModel._create_from_java_class(\n \"org.apache.spark.ml.feature.StringIndexerModel\", jlabels)\n model.setInputCols(inputCols)\n if outputCols is not None:\n 
model.setOutputCols(outputCols)\n if handleInvalid is not None:\n model.setHandleInvalid(handleInvalid)\n return model", "response": "Construct a model directly from an array of array of label strings."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef setParams(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive=False,\n locale=None):\n \"\"\"\n setParams(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive=false, \\\n locale=None)\n Sets params for this StopWordRemover.\n \"\"\"\n kwargs = self._input_kwargs\n return self._set(**kwargs)", "response": "Sets the parameters for this StopWordRemover."} {"instruction": "Write a Python 3 script for\nloading the default stop words for the given language.", "response": "def loadDefaultStopWords(language):\n \"\"\"\n Loads the default stop words for the given language.\n Supported languages: danish, dutch, english, finnish, french, german, hungarian,\n italian, norwegian, portuguese, russian, spanish, swedish, turkish\n \"\"\"\n stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWordsRemover\n return list(stopWordsObj.loadDefaultStopWords(language))"} {"instruction": "How would you explain what the following Python 3 function does\ndef findSynonyms(self, word, num):\n if not isinstance(word, basestring):\n word = _convert_to_vector(word)\n return self._call_java(\"findSynonyms\", word, num)", "response": "Find num number of words closest in similarity to word."} {"instruction": "How would you implement a function in Python 3 that\nfinds num number of words closest in similarity to word.", "response": "def findSynonymsArray(self, word, num):\n \"\"\"\n Find \"num\" number of words closest in similarity to \"word\".\n word can be a string or vector representation.\n Returns an array with two fields word and similarity (which\n gives the cosine similarity).\n \"\"\"\n if not isinstance(word, basestring):\n word = _convert_to_vector(word)\n tuples = 
self._java_obj.findSynonymsArray(word, num)\n return list(map(lambda st: (st._1(), st._2()), list(tuples)))"} {"instruction": "Explain what the following Python 3 code does\ndef install_exception_handler():\n original = py4j.protocol.get_return_value\n # The original `get_return_value` is not patched, it's idempotent.\n patched = capture_sql_exception(original)\n # only patch the one used in py4j.java_gateway (call Java API)\n py4j.java_gateway.get_return_value = patched", "response": "Installs an exception handler into the Py4j environment."} {"instruction": "Make a summary of the following Python 3 code\ndef toJArray(gateway, jtype, arr):\n jarr = gateway.new_array(jtype, len(arr))\n for i in range(0, len(arr)):\n jarr[i] = arr[i]\n return jarr", "response": "Convert python list to java type array"} {"instruction": "Create a Python 3 function for\nraising ImportError if minimum version of Pandas is not installed.", "response": "def require_minimum_pandas_version():\n \"\"\" Raise ImportError if minimum version of Pandas is not installed\n \"\"\"\n # TODO(HyukjinKwon): Relocate and deduplicate the version specification.\n minimum_pandas_version = \"0.19.2\"\n\n from distutils.version import LooseVersion\n try:\n import pandas\n have_pandas = True\n except ImportError:\n have_pandas = False\n if not have_pandas:\n raise ImportError(\"Pandas >= %s must be installed; however, \"\n \"it was not found.\" % minimum_pandas_version)\n if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):\n raise ImportError(\"Pandas >= %s must be installed; however, \"\n \"your version was %s.\" % (minimum_pandas_version, pandas.__version__))"} {"instruction": "How would you code a function in Python 3 to\nraise ImportError if minimum version of pyarrow is not installed.", "response": "def require_minimum_pyarrow_version():\n \"\"\" Raise ImportError if minimum version of pyarrow is not installed\n \"\"\"\n # TODO(HyukjinKwon): Relocate and deduplicate the version 
specification.\n minimum_pyarrow_version = \"0.12.1\"\n\n from distutils.version import LooseVersion\n try:\n import pyarrow\n have_arrow = True\n except ImportError:\n have_arrow = False\n if not have_arrow:\n raise ImportError(\"PyArrow >= %s must be installed; however, \"\n \"it was not found.\" % minimum_pyarrow_version)\n if LooseVersion(pyarrow.__version__) < LooseVersion(minimum_pyarrow_version):\n raise ImportError(\"PyArrow >= %s must be installed; however, \"\n \"your version was %s.\" % (minimum_pyarrow_version, pyarrow.__version__))"} {"instruction": "Explain what the following Python 3 code does\ndef launch_gateway(conf=None, popen_kwargs=None):\n if \"PYSPARK_GATEWAY_PORT\" in os.environ:\n gateway_port = int(os.environ[\"PYSPARK_GATEWAY_PORT\"])\n gateway_secret = os.environ[\"PYSPARK_GATEWAY_SECRET\"]\n # Process already exists\n proc = None\n else:\n SPARK_HOME = _find_spark_home()\n # Launch the Py4j gateway using Spark's run command so that we pick up the\n # proper classpath and settings from spark-env.sh\n on_windows = platform.system() == \"Windows\"\n script = \"./bin/spark-submit.cmd\" if on_windows else \"./bin/spark-submit\"\n command = [os.path.join(SPARK_HOME, script)]\n if conf:\n for k, v in conf.getAll():\n command += ['--conf', '%s=%s' % (k, v)]\n submit_args = os.environ.get(\"PYSPARK_SUBMIT_ARGS\", \"pyspark-shell\")\n if os.environ.get(\"SPARK_TESTING\"):\n submit_args = ' '.join([\n \"--conf spark.ui.enabled=false\",\n submit_args\n ])\n command = command + shlex.split(submit_args)\n\n # Create a temporary directory where the gateway server should write the connection\n # information.\n conn_info_dir = tempfile.mkdtemp()\n try:\n fd, conn_info_file = tempfile.mkstemp(dir=conn_info_dir)\n os.close(fd)\n os.unlink(conn_info_file)\n\n env = dict(os.environ)\n env[\"_PYSPARK_DRIVER_CONN_INFO_PATH\"] = conn_info_file\n\n # Launch the Java gateway.\n popen_kwargs = {} if popen_kwargs is None else popen_kwargs\n # We open a pipe to 
stdin so that the Java gateway can die when the pipe is broken\n popen_kwargs['stdin'] = PIPE\n # We always set the necessary environment variables.\n popen_kwargs['env'] = env\n if not on_windows:\n # Don't send ctrl-c / SIGINT to the Java gateway:\n def preexec_func():\n signal.signal(signal.SIGINT, signal.SIG_IGN)\n popen_kwargs['preexec_fn'] = preexec_func\n proc = Popen(command, **popen_kwargs)\n else:\n # preexec_fn not supported on Windows\n proc = Popen(command, **popen_kwargs)\n\n # Wait for the file to appear, or for the process to exit, whichever happens first.\n while not proc.poll() and not os.path.isfile(conn_info_file):\n time.sleep(0.1)\n\n if not os.path.isfile(conn_info_file):\n raise Exception(\"Java gateway process exited before sending its port number\")\n\n with open(conn_info_file, \"rb\") as info:\n gateway_port = read_int(info)\n gateway_secret = UTF8Deserializer().loads(info)\n finally:\n shutil.rmtree(conn_info_dir)\n\n # In Windows, ensure the Java child processes do not linger after Python has exited.\n # In UNIX-based systems, the child process can kill itself on broken pipe (i.e. when\n # the parent process' stdin sends an EOF). In Windows, however, this is not possible\n # because java.lang.Process reads directly from the parent process' stdin, contending\n # with any opportunity to read an EOF from the parent. Note that this is only best\n # effort and will not take effect if the python process is violently terminated.\n if on_windows:\n # In Windows, the child process here is \"spark-submit.cmd\", not the JVM itself\n # (because the UNIX \"exec\" command is not available). This means we cannot simply\n # call proc.kill(), which kills only the \"spark-submit.cmd\" process but not the\n # JVMs. 
Instead, we use \"taskkill\" with the tree-kill option \"/t\" to terminate all\n # child processes in the tree (http://technet.microsoft.com/en-us/library/bb491009.aspx)\n def killChild():\n Popen([\"cmd\", \"/c\", \"taskkill\", \"/f\", \"/t\", \"/pid\", str(proc.pid)])\n atexit.register(killChild)\n\n # Connect to the gateway\n gateway = JavaGateway(\n gateway_parameters=GatewayParameters(port=gateway_port, auth_token=gateway_secret,\n auto_convert=True))\n # Store a reference to the Popen object for use by the caller (e.g., in reading stdout/stderr)\n gateway.proc = proc\n\n # Import the classes used by PySpark\n java_import(gateway.jvm, \"org.apache.spark.SparkConf\")\n java_import(gateway.jvm, \"org.apache.spark.api.java.*\")\n java_import(gateway.jvm, \"org.apache.spark.api.python.*\")\n java_import(gateway.jvm, \"org.apache.spark.ml.python.*\")\n java_import(gateway.jvm, \"org.apache.spark.mllib.api.python.*\")\n # TODO(davies): move into sql\n java_import(gateway.jvm, \"org.apache.spark.sql.*\")\n java_import(gateway.jvm, \"org.apache.spark.sql.api.python.*\")\n java_import(gateway.jvm, \"org.apache.spark.sql.hive.*\")\n java_import(gateway.jvm, \"scala.Tuple2\")\n\n return gateway", "response": "Launch the jvm gateway using Spark s submit command."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _do_server_auth(conn, auth_secret):\n write_with_length(auth_secret.encode(\"utf-8\"), conn)\n conn.flush()\n reply = UTF8Deserializer().loads(conn)\n if reply != \"ok\":\n conn.close()\n raise Exception(\"Unexpected reply from iterator server.\")", "response": "Performs the server authentication protocol on the given conn."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef local_connect_and_auth(port, auth_secret):\n sock = None\n errors = []\n # Support for both IPv4 and IPv6.\n # On most of IPv6-ready systems, IPv6 will take precedence.\n for res in socket.getaddrinfo(\"127.0.0.1\", port, 
socket.AF_UNSPEC, socket.SOCK_STREAM):\n af, socktype, proto, _, sa = res\n try:\n sock = socket.socket(af, socktype, proto)\n sock.settimeout(15)\n sock.connect(sa)\n sockfile = sock.makefile(\"rwb\", 65536)\n _do_server_auth(sockfile, auth_secret)\n return (sockfile, sock)\n except socket.error as e:\n emsg = _exception_message(e)\n errors.append(\"tried to connect to %s, but an error occured: %s\" % (sa, emsg))\n sock.close()\n sock = None\n raise Exception(\"could not open socket: %s\" % errors)", "response": "Connect to local host authenticate with it and return a tuple with sockfile sock and auth_secret."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef ensure_callback_server_started(gw):\n\n # getattr will fallback to JVM, so we cannot test by hasattr()\n if \"_callback_server\" not in gw.__dict__ or gw._callback_server is None:\n gw.callback_server_parameters.eager_load = True\n gw.callback_server_parameters.daemonize = True\n gw.callback_server_parameters.daemonize_connections = True\n gw.callback_server_parameters.port = 0\n gw.start_callback_server(gw.callback_server_parameters)\n cbport = gw._callback_server.server_socket.getsockname()[1]\n gw._callback_server.port = cbport\n # gateway with real port\n gw._python_proxy_port = gw._callback_server.port\n # get the GatewayServer object in JVM by ID\n jgws = JavaObject(\"GATEWAY_SERVER\", gw._gateway_client)\n # update the port of CallbackClient with real port\n jgws.resetCallbackClient(jgws.getCallbackClient().getAddress(), gw._python_proxy_port)", "response": "Ensure that callback server is started."} {"instruction": "Write a Python 3 function for\ncalculating URL contributions to the rank of other URLs.", "response": "def computeContribs(urls, rank):\n \"\"\"Calculates URL contributions to the rank of other URLs.\"\"\"\n num_urls = len(urls)\n for url in urls:\n yield (url, rank / num_urls)"} {"instruction": "How would you code a function in Python 3 to\nreturn the 
summary of the current model.", "response": "def summary(self):\n \"\"\"\n Gets summary (e.g. cluster assignments, cluster sizes) of the model trained on the\n training set. An exception is thrown if no summary exists.\n \"\"\"\n if self.hasSummary:\n return GaussianMixtureSummary(super(GaussianMixtureModel, self).summary)\n else:\n raise RuntimeError(\"No training summary available for this %s\" %\n self.__class__.__name__)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef summary(self):\n if self.hasSummary:\n return KMeansSummary(super(KMeansModel, self).summary)\n else:\n raise RuntimeError(\"No training summary available for this %s\" %\n self.__class__.__name__)", "response": "Returns the training set summary."} {"instruction": "Can you implement a function in Python 3 that\nreturns the training set summary.", "response": "def summary(self):\n \"\"\"\n Gets summary (e.g. cluster assignments, cluster sizes) of the model trained on the\n training set. An exception is thrown if no summary exists.\n \"\"\"\n if self.hasSummary:\n return BisectingKMeansSummary(super(BisectingKMeansModel, self).summary)\n else:\n raise RuntimeError(\"No training summary available for this %s\" %\n self.__class__.__name__)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef imageSchema(self):\n\n if self._imageSchema is None:\n ctx = SparkContext._active_spark_context\n jschema = ctx._jvm.org.apache.spark.ml.image.ImageSchema.imageSchema()\n self._imageSchema = _parse_datatype_json_string(jschema.json())\n return self._imageSchema", "response": "Returns the image schema."} {"instruction": "Can you write a function in Python 3 where it\nreturns the OpenCV type mapping supported.", "response": "def ocvTypes(self):\n \"\"\"\n Returns the OpenCV type mapping supported.\n\n :return: a dictionary containing the OpenCV type mapping supported.\n\n .. 
versionadded:: 2.3.0\n \"\"\"\n\n if self._ocvTypes is None:\n ctx = SparkContext._active_spark_context\n self._ocvTypes = dict(ctx._jvm.org.apache.spark.ml.image.ImageSchema.javaOcvTypes())\n return self._ocvTypes"} {"instruction": "Given the following Python 3 function, write the documentation\ndef columnSchema(self):\n\n if self._columnSchema is None:\n ctx = SparkContext._active_spark_context\n jschema = ctx._jvm.org.apache.spark.ml.image.ImageSchema.columnSchema()\n self._columnSchema = _parse_datatype_json_string(jschema.json())\n return self._columnSchema", "response": "Returns the schema for the image column."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef imageFields(self):\n\n if self._imageFields is None:\n ctx = SparkContext._active_spark_context\n self._imageFields = list(ctx._jvm.org.apache.spark.ml.image.ImageSchema.imageFields())\n return self._imageFields", "response": "Returns a list of field names of image columns."} {"instruction": "Make a summary of the following Python 3 code\ndef undefinedImageType(self):\n\n if self._undefinedImageType is None:\n ctx = SparkContext._active_spark_context\n self._undefinedImageType = \\\n ctx._jvm.org.apache.spark.ml.image.ImageSchema.undefinedImageType()\n return self._undefinedImageType", "response": "Returns the name of undefined image type for the invalid image."} {"instruction": "Explain what the following Python 3 code does\ndef toNDArray(self, image):\n\n if not isinstance(image, Row):\n raise TypeError(\n \"image argument should be pyspark.sql.types.Row; however, \"\n \"it got [%s].\" % type(image))\n\n if any(not hasattr(image, f) for f in self.imageFields):\n raise ValueError(\n \"image argument should have attributes specified in \"\n \"ImageSchema.imageSchema [%s].\" % \", \".join(self.imageFields))\n\n height = image.height\n width = image.width\n nChannels = image.nChannels\n return np.ndarray(\n shape=(height, width, nChannels),\n dtype=np.uint8,\n 
buffer=image.data,\n strides=(width * nChannels, nChannels, 1))", "response": "Converts an image to an array with metadata."} {"instruction": "Can you write a function in Python 3 where it\nconverts an array with metadata to a two - dimensional image.", "response": "def toImage(self, array, origin=\"\"):\n \"\"\"\n Converts an array with metadata to a two-dimensional image.\n\n :param `numpy.ndarray` array: The array to convert to image.\n :param str origin: Path to the image, optional.\n :return: a :class:`Row` that is a two dimensional image.\n\n .. versionadded:: 2.3.0\n \"\"\"\n\n if not isinstance(array, np.ndarray):\n raise TypeError(\n \"array argument should be numpy.ndarray; however, it got [%s].\" % type(array))\n\n if array.ndim != 3:\n raise ValueError(\"Invalid array shape\")\n\n height, width, nChannels = array.shape\n ocvTypes = ImageSchema.ocvTypes\n if nChannels == 1:\n mode = ocvTypes[\"CV_8UC1\"]\n elif nChannels == 3:\n mode = ocvTypes[\"CV_8UC3\"]\n elif nChannels == 4:\n mode = ocvTypes[\"CV_8UC4\"]\n else:\n raise ValueError(\"Invalid number of channels\")\n\n # Running `bytearray(numpy.array([1]))` fails in specific Python versions\n # with a specific Numpy version, for example in Python 3.6.0 and NumPy 1.13.3.\n # Here, it avoids it by converting it to bytes.\n if LooseVersion(np.__version__) >= LooseVersion('1.9'):\n data = bytearray(array.astype(dtype=np.uint8).ravel().tobytes())\n else:\n # Numpy prior to 1.9 don't have `tobytes` method.\n data = bytearray(array.astype(dtype=np.uint8).ravel())\n\n # Creating new Row with _create_row(), because Row(name = value, ... 
)\n # orders fields by name, which conflicts with expected schema order\n # when the new DataFrame is created by UDF\n return _create_row(self.imageFields,\n [origin, height, width, nChannels, mode, data])"} {"instruction": "Can you implement a function in Python 3 that\nreads the images from the local or remote source directory.", "response": "def readImages(self, path, recursive=False, numPartitions=-1,\n dropImageFailures=False, sampleRatio=1.0, seed=0):\n \"\"\"\n Reads the directory of images from the local or remote source.\n\n .. note:: If multiple jobs are run in parallel with different sampleRatio or recursive flag,\n there may be a race condition where one job overwrites the hadoop configs of another.\n\n .. note:: If sample ratio is less than 1, sampling uses a PathFilter that is efficient but\n potentially non-deterministic.\n\n .. note:: Deprecated in 2.4.0. Use `spark.read.format(\"image\").load(path)` instead and\n this `readImages` will be removed in 3.0.0.\n\n :param str path: Path to the image directory.\n :param bool recursive: Recursive search flag.\n :param int numPartitions: Number of DataFrame partitions.\n :param bool dropImageFailures: Drop the files that are not valid images.\n :param float sampleRatio: Fraction of the images loaded.\n :param int seed: Random number seed.\n :return: a :class:`DataFrame` with a single column of \"images\",\n see ImageSchema for details.\n\n >>> df = ImageSchema.readImages('data/mllib/images/origin/kittens', recursive=True)\n >>> df.count()\n 5\n\n .. versionadded:: 2.3.0\n \"\"\"\n warnings.warn(\"`ImageSchema.readImage` is deprecated. 
\" +\n \"Use `spark.read.format(\\\"image\\\").load(path)` instead.\", DeprecationWarning)\n spark = SparkSession.builder.getOrCreate()\n image_schema = spark._jvm.org.apache.spark.ml.image.ImageSchema\n jsession = spark._jsparkSession\n jresult = image_schema.readImages(path, jsession, recursive, numPartitions,\n dropImageFailures, float(sampleRatio), seed)\n return DataFrame(jresult, spark._wrapped)"} {"instruction": "How would you explain what the following Python 3 function does\ndef _create_from_java_class(cls, java_class, *args):\n java_obj = JavaWrapper._new_java_obj(java_class, *args)\n return cls(java_obj)", "response": "Construct this object from given Java classname and arguments."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _new_java_array(pylist, java_class):\n sc = SparkContext._active_spark_context\n java_array = None\n if len(pylist) > 0 and isinstance(pylist[0], list):\n # If pylist is a 2D array, then a 2D java array will be created.\n # The 2D array is a square, non-jagged 2D array that is big enough for all elements.\n inner_array_length = 0\n for i in xrange(len(pylist)):\n inner_array_length = max(inner_array_length, len(pylist[i]))\n java_array = sc._gateway.new_array(java_class, len(pylist), inner_array_length)\n for i in xrange(len(pylist)):\n for j in xrange(len(pylist[i])):\n java_array[i][j] = pylist[i][j]\n else:\n java_array = sc._gateway.new_array(java_class, len(pylist))\n for i in xrange(len(pylist)):\n java_array[i] = pylist[i]\n return java_array", "response": "Create a Java array of given java_class type from a Python list."} {"instruction": "Write a Python 3 function for\nconverting EPITEXT to EPITEXT.", "response": "def _convert_epytext(line):\n \"\"\"\n >>> _convert_epytext(\"L{A}\")\n :class:`A`\n \"\"\"\n line = line.replace('@', ':')\n for p, sub in RULES:\n line = re.sub(p, sub, line)\n return line"} {"instruction": "Explain what the following Python 3 code does\ndef 
rddToFileName(prefix, suffix, timestamp):\n if isinstance(timestamp, datetime):\n seconds = time.mktime(timestamp.timetuple())\n timestamp = int(seconds * 1000) + timestamp.microsecond // 1000\n if suffix is None:\n return prefix + \"-\" + str(timestamp)\n else:\n return prefix + \"-\" + str(timestamp) + \".\" + suffix", "response": "Convert a RDD to a file name."} {"instruction": "Write a Python 3 function for\nadding a profiler for RDD id", "response": "def add_profiler(self, id, profiler):\n \"\"\" Add a profiler for RDD `id` \"\"\"\n if not self.profilers:\n if self.profile_dump_path:\n atexit.register(self.dump_profiles, self.profile_dump_path)\n else:\n atexit.register(self.show_profiles)\n\n self.profilers.append([id, profiler, False])"} {"instruction": "Can you write a function in Python 3 where it\ndumps the profile stats into a directory path.", "response": "def dump_profiles(self, path):\n \"\"\" Dump the profile stats into directory `path` \"\"\"\n for id, profiler, _ in self.profilers:\n profiler.dump(id, path)\n self.profilers = []"} {"instruction": "Make a summary of the following Python 3 code\ndef show_profiles(self):\n for i, (id, profiler, showed) in enumerate(self.profilers):\n if not showed and profiler:\n profiler.show(id)\n # mark it as showed\n self.profilers[i][2] = True", "response": "Print the profile stats to stdout"} {"instruction": "Create a Python 3 function for\nprinting the profile stats to stdout", "response": "def show(self, id):\n \"\"\" Print the profile stats to stdout, id is the RDD id \"\"\"\n stats = self.stats()\n if stats:\n print(\"=\" * 60)\n print(\"Profile of RDD\" % id)\n print(\"=\" * 60)\n stats.sort_stats(\"time\", \"cumulative\").print_stats()"} {"instruction": "Given the following Python 3 function, write the documentation\ndef dump(self, id, path):\n if not os.path.exists(path):\n os.makedirs(path)\n stats = self.stats()\n if stats:\n p = os.path.join(path, \"rdd_%d.pstats\" % id)\n stats.dump_stats(p)", 
"response": "Dump the profile into path"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef profile(self, func):\n pr = cProfile.Profile()\n pr.runcall(func)\n st = pstats.Stats(pr)\n st.stream = None # make it picklable\n st.strip_dirs()\n\n # Adds a new profile to the existing accumulated value\n self._accumulator.add(st)", "response": "Runs and profiles the method passed in. A profile object is returned."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef getOrCreate(cls, sc):\n if cls._instantiatedContext is None:\n jsqlContext = sc._jvm.SQLContext.getOrCreate(sc._jsc.sc())\n sparkSession = SparkSession(sc, jsqlContext.sparkSession())\n cls(sc, sparkSession, jsqlContext)\n return cls._instantiatedContext", "response": "Get the existing SQLContext or create a new one with given SparkContext."} {"instruction": "Can you implement a function in Python 3 that\nsets the given Spark SQL configuration property.", "response": "def setConf(self, key, value):\n \"\"\"Sets the given Spark SQL configuration property.\n \"\"\"\n self.sparkSession.conf.set(key, value)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef getConf(self, key, defaultValue=_NoValue):\n return self.sparkSession.conf.get(key, defaultValue)", "response": "Returns the value of Spark SQL configuration property for the given key."} {"instruction": "Implement a Python 3 function for\ncreating a DataFrame with all the elements in a range from start to end.", "response": "def range(self, start, end=None, step=1, numPartitions=None):\n \"\"\"\n Create a :class:`DataFrame` with single :class:`pyspark.sql.types.LongType` column named\n ``id``, containing elements in a range from ``start`` to ``end`` (exclusive) with\n step value ``step``.\n\n :param start: the start value\n :param end: the end value (exclusive)\n :param step: the incremental step (default: 1)\n :param numPartitions: the number of partitions 
of the DataFrame\n :return: :class:`DataFrame`\n\n >>> sqlContext.range(1, 7, 2).collect()\n [Row(id=1), Row(id=3), Row(id=5)]\n\n If only one argument is specified, it will be used as the end value.\n\n >>> sqlContext.range(3).collect()\n [Row(id=0), Row(id=1), Row(id=2)]\n \"\"\"\n return self.sparkSession.range(start, end, step, numPartitions)"} {"instruction": "How would you explain what the following Python 3 function does\ndef registerFunction(self, name, f, returnType=None):\n warnings.warn(\n \"Deprecated in 2.3.0. Use spark.udf.register instead.\",\n DeprecationWarning)\n return self.sparkSession.udf.register(name, f, returnType)", "response": "Register a function f with the specified name."} {"instruction": "How would you implement a function in Python 3 that\nregisters a Java function.", "response": "def registerJavaFunction(self, name, javaClassName, returnType=None):\n \"\"\"An alias for :func:`spark.udf.registerJavaFunction`.\n See :meth:`pyspark.sql.UDFRegistration.registerJavaFunction`.\n\n .. note:: Deprecated in 2.3.0. Use :func:`spark.udf.registerJavaFunction` instead.\n \"\"\"\n warnings.warn(\n \"Deprecated in 2.3.0. 
Use spark.udf.registerJavaFunction instead.\",\n DeprecationWarning)\n return self.sparkSession.udf.registerJavaFunction(name, javaClassName, returnType)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True):\n return self.sparkSession.createDataFrame(data, schema, samplingRatio, verifySchema)", "response": "Creates a DataFrame from an RDD of data."} {"instruction": "Create a Python 3 function for\ncreating an external table based on the dataset in a data source.", "response": "def createExternalTable(self, tableName, path=None, source=None, schema=None, **options):\n \"\"\"Creates an external table based on the dataset in a data source.\n\n It returns the DataFrame associated with the external table.\n\n The data source is specified by the ``source`` and a set of ``options``.\n If ``source`` is not specified, the default data source configured by\n ``spark.sql.sources.default`` will be used.\n\n Optionally, a schema can be provided as the schema of the returned :class:`DataFrame` and\n created external table.\n\n :return: :class:`DataFrame`\n \"\"\"\n return self.sparkSession.catalog.createExternalTable(\n tableName, path, source, schema, **options)"} {"instruction": "Write a Python 3 script for\nreturning a DataFrame containing names of tables in the given database.", "response": "def tables(self, dbName=None):\n \"\"\"Returns a :class:`DataFrame` containing names of tables in the given database.\n\n If ``dbName`` is not specified, the current database will be used.\n\n The returned DataFrame has two columns: ``tableName`` and ``isTemporary``\n (a column with :class:`BooleanType` indicating if a table is a temporary one or not).\n\n :param dbName: string, name of the database to use.\n :return: :class:`DataFrame`\n\n >>> sqlContext.registerDataFrameAsTable(df, \"table1\")\n >>> df2 = sqlContext.tables()\n >>> df2.filter(\"tableName = 
'table1'\").first()\n Row(database=u'', tableName=u'table1', isTemporary=True)\n \"\"\"\n if dbName is None:\n return DataFrame(self._ssql_ctx.tables(), self)\n else:\n return DataFrame(self._ssql_ctx.tables(dbName), self)"} {"instruction": "Implement a Python 3 function for\nreturning a list of names of tables in the database dbName.", "response": "def tableNames(self, dbName=None):\n \"\"\"Returns a list of names of tables in the database ``dbName``.\n\n :param dbName: string, name of the database to use. Default to the current database.\n :return: list of table names, in string\n\n >>> sqlContext.registerDataFrameAsTable(df, \"table1\")\n >>> \"table1\" in sqlContext.tableNames()\n True\n >>> \"table1\" in sqlContext.tableNames(\"default\")\n True\n \"\"\"\n if dbName is None:\n return [name for name in self._ssql_ctx.tableNames()]\n else:\n return [name for name in self._ssql_ctx.tableNames(dbName)]"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef streams(self):\n from pyspark.sql.streaming import StreamingQueryManager\n return StreamingQueryManager(self._ssql_ctx.streams())", "response": "Returns a new StreamingQueryManager that allows managing all the\n "} {"instruction": "Implement a Python 3 function for\nconverting a binary column of avro format into its corresponding catalyst value.", "response": "def from_avro(data, jsonFormatSchema, options={}):\n \"\"\"\n Converts a binary column of avro format into its corresponding catalyst value. The specified\n schema must match the read data, otherwise the behavior is undefined: it may fail or return\n arbitrary result.\n\n Note: Avro is built-in but external data source module since Spark 2.4. 
Please deploy the\n application as per the deployment section of \"Apache Avro Data Source Guide\".\n\n :param data: the binary column.\n :param jsonFormatSchema: the avro schema in JSON string format.\n :param options: options to control how the Avro record is parsed.\n\n >>> from pyspark.sql import Row\n >>> from pyspark.sql.avro.functions import from_avro, to_avro\n >>> data = [(1, Row(name='Alice', age=2))]\n >>> df = spark.createDataFrame(data, (\"key\", \"value\"))\n >>> avroDf = df.select(to_avro(df.value).alias(\"avro\"))\n >>> avroDf.collect()\n [Row(avro=bytearray(b'\\\\x00\\\\x00\\\\x04\\\\x00\\\\nAlice'))]\n >>> jsonFormatSchema = '''{\"type\":\"record\",\"name\":\"topLevelRecord\",\"fields\":\n ... [{\"name\":\"avro\",\"type\":[{\"type\":\"record\",\"name\":\"value\",\"namespace\":\"topLevelRecord\",\n ... \"fields\":[{\"name\":\"age\",\"type\":[\"long\",\"null\"]},\n ... {\"name\":\"name\",\"type\":[\"string\",\"null\"]}]},\"null\"]}]}'''\n >>> avroDf.select(from_avro(avroDf.avro, jsonFormatSchema).alias(\"value\")).collect()\n [Row(value=Row(avro=Row(age=2, name=u'Alice')))]\n \"\"\"\n\n sc = SparkContext._active_spark_context\n try:\n jc = sc._jvm.org.apache.spark.sql.avro.functions.from_avro(\n _to_java_column(data), jsonFormatSchema, options)\n except TypeError as e:\n if str(e) == \"'JavaPackage' object is not callable\":\n _print_missing_jar(\"Avro\", \"avro\", \"avro\", sc.version)\n raise\n return Column(jc)"} {"instruction": "Write a Python 3 script to\nget the absolute path of a file added through SparkContext. addFile.", "response": "def get(cls, filename):\n \"\"\"\n Get the absolute path of a file added through C{SparkContext.addFile()}.\n \"\"\"\n path = os.path.join(SparkFiles.getRootDirectory(), filename)\n return os.path.abspath(path)"} {"instruction": "How would you code a function in Python 3 to\nget the root directory that contains files added through SparkContext. 
addFile.", "response": "def getRootDirectory(cls):\n \"\"\"\n Get the root directory that contains files added through\n C{SparkContext.addFile()}.\n \"\"\"\n if cls._is_running_on_worker:\n return cls._root_directory\n else:\n # This will have to change if we support multiple SparkContexts:\n return cls._sc._jvm.org.apache.spark.SparkFiles.getRootDirectory()"} {"instruction": "How would you code a function in Python 3 to\nreturn the training summary of the current model.", "response": "def summary(self):\n \"\"\"\n Gets summary (e.g. accuracy/precision/recall, objective history, total iterations) of model\n trained on the training set. An exception is thrown if `trainingSummary is None`.\n \"\"\"\n if self.hasSummary:\n if self.numClasses <= 2:\n return BinaryLogisticRegressionTrainingSummary(super(LogisticRegressionModel,\n self).summary)\n else:\n return LogisticRegressionTrainingSummary(super(LogisticRegressionModel,\n self).summary)\n else:\n raise RuntimeError(\"No training summary available for this %s\" %\n self.__class__.__name__)"} {"instruction": "How would you code a function in Python 3 to\nevaluate the model on a test dataset.", "response": "def evaluate(self, dataset):\n \"\"\"\n Evaluates the model on a test dataset.\n\n :param dataset:\n Test dataset to evaluate model on, where dataset is an\n instance of :py:class:`pyspark.sql.DataFrame`\n \"\"\"\n if not isinstance(dataset, DataFrame):\n raise ValueError(\"dataset must be a DataFrame but got %s.\" % type(dataset))\n java_blr_summary = self._call_java(\"evaluate\", dataset)\n return BinaryLogisticRegressionSummary(java_blr_summary)"} {"instruction": "Create a Python 3 function for\ncreating a deep copy of this instance with a randomly generated uid and some extra parameters.", "response": "def copy(self, extra=None):\n \"\"\"\n Creates a copy of this instance with a randomly generated uid\n and some extra params. 
This creates a deep copy of the embedded paramMap,\n and copies the embedded and extra parameters over.\n\n :param extra: Extra parameters to copy to the new instance\n :return: Copy of this instance\n \"\"\"\n if extra is None:\n extra = dict()\n newModel = Params.copy(self, extra)\n newModel.models = [model.copy(extra) for model in self.models]\n return newModel"} {"instruction": "Write a Python 3 function that can\ngive a Java OneVsRestModel create and return a Python wrapper of it.", "response": "def _from_java(cls, java_stage):\n \"\"\"\n Given a Java OneVsRestModel, create and return a Python wrapper of it.\n Used for ML persistence.\n \"\"\"\n featuresCol = java_stage.getFeaturesCol()\n labelCol = java_stage.getLabelCol()\n predictionCol = java_stage.getPredictionCol()\n classifier = JavaParams._from_java(java_stage.getClassifier())\n models = [JavaParams._from_java(model) for model in java_stage.models()]\n py_stage = cls(models=models).setPredictionCol(predictionCol).setLabelCol(labelCol)\\\n .setFeaturesCol(featuresCol).setClassifier(classifier)\n py_stage._resetUid(java_stage.uid())\n return py_stage"} {"instruction": "How would you explain what the following Python 3 function does\ndef _to_java(self):\n sc = SparkContext._active_spark_context\n java_models = [model._to_java() for model in self.models]\n java_models_array = JavaWrapper._new_java_array(\n java_models, sc._gateway.jvm.org.apache.spark.ml.classification.ClassificationModel)\n metadata = JavaParams._new_java_obj(\"org.apache.spark.sql.types.Metadata\")\n _java_obj = JavaParams._new_java_obj(\"org.apache.spark.ml.classification.OneVsRestModel\",\n self.uid, metadata.empty(), java_models_array)\n _java_obj.set(\"classifier\", self.getClassifier()._to_java())\n _java_obj.set(\"featuresCol\", self.getFeaturesCol())\n _java_obj.set(\"labelCol\", self.getLabelCol())\n _java_obj.set(\"predictionCol\", self.getPredictionCol())\n return _java_obj", "response": "Transfer this instance to a Java 
OneVsRestModel. Used for ML persistence."} {"instruction": "Make a summary of the following Python 3 code\ndef _exception_message(excp):\n if isinstance(excp, Py4JJavaError):\n # 'Py4JJavaError' doesn't contain the stack trace available on the Java side in 'message'\n # attribute in Python 2. We should call 'str' function on this exception in general but\n # 'Py4JJavaError' has an issue about addressing non-ascii strings. So, here we work\n # around by the direct call, '__str__()'. Please see SPARK-23517.\n return excp.__str__()\n if hasattr(excp, \"message\"):\n return excp.message\n return str(excp)", "response": "Return the message from an exception as either a str or unicode object."} {"instruction": "Write a Python 3 script to\nget argspec of a function. Supports both Python 2 and Python 3.", "response": "def _get_argspec(f):\n \"\"\"\n Get argspec of a function. Supports both Python 2 and Python 3.\n \"\"\"\n if sys.version_info[0] < 3:\n argspec = inspect.getargspec(f)\n else:\n # `getargspec` is deprecated since python3.0 (incompatible with function annotations).\n # See SPARK-23569.\n argspec = inspect.getfullargspec(f)\n return argspec"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef fail_on_stopiteration(f):\n def wrapper(*args, **kwargs):\n try:\n return f(*args, **kwargs)\n except StopIteration as exc:\n raise RuntimeError(\n \"Caught StopIteration thrown from user's code; failing the task\",\n exc\n )\n\n return wrapper", "response": "A decorator that can be used to fail on a StopIteration raised from Spark code."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef majorMinorVersion(sparkVersion):\n m = re.search(r'^(\\d+)\\.(\\d+)(\\..*)?$', sparkVersion)\n if m is not None:\n return (int(m.group(1)), int(m.group(2)))\n else:\n raise ValueError(\"Spark tried to parse '%s' as a Spark\" % sparkVersion +\n \" version string, but it could not find the major and minor\" +\n \" version 
numbers.\")", "response": "Given a Spark version string return the major and minor version numbers."} {"instruction": "Make a summary of the following Python 3 code\ndef _ensure_initialized(cls, instance=None, gateway=None, conf=None):\n with SparkContext._lock:\n if not SparkContext._gateway:\n SparkContext._gateway = gateway or launch_gateway(conf)\n SparkContext._jvm = SparkContext._gateway.jvm\n\n if instance:\n if (SparkContext._active_spark_context and\n SparkContext._active_spark_context != instance):\n currentMaster = SparkContext._active_spark_context.master\n currentAppName = SparkContext._active_spark_context.appName\n callsite = SparkContext._active_spark_context._callsite\n\n # Raise error if there is already a running Spark context\n raise ValueError(\n \"Cannot run multiple SparkContexts at once; \"\n \"existing SparkContext(app=%s, master=%s)\"\n \" created by %s at %s:%s \"\n % (currentAppName, currentMaster,\n callsite.function, callsite.file, callsite.linenum))\n else:\n SparkContext._active_spark_context = instance", "response": "Checks whether a SparkContext is initialized or not."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef getOrCreate(cls, conf=None):\n with SparkContext._lock:\n if SparkContext._active_spark_context is None:\n SparkContext(conf=conf or SparkConf())\n return SparkContext._active_spark_context", "response": "Get or instantiate a SparkContext and register it as a singleton object."} {"instruction": "Write a Python 3 function that can\nset a Java system property such as spark. executor. memory.", "response": "def setSystemProperty(cls, key, value):\n \"\"\"\n Set a Java system property, such as spark.executor.memory. 
This must\n must be invoked before instantiating SparkContext.\n \"\"\"\n SparkContext._ensure_initialized()\n SparkContext._jvm.java.lang.System.setProperty(key, value)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef stop(self):\n if getattr(self, \"_jsc\", None):\n try:\n self._jsc.stop()\n except Py4JError:\n # Case: SPARK-18523\n warnings.warn(\n 'Unable to cleanly shutdown Spark JVM process.'\n ' It is possible that the process has crashed,'\n ' been killed or may also be in a zombie state.',\n RuntimeWarning\n )\n finally:\n self._jsc = None\n if getattr(self, \"_accumulatorServer\", None):\n self._accumulatorServer.shutdown()\n self._accumulatorServer = None\n with SparkContext._lock:\n SparkContext._active_spark_context = None", "response": "Shut down the SparkContext."} {"instruction": "Write a Python 3 function for\ncreating a new RDD of integers from start to end.", "response": "def range(self, start, end=None, step=1, numSlices=None):\n \"\"\"\n Create a new RDD of int containing elements from `start` to `end`\n (exclusive), increased by `step` every element. Can be called the same\n way as python's built-in range() function. 
If called with a single argument,\n the argument is interpreted as `end`, and `start` is set to 0.\n\n :param start: the start value\n :param end: the end value (exclusive)\n :param step: the incremental step (default: 1)\n :param numSlices: the number of partitions of the new RDD\n :return: An RDD of int\n\n >>> sc.range(5).collect()\n [0, 1, 2, 3, 4]\n >>> sc.range(2, 4).collect()\n [2, 3]\n >>> sc.range(1, 7, 2).collect()\n [1, 3, 5]\n \"\"\"\n if end is None:\n end = start\n start = 0\n\n return self.parallelize(xrange(start, end, step), numSlices)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef parallelize(self, c, numSlices=None):\n numSlices = int(numSlices) if numSlices is not None else self.defaultParallelism\n if isinstance(c, xrange):\n size = len(c)\n if size == 0:\n return self.parallelize([], numSlices)\n step = c[1] - c[0] if size > 1 else 1\n start0 = c[0]\n\n def getStart(split):\n return start0 + int((split * size / numSlices)) * step\n\n def f(split, iterator):\n # it's an empty iterator here but we need this line for triggering the\n # logic of signal handling in FramedSerializer.load_stream, for instance,\n # SpecialLengths.END_OF_DATA_SECTION in _read_with_length. Since\n # FramedSerializer.load_stream produces a generator, the control should\n # at least be in that function once. 
Here we do it by explicitly converting\n # the empty iterator to a list, thus make sure worker reuse takes effect.\n # See more details in SPARK-26549.\n assert len(list(iterator)) == 0\n return xrange(getStart(split), getStart(split + 1), step)\n\n return self.parallelize([], numSlices).mapPartitionsWithIndex(f)\n\n # Make sure we distribute data evenly if it's smaller than self.batchSize\n if \"__len__\" not in dir(c):\n c = list(c) # Make it a list so we can compute its length\n batchSize = max(1, min(len(c) // numSlices, self._batchSize or 1024))\n serializer = BatchedSerializer(self._unbatched_serializer, batchSize)\n\n def reader_func(temp_filename):\n return self._jvm.PythonRDD.readRDDFromFile(self._jsc, temp_filename, numSlices)\n\n def createRDDServer():\n return self._jvm.PythonParallelizeServer(self._jsc.sc(), numSlices)\n\n jrdd = self._serialize_to_jvm(c, serializer, reader_func, createRDDServer)\n return RDD(jrdd, self, serializer)", "response": "Distribute a local Python collection to form an RDD."} {"instruction": "Create a Python 3 function for\nserializing data to the jvm and return a JavaRDD.", "response": "def _serialize_to_jvm(self, data, serializer, reader_func, createRDDServer):\n \"\"\"\n Using py4j to send a large dataset to the jvm is really slow, so we use either a file\n or a socket if we have encryption enabled.\n :param data:\n :param serializer:\n :param reader_func: A function which takes a filename and reads in the data in the jvm and\n returns a JavaRDD. 
Only used when encryption is disabled.\n :param createRDDServer: A function which creates a PythonRDDServer in the jvm to\n accept the serialized data, for use when encryption is enabled.\n :return:\n \"\"\"\n if self._encryption_enabled:\n # with encryption, we open a server in java and send the data directly\n server = createRDDServer()\n (sock_file, _) = local_connect_and_auth(server.port(), server.secret())\n chunked_out = ChunkedStream(sock_file, 8192)\n serializer.dump_stream(data, chunked_out)\n chunked_out.close()\n # this call will block until the server has read all the data and processed it (or\n # throws an exception)\n r = server.getResult()\n return r\n else:\n # without encryption, we serialize to a file, and we read the file in java and\n # parallelize from there.\n tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)\n try:\n try:\n serializer.dump_stream(data, tempFile)\n finally:\n tempFile.close()\n return reader_func(tempFile.name)\n finally:\n # we eagerily reads the file so we can delete right after.\n os.unlink(tempFile.name)"} {"instruction": "Write a Python 3 function that can\nload an RDD previously saved using pickleFile method.", "response": "def pickleFile(self, name, minPartitions=None):\n \"\"\"\n Load an RDD previously saved using L{RDD.saveAsPickleFile} method.\n\n >>> tmpFile = NamedTemporaryFile(delete=True)\n >>> tmpFile.close()\n >>> sc.parallelize(range(10)).saveAsPickleFile(tmpFile.name, 5)\n >>> sorted(sc.pickleFile(tmpFile.name, 3).collect())\n [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]\n \"\"\"\n minPartitions = minPartitions or self.defaultMinPartitions\n return RDD(self._jsc.objectFile(name, minPartitions), self)"} {"instruction": "Implement a Python 3 function for\nreading a text file from HDFS or local file system or any Hadoop - supported file system URI and return it as an RDD of Strings.", "response": "def textFile(self, name, minPartitions=None, use_unicode=True):\n \"\"\"\n Read a text file from HDFS, a local file 
system (available on all\n nodes), or any Hadoop-supported file system URI, and return it as an\n RDD of Strings.\n The text files must be encoded as UTF-8.\n\n If use_unicode is False, the strings will be kept as `str` (encoding\n as `utf-8`), which is faster and smaller than unicode. (Added in\n Spark 1.2)\n\n >>> path = os.path.join(tempdir, \"sample-text.txt\")\n >>> with open(path, \"w\") as testFile:\n ... _ = testFile.write(\"Hello world!\")\n >>> textFile = sc.textFile(path)\n >>> textFile.collect()\n [u'Hello world!']\n \"\"\"\n minPartitions = minPartitions or min(self.defaultParallelism, 2)\n return RDD(self._jsc.textFile(name, minPartitions), self,\n UTF8Deserializer(use_unicode))"} {"instruction": "Can you implement a function in Python 3 that\nreads a directory of text files from HDFS and return a key - value pair containing the text files and the corresponding entry in the file system.", "response": "def wholeTextFiles(self, path, minPartitions=None, use_unicode=True):\n \"\"\"\n Read a directory of text files from HDFS, a local file system\n (available on all nodes), or any Hadoop-supported file system\n URI. Each file is read as a single record and returned in a\n key-value pair, where the key is the path of each file, the\n value is the content of each file.\n The text files must be encoded as UTF-8.\n\n If use_unicode is False, the strings will be kept as `str` (encoding\n as `utf-8`), which is faster and smaller than unicode. (Added in\n Spark 1.2)\n\n For example, if you have the following files::\n\n hdfs://a-hdfs-path/part-00000\n hdfs://a-hdfs-path/part-00001\n ...\n hdfs://a-hdfs-path/part-nnnnn\n\n Do C{rdd = sparkContext.wholeTextFiles(\"hdfs://a-hdfs-path\")},\n then C{rdd} contains::\n\n (a-hdfs-path/part-00000, its content)\n (a-hdfs-path/part-00001, its content)\n ...\n (a-hdfs-path/part-nnnnn, its content)\n\n .. 
note:: Small files are preferred, as each file will be loaded\n fully in memory.\n\n >>> dirPath = os.path.join(tempdir, \"files\")\n >>> os.mkdir(dirPath)\n >>> with open(os.path.join(dirPath, \"1.txt\"), \"w\") as file1:\n ... _ = file1.write(\"1\")\n >>> with open(os.path.join(dirPath, \"2.txt\"), \"w\") as file2:\n ... _ = file2.write(\"2\")\n >>> textFiles = sc.wholeTextFiles(dirPath)\n >>> sorted(textFiles.collect())\n [(u'.../1.txt', u'1'), (u'.../2.txt', u'2')]\n \"\"\"\n minPartitions = minPartitions or self.defaultMinPartitions\n return RDD(self._jsc.wholeTextFiles(path, minPartitions), self,\n PairDeserializer(UTF8Deserializer(use_unicode), UTF8Deserializer(use_unicode)))"} {"instruction": "Make a summary of the following Python 3 code\ndef binaryFiles(self, path, minPartitions=None):\n minPartitions = minPartitions or self.defaultMinPartitions\n return RDD(self._jsc.binaryFiles(path, minPartitions), self,\n PairDeserializer(UTF8Deserializer(), NoOpSerializer()))", "response": "Reads a directory of binary files from HDFS and returns a new RDD of the files."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef binaryRecords(self, path, recordLength):\n return RDD(self._jsc.binaryRecords(path, recordLength), self, NoOpSerializer())", "response": "Load data from a flat binary file."} {"instruction": "Can you implement a function in Python 3 that\nreads a Hadoop SequenceFile from HDFS and returns a Java RDD of pickled objects.", "response": "def sequenceFile(self, path, keyClass=None, valueClass=None, keyConverter=None,\n valueConverter=None, minSplits=None, batchSize=0):\n \"\"\"\n Read a Hadoop SequenceFile with arbitrary key and value Writable class from HDFS,\n a local file system (available on all nodes), or any Hadoop-supported file system URI.\n The mechanism is as follows:\n\n 1. A Java RDD is created from the SequenceFile or other InputFormat, and the key\n and value Writable classes\n 2. 
Serialization is attempted via Pyrolite pickling\n 3. If this fails, the fallback is to call 'toString' on each key and value\n 4. C{PickleSerializer} is used to deserialize pickled objects on the Python side\n\n :param path: path to sequncefile\n :param keyClass: fully qualified classname of key Writable class\n (e.g. \"org.apache.hadoop.io.Text\")\n :param valueClass: fully qualified classname of value Writable class\n (e.g. \"org.apache.hadoop.io.LongWritable\")\n :param keyConverter:\n :param valueConverter:\n :param minSplits: minimum splits in dataset\n (default min(2, sc.defaultParallelism))\n :param batchSize: The number of Python objects represented as a single\n Java object. (default 0, choose batchSize automatically)\n \"\"\"\n minSplits = minSplits or min(self.defaultParallelism, 2)\n jrdd = self._jvm.PythonRDD.sequenceFile(self._jsc, path, keyClass, valueClass,\n keyConverter, valueConverter, minSplits, batchSize)\n return RDD(jrdd, self)"} {"instruction": "Write a Python 3 function for\nreading a new API Hadoop file from HDFS and return an RDD of the result.", "response": "def newAPIHadoopFile(self, path, inputFormatClass, keyClass, valueClass, keyConverter=None,\n valueConverter=None, conf=None, batchSize=0):\n \"\"\"\n Read a 'new API' Hadoop InputFormat with arbitrary key and value class from HDFS,\n a local file system (available on all nodes), or any Hadoop-supported file system URI.\n The mechanism is the same as for sc.sequenceFile.\n\n A Hadoop configuration can be passed in as a Python dict. This will be converted into a\n Configuration in Java\n\n :param path: path to Hadoop file\n :param inputFormatClass: fully qualified classname of Hadoop InputFormat\n (e.g. \"org.apache.hadoop.mapreduce.lib.input.TextInputFormat\")\n :param keyClass: fully qualified classname of key Writable class\n (e.g. \"org.apache.hadoop.io.Text\")\n :param valueClass: fully qualified classname of value Writable class\n (e.g. 
\"org.apache.hadoop.io.LongWritable\")\n :param keyConverter: (None by default)\n :param valueConverter: (None by default)\n :param conf: Hadoop configuration, passed in as a dict\n (None by default)\n :param batchSize: The number of Python objects represented as a single\n Java object. (default 0, choose batchSize automatically)\n \"\"\"\n jconf = self._dictToJavaMap(conf)\n jrdd = self._jvm.PythonRDD.newAPIHadoopFile(self._jsc, path, inputFormatClass, keyClass,\n valueClass, keyConverter, valueConverter,\n jconf, batchSize)\n return RDD(jrdd, self)"} {"instruction": "Write a Python 3 script to\nbuild a union of a list of RDDs.", "response": "def union(self, rdds):\n \"\"\"\n Build the union of a list of RDDs.\n\n This supports unions() of RDDs with different serialized formats,\n although this forces them to be reserialized using the default\n serializer:\n\n >>> path = os.path.join(tempdir, \"union-text.txt\")\n >>> with open(path, \"w\") as testFile:\n ... _ = testFile.write(\"Hello\")\n >>> textFile = sc.textFile(path)\n >>> textFile.collect()\n [u'Hello']\n >>> parallelized = sc.parallelize([\"World!\"])\n >>> sorted(sc.union([textFile, parallelized]).collect())\n [u'Hello', 'World!']\n \"\"\"\n first_jrdd_deserializer = rdds[0]._jrdd_deserializer\n if any(x._jrdd_deserializer != first_jrdd_deserializer for x in rdds):\n rdds = [x._reserialize() for x in rdds]\n cls = SparkContext._jvm.org.apache.spark.api.java.JavaRDD\n jrdds = SparkContext._gateway.new_array(cls, len(rdds))\n for i in range(0, len(rdds)):\n jrdds[i] = rdds[i]._jrdd\n return RDD(self._jsc.union(jrdds), self, rdds[0]._jrdd_deserializer)"} {"instruction": "How would you implement a function in Python 3 that\ncreates an accumulator object with the given initial value using a given AccumulatorParam object.", "response": "def accumulator(self, value, accum_param=None):\n \"\"\"\n Create an L{Accumulator} with the given initial value, using a given\n L{AccumulatorParam} helper object to define how 
to add values of the\n data type if provided. Default AccumulatorParams are used for integers\n and floating-point numbers if you do not provide one. For other types,\n a custom AccumulatorParam can be used.\n \"\"\"\n if accum_param is None:\n if isinstance(value, int):\n accum_param = accumulators.INT_ACCUMULATOR_PARAM\n elif isinstance(value, float):\n accum_param = accumulators.FLOAT_ACCUMULATOR_PARAM\n elif isinstance(value, complex):\n accum_param = accumulators.COMPLEX_ACCUMULATOR_PARAM\n else:\n raise TypeError(\"No default accumulator param for type %s\" % type(value))\n SparkContext._next_accum_id += 1\n return Accumulator(SparkContext._next_accum_id - 1, value, accum_param)"} {"instruction": "Write a Python 3 script for\nadding a file to be downloaded with this Spark job on every node.", "response": "def addFile(self, path, recursive=False):\n \"\"\"\n Add a file to be downloaded with this Spark job on every node.\n The C{path} passed can be either a local file, a file in HDFS\n (or other Hadoop-supported filesystems), or an HTTP, HTTPS or\n FTP URI.\n\n To access the file in Spark jobs, use\n L{SparkFiles.get(fileName)} with the\n filename to find its download location.\n\n A directory can be given if the recursive option is set to True.\n Currently directories are only supported for Hadoop-supported filesystems.\n\n .. note:: A path can be added only once. Subsequent additions of the same path are ignored.\n\n >>> from pyspark import SparkFiles\n >>> path = os.path.join(tempdir, \"test.txt\")\n >>> with open(path, \"w\") as testFile:\n ... _ = testFile.write(\"100\")\n >>> sc.addFile(path)\n >>> def func(iterator):\n ... with open(SparkFiles.get(\"test.txt\")) as testFile:\n ... fileVal = int(testFile.readline())\n ... 
return [x * fileVal for x in iterator]\n >>> sc.parallelize([1, 2, 3, 4]).mapPartitions(func).collect()\n [100, 200, 300, 400]\n \"\"\"\n self._jsc.sc().addFile(path, recursive)"} {"instruction": "How would you implement a function in Python 3 that\nadds a. py or. zip file to the SparkContext.", "response": "def addPyFile(self, path):\n \"\"\"\n Add a .py or .zip dependency for all tasks to be executed on this\n SparkContext in the future. The C{path} passed can be either a local\n file, a file in HDFS (or other Hadoop-supported filesystems), or an\n HTTP, HTTPS or FTP URI.\n\n .. note:: A path can be added only once. Subsequent additions of the same path are ignored.\n \"\"\"\n self.addFile(path)\n (dirname, filename) = os.path.split(path) # dirname may be directory or HDFS/S3 prefix\n if filename[-4:].lower() in self.PACKAGE_EXTENSIONS:\n self._python_includes.append(filename)\n # for tests in local mode\n sys.path.insert(1, os.path.join(SparkFiles.getRootDirectory(), filename))\n if sys.version > '3':\n import importlib\n importlib.invalidate_caches()"} {"instruction": "Can you implement a function in Python 3 that\nreturns a Java StorageLevel based on a pyspark. 
StorageLevel.", "response": "def _getJavaStorageLevel(self, storageLevel):\n \"\"\"\n Returns a Java StorageLevel based on a pyspark.StorageLevel.\n \"\"\"\n if not isinstance(storageLevel, StorageLevel):\n raise Exception(\"storageLevel must be of type pyspark.StorageLevel\")\n\n newStorageLevel = self._jvm.org.apache.spark.storage.StorageLevel\n return newStorageLevel(storageLevel.useDisk,\n storageLevel.useMemory,\n storageLevel.useOffHeap,\n storageLevel.deserialized,\n storageLevel.replication)"} {"instruction": "How would you explain what the following Python 3 function does\ndef setJobGroup(self, groupId, description, interruptOnCancel=False):\n self._jsc.setJobGroup(groupId, description, interruptOnCancel)", "response": "Sets the group ID for all the jobs in this thread and optionally a description."} {"instruction": "Write a Python 3 script for\nrunning the given partitionFunc on the specified set of partitions and returns the result as an array of elements.", "response": "def runJob(self, rdd, partitionFunc, partitions=None, allowLocal=False):\n \"\"\"\n Executes the given partitionFunc on the specified set of partitions,\n returning the result as an array of elements.\n\n If 'partitions' is not specified, this will run over all partitions.\n\n >>> myRDD = sc.parallelize(range(6), 3)\n >>> sc.runJob(myRDD, lambda part: [x * x for x in part])\n [0, 1, 4, 9, 16, 25]\n\n >>> myRDD = sc.parallelize(range(6), 3)\n >>> sc.runJob(myRDD, lambda part: [x * x for x in part], [0, 2], True)\n [0, 1, 16, 25]\n \"\"\"\n if partitions is None:\n partitions = range(rdd._jrdd.partitions().size())\n\n # Implementation note: This is implemented as a mapPartitions followed\n # by runJob() in order to avoid having to pass a Python lambda into\n # SparkContext#runJob.\n mappedRDD = rdd.mapPartitions(partitionFunc)\n sock_info = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)\n return list(_load_from_socket(sock_info, mappedRDD._jrdd_deserializer))"} 
{"instruction": "Given the following Python 3 function, write the documentation\ndef dump_profiles(self, path):\n if self.profiler_collector is not None:\n self.profiler_collector.dump_profiles(path)\n else:\n raise RuntimeError(\"'spark.python.profile' configuration must be set \"\n \"to 'true' to enable Python profile.\")", "response": "Dump the profile stats into directory path"} {"instruction": "How would you explain what the following Python 3 function does\ndef train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, nonnegative=False,\n seed=None):\n \"\"\"\n Train a matrix factorization model given an RDD of ratings by users\n for a subset of products. The ratings matrix is approximated as the\n product of two lower-rank matrices of a given rank (number of\n features). To solve for these features, ALS is run iteratively with\n a configurable level of parallelism.\n\n :param ratings:\n RDD of `Rating` or (userID, productID, rating) tuple.\n :param rank:\n Number of features to use (also referred to as the number of latent factors).\n :param iterations:\n Number of iterations of ALS.\n (default: 5)\n :param lambda_:\n Regularization parameter.\n (default: 0.01)\n :param blocks:\n Number of blocks used to parallelize the computation. A value\n of -1 will use an auto-configured number of blocks.\n (default: -1)\n :param nonnegative:\n A value of True will solve least-squares with nonnegativity\n constraints.\n (default: False)\n :param seed:\n Random seed for initial matrix factorization model. 
A value\n of None will use system time as the seed.\n (default: None)\n \"\"\"\n model = callMLlibFunc(\"trainALSModel\", cls._prepare(ratings), rank, iterations,\n lambda_, blocks, nonnegative, seed)\n return MatrixFactorizationModel(model)", "response": "Train a matrix factorization model given an RDD of ratings."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef train(cls, data, minSupport=0.3, numPartitions=-1):\n model = callMLlibFunc(\"trainFPGrowthModel\", data, float(minSupport), int(numPartitions))\n return FPGrowthModel(model)", "response": "Train an FP - Growth model that contains frequent itemsets."} {"instruction": "How would you explain what the following Python 3 function does\ndef train(cls, data, minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000):\n model = callMLlibFunc(\"trainPrefixSpanModel\",\n data, minSupport, maxPatternLength, maxLocalProjDBSize)\n return PrefixSpanModel(model)", "response": "Train a new prefix span model on the input data set."} {"instruction": "Implement a Python 3 function for\nsetting the sample points from the population. Should be a RDD", "response": "def setSample(self, sample):\n \"\"\"Set sample points from the population. 
Should be a RDD\"\"\"\n if not isinstance(sample, RDD):\n raise TypeError(\"samples should be a RDD, received %s\" % type(sample))\n self._sample = sample"} {"instruction": "How would you explain what the following Python 3 function does\ndef estimate(self, points):\n points = list(points)\n densities = callMLlibFunc(\n \"estimateKernelDensity\", self._sample, self._bandwidth, points)\n return np.asarray(densities)", "response": "Estimate the probability density at a set of points"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _start_update_server(auth_token):\n server = AccumulatorServer((\"localhost\", 0), _UpdateRequestHandler, auth_token)\n thread = threading.Thread(target=server.serve_forever)\n thread.daemon = True\n thread.start()\n return server", "response": "Start a TCP server to receive accumulator updates in a daemon thread and returns it"} {"instruction": "How would you code a function in Python 3 to\nadd a term to this accumulator s value", "response": "def add(self, term):\n \"\"\"Adds a term to this accumulator's value\"\"\"\n self._value = self.accum_param.addInPlace(self._value, term)"} {"instruction": "How would you code a function in Python 3 to\ncompute aggregates and returns the result as a DataFrame.", "response": "def agg(self, *exprs):\n \"\"\"Compute aggregates and returns the result as a :class:`DataFrame`.\n\n The available aggregate functions can be:\n\n 1. built-in aggregation functions, such as `avg`, `max`, `min`, `sum`, `count`\n\n 2. group aggregate pandas UDFs, created with :func:`pyspark.sql.functions.pandas_udf`\n\n .. note:: There is no partial aggregation with group aggregate UDFs, i.e.,\n a full shuffle is required. Also, all the data of a group will be loaded into\n memory, so the user should be aware of the potential OOM risk if data is skewed\n and certain groups are too large to fit in memory.\n\n .. 
seealso:: :func:`pyspark.sql.functions.pandas_udf`\n\n If ``exprs`` is a single :class:`dict` mapping from string to string, then the key\n is the column to perform aggregation on, and the value is the aggregate function.\n\n Alternatively, ``exprs`` can also be a list of aggregate :class:`Column` expressions.\n\n .. note:: Built-in aggregation functions and group aggregate pandas UDFs cannot be mixed\n in a single call to this function.\n\n :param exprs: a dict mapping from column name (string) to aggregate functions (string),\n or a list of :class:`Column`.\n\n >>> gdf = df.groupBy(df.name)\n >>> sorted(gdf.agg({\"*\": \"count\"}).collect())\n [Row(name=u'Alice', count(1)=1), Row(name=u'Bob', count(1)=1)]\n\n >>> from pyspark.sql import functions as F\n >>> sorted(gdf.agg(F.min(df.age)).collect())\n [Row(name=u'Alice', min(age)=2), Row(name=u'Bob', min(age)=5)]\n\n >>> from pyspark.sql.functions import pandas_udf, PandasUDFType\n >>> @pandas_udf('int', PandasUDFType.GROUPED_AGG) # doctest: +SKIP\n ... def min_udf(v):\n ... 
return v.min()\n >>> sorted(gdf.agg(min_udf(df.age)).collect()) # doctest: +SKIP\n [Row(name=u'Alice', min_udf(age)=2), Row(name=u'Bob', min_udf(age)=5)]\n \"\"\"\n assert exprs, \"exprs should not be empty\"\n if len(exprs) == 1 and isinstance(exprs[0], dict):\n jdf = self._jgd.agg(exprs[0])\n else:\n # Columns\n assert all(isinstance(c, Column) for c in exprs), \"all exprs should be Column\"\n jdf = self._jgd.agg(exprs[0]._jc,\n _to_seq(self.sql_ctx._sc, [c._jc for c in exprs[1:]]))\n return DataFrame(jdf, self.sql_ctx)"} {"instruction": "How would you implement a function in Python 3 that\npivots a column of the current DataFrame and perform the specified aggregation.", "response": "def pivot(self, pivot_col, values=None):\n \"\"\"\n Pivots a column of the current :class:`DataFrame` and perform the specified aggregation.\n There are two versions of pivot function: one that requires the caller to specify the list\n of distinct values to pivot on, and one that does not. The latter is more concise but less\n efficient, because Spark needs to first compute the list of distinct values internally.\n\n :param pivot_col: Name of the column to pivot.\n :param values: List of values that will be translated to columns in the output DataFrame.\n\n # Compute the sum of earnings for each year by course with each course as a separate column\n\n >>> df4.groupBy(\"year\").pivot(\"course\", [\"dotNET\", \"Java\"]).sum(\"earnings\").collect()\n [Row(year=2012, dotNET=15000, Java=20000), Row(year=2013, dotNET=48000, Java=30000)]\n\n # Or without specifying column values (less efficient)\n\n >>> df4.groupBy(\"year\").pivot(\"course\").sum(\"earnings\").collect()\n [Row(year=2012, Java=20000, dotNET=15000), Row(year=2013, Java=30000, dotNET=48000)]\n >>> df5.groupBy(\"sales.year\").pivot(\"sales.course\").sum(\"sales.earnings\").collect()\n [Row(year=2012, Java=20000, dotNET=15000), Row(year=2013, Java=30000, dotNET=48000)]\n \"\"\"\n if values is None:\n jgd = 
self._jgd.pivot(pivot_col)\n else:\n jgd = self._jgd.pivot(pivot_col, values)\n return GroupedData(jgd, self._df)"} {"instruction": "Create a Python 3 function for\nreturning a pandas. DataFrame that maps each group of the current DataFrame to a user - defined function and returns the result.", "response": "def apply(self, udf):\n \"\"\"\n Maps each group of the current :class:`DataFrame` using a pandas udf and returns the result\n as a `DataFrame`.\n\n The user-defined function should take a `pandas.DataFrame` and return another\n `pandas.DataFrame`. For each group, all columns are passed together as a `pandas.DataFrame`\n to the user-function and the returned `pandas.DataFrame` are combined as a\n :class:`DataFrame`.\n\n The returned `pandas.DataFrame` can be of arbitrary length and its schema must match the\n returnType of the pandas udf.\n\n .. note:: This function requires a full shuffle. all the data of a group will be loaded\n into memory, so the user should be aware of the potential OOM risk if data is skewed\n and certain groups are too large to fit in memory.\n\n .. note:: Experimental\n\n :param udf: a grouped map user-defined function returned by\n :func:`pyspark.sql.functions.pandas_udf`.\n\n >>> from pyspark.sql.functions import pandas_udf, PandasUDFType\n >>> df = spark.createDataFrame(\n ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],\n ... (\"id\", \"v\"))\n >>> @pandas_udf(\"id long, v double\", PandasUDFType.GROUPED_MAP) # doctest: +SKIP\n ... def normalize(pdf):\n ... v = pdf.v\n ... return pdf.assign(v=(v - v.mean()) / v.std())\n >>> df.groupby(\"id\").apply(normalize).show() # doctest: +SKIP\n +---+-------------------+\n | id| v|\n +---+-------------------+\n | 1|-0.7071067811865475|\n | 1| 0.7071067811865475|\n | 2|-0.8320502943378437|\n | 2|-0.2773500981126146|\n | 2| 1.1094003924504583|\n +---+-------------------+\n\n .. 
seealso:: :meth:`pyspark.sql.functions.pandas_udf`\n\n \"\"\"\n # Columns are special because hasattr always return True\n if isinstance(udf, Column) or not hasattr(udf, 'func') \\\n or udf.evalType != PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:\n raise ValueError(\"Invalid udf: the udf argument must be a pandas_udf of type \"\n \"GROUPED_MAP.\")\n df = self._df\n udf_column = udf(*[df[col] for col in df.columns])\n jdf = self._jgd.flatMapGroupsInPandas(udf_column._jc.expr())\n return DataFrame(jdf, self.sql_ctx)"} {"instruction": "Implement a function in Python 3 to\ncreate a new WindowSpec with the partitioning defined.", "response": "def partitionBy(*cols):\n \"\"\"\n Creates a :class:`WindowSpec` with the partitioning defined.\n \"\"\"\n sc = SparkContext._active_spark_context\n jspec = sc._jvm.org.apache.spark.sql.expressions.Window.partitionBy(_to_java_cols(cols))\n return WindowSpec(jspec)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef rowsBetween(start, end):\n if start <= Window._PRECEDING_THRESHOLD:\n start = Window.unboundedPreceding\n if end >= Window._FOLLOWING_THRESHOLD:\n end = Window.unboundedFollowing\n sc = SparkContext._active_spark_context\n jspec = sc._jvm.org.apache.spark.sql.expressions.Window.rowsBetween(start, end)\n return WindowSpec(jspec)", "response": "Create a DataFrame with the rows between start and end."} {"instruction": "Can you write a function in Python 3 where it\nreturns a new WindowSpec object containing the rows between start and end.", "response": "def rowsBetween(self, start, end):\n \"\"\"\n Defines the frame boundaries, from `start` (inclusive) to `end` (inclusive).\n\n Both `start` and `end` are relative positions from the current row.\n For example, \"0\" means \"current row\", while \"-1\" means the row before\n the current row, and \"5\" means the fifth row after the current row.\n\n We recommend users use ``Window.unboundedPreceding``, ``Window.unboundedFollowing``,\n and 
``Window.currentRow`` to specify special boundary values, rather than using integral\n values directly.\n\n :param start: boundary start, inclusive.\n The frame is unbounded if this is ``Window.unboundedPreceding``, or\n any value less than or equal to max(-sys.maxsize, -9223372036854775808).\n :param end: boundary end, inclusive.\n The frame is unbounded if this is ``Window.unboundedFollowing``, or\n any value greater than or equal to min(sys.maxsize, 9223372036854775807).\n \"\"\"\n if start <= Window._PRECEDING_THRESHOLD:\n start = Window.unboundedPreceding\n if end >= Window._FOLLOWING_THRESHOLD:\n end = Window.unboundedFollowing\n return WindowSpec(self._jspec.rowsBetween(start, end))"} {"instruction": "How would you implement a function in Python 3 that\ngenerates an RDD comprised of random samples from the uniform distribution.", "response": "def uniformRDD(sc, size, numPartitions=None, seed=None):\n \"\"\"\n Generates an RDD comprised of i.i.d. samples from the\n uniform distribution U(0.0, 1.0).\n\n To transform the distribution in the generated RDD from U(0.0, 1.0)\n to U(a, b), use\n C{RandomRDDs.uniformRDD(sc, n, p, seed)\\\n .map(lambda v: a + (b - a) * v)}\n\n :param sc: SparkContext used to create the RDD.\n :param size: Size of the RDD.\n :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`).\n :param seed: Random seed (default: a random long integer).\n :return: RDD of float comprised of i.i.d. samples ~ `U(0.0, 1.0)`.\n\n >>> x = RandomRDDs.uniformRDD(sc, 100).collect()\n >>> len(x)\n 100\n >>> max(x) <= 1.0 and min(x) >= 0.0\n True\n >>> RandomRDDs.uniformRDD(sc, 100, 4).getNumPartitions()\n 4\n >>> parts = RandomRDDs.uniformRDD(sc, 100, seed=4).getNumPartitions()\n >>> parts == sc.defaultParallelism\n True\n \"\"\"\n return callMLlibFunc(\"uniformRDD\", sc._jsc, size, numPartitions, seed)"} {"instruction": "How would you code a function in Python 3 to\ngenerate an RDD comprised of i. d. 
samples from the standard normal distribution.", "response": "def normalRDD(sc, size, numPartitions=None, seed=None):\n \"\"\"\n Generates an RDD comprised of i.i.d. samples from the standard normal\n distribution.\n\n To transform the distribution in the generated RDD from standard normal\n to some other normal N(mean, sigma^2), use\n C{RandomRDDs.normal(sc, n, p, seed)\\\n .map(lambda v: mean + sigma * v)}\n\n :param sc: SparkContext used to create the RDD.\n :param size: Size of the RDD.\n :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`).\n :param seed: Random seed (default: a random long integer).\n :return: RDD of float comprised of i.i.d. samples ~ N(0.0, 1.0).\n\n >>> x = RandomRDDs.normalRDD(sc, 1000, seed=1)\n >>> stats = x.stats()\n >>> stats.count()\n 1000\n >>> abs(stats.mean() - 0.0) < 0.1\n True\n >>> abs(stats.stdev() - 1.0) < 0.1\n True\n \"\"\"\n return callMLlibFunc(\"normalRDD\", sc._jsc, size, numPartitions, seed)"} {"instruction": "Make a summary of the following Python 3 code\ndef logNormalRDD(sc, mean, std, size, numPartitions=None, seed=None):\n return callMLlibFunc(\"logNormalRDD\", sc._jsc, float(mean), float(std),\n size, numPartitions, seed)", "response": "Generates an RDD comprised of i. d. samples from the log normal distribution with the input mean and standard distribution."} {"instruction": "Create a Python 3 function for\ngenerating an RDD comprised of i. d. samples from the Exponential distribution with the input mean.", "response": "def exponentialRDD(sc, mean, size, numPartitions=None, seed=None):\n \"\"\"\n Generates an RDD comprised of i.i.d. 
samples from the Exponential\n distribution with the input mean.\n\n :param sc: SparkContext used to create the RDD.\n :param mean: Mean, or 1 / lambda, for the Exponential distribution.\n :param size: Size of the RDD.\n :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`).\n :param seed: Random seed (default: a random long integer).\n :return: RDD of float comprised of i.i.d. samples ~ Exp(mean).\n\n >>> mean = 2.0\n >>> x = RandomRDDs.exponentialRDD(sc, mean, 1000, seed=2)\n >>> stats = x.stats()\n >>> stats.count()\n 1000\n >>> abs(stats.mean() - mean) < 0.5\n True\n >>> from math import sqrt\n >>> abs(stats.stdev() - sqrt(mean)) < 0.5\n True\n \"\"\"\n return callMLlibFunc(\"exponentialRDD\", sc._jsc, float(mean), size, numPartitions, seed)"} {"instruction": "Write a Python 3 function for\ngenerating an RDD comprised of i. d. samples from the Gamma distribution with the input shape and scale and size.", "response": "def gammaRDD(sc, shape, scale, size, numPartitions=None, seed=None):\n \"\"\"\n Generates an RDD comprised of i.i.d. samples from the Gamma\n distribution with the input shape and scale.\n\n :param sc: SparkContext used to create the RDD.\n :param shape: shape (> 0) parameter for the Gamma distribution\n :param scale: scale (> 0) parameter for the Gamma distribution\n :param size: Size of the RDD.\n :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`).\n :param seed: Random seed (default: a random long integer).\n :return: RDD of float comprised of i.i.d. 
samples ~ Gamma(shape, scale).\n\n >>> from math import sqrt\n >>> shape = 1.0\n >>> scale = 2.0\n >>> expMean = shape * scale\n >>> expStd = sqrt(shape * scale * scale)\n >>> x = RandomRDDs.gammaRDD(sc, shape, scale, 1000, seed=2)\n >>> stats = x.stats()\n >>> stats.count()\n 1000\n >>> abs(stats.mean() - expMean) < 0.5\n True\n >>> abs(stats.stdev() - expStd) < 0.5\n True\n \"\"\"\n return callMLlibFunc(\"gammaRDD\", sc._jsc, float(shape),\n float(scale), size, numPartitions, seed)"} {"instruction": "Create a Python 3 function for\ngenerating an RDD comprised of vectors containing i. d samples drawn from the uniform distribution U ( 0. 0 1. 0 ).", "response": "def uniformVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None):\n \"\"\"\n Generates an RDD comprised of vectors containing i.i.d. samples drawn\n from the uniform distribution U(0.0, 1.0).\n\n :param sc: SparkContext used to create the RDD.\n :param numRows: Number of Vectors in the RDD.\n :param numCols: Number of elements in each Vector.\n :param numPartitions: Number of partitions in the RDD.\n :param seed: Seed for the RNG that generates the seed for the generator in each partition.\n :return: RDD of Vector with vectors containing i.i.d samples ~ `U(0.0, 1.0)`.\n\n >>> import numpy as np\n >>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect())\n >>> mat.shape\n (10, 10)\n >>> mat.max() <= 1.0 and mat.min() >= 0.0\n True\n >>> RandomRDDs.uniformVectorRDD(sc, 10, 10, 4).getNumPartitions()\n 4\n \"\"\"\n return callMLlibFunc(\"uniformVectorRDD\", sc._jsc, numRows, numCols, numPartitions, seed)"} {"instruction": "How would you explain what the following Python 3 function does\ndef normalVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None):\n return callMLlibFunc(\"normalVectorRDD\", sc._jsc, numRows, numCols, numPartitions, seed)", "response": "Generates an RDD comprised of vectors containing i. d. 
samples drawn from the standard normal distribution."} {"instruction": "Explain what the following Python 3 code does\ndef logNormalVectorRDD(sc, mean, std, numRows, numCols, numPartitions=None, seed=None):\n return callMLlibFunc(\"logNormalVectorRDD\", sc._jsc, float(mean), float(std),\n numRows, numCols, numPartitions, seed)", "response": "Generates an RDD comprised of vectors drawn from the log normal distribution."} {"instruction": "Write a Python 3 function for\ngenerating an RDD comprised of vectors drawn from the Poisson distribution with the input mean.", "response": "def poissonVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None):\n \"\"\"\n Generates an RDD comprised of vectors containing i.i.d. samples drawn\n from the Poisson distribution with the input mean.\n\n :param sc: SparkContext used to create the RDD.\n :param mean: Mean, or lambda, for the Poisson distribution.\n :param numRows: Number of Vectors in the RDD.\n :param numCols: Number of elements in each Vector.\n :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`)\n :param seed: Random seed (default: a random long integer).\n :return: RDD of Vector with vectors containing i.i.d. samples ~ Pois(mean).\n\n >>> import numpy as np\n >>> mean = 100.0\n >>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1)\n >>> mat = np.mat(rdd.collect())\n >>> mat.shape\n (100, 100)\n >>> abs(mat.mean() - mean) < 0.5\n True\n >>> from math import sqrt\n >>> abs(mat.std() - sqrt(mean)) < 0.5\n True\n \"\"\"\n return callMLlibFunc(\"poissonVectorRDD\", sc._jsc, float(mean), numRows, numCols,\n numPartitions, seed)"} {"instruction": "Write a Python 3 script to\ngenerate an RDD comprised of vectors containing i. d. samples drawn from the Gamma distribution.", "response": "def gammaVectorRDD(sc, shape, scale, numRows, numCols, numPartitions=None, seed=None):\n \"\"\"\n Generates an RDD comprised of vectors containing i.i.d. 
samples drawn\n from the Gamma distribution.\n\n :param sc: SparkContext used to create the RDD.\n :param shape: Shape (> 0) of the Gamma distribution\n :param scale: Scale (> 0) of the Gamma distribution\n :param numRows: Number of Vectors in the RDD.\n :param numCols: Number of elements in each Vector.\n :param numPartitions: Number of partitions in the RDD (default: `sc.defaultParallelism`).\n :param seed: Random seed (default: a random long integer).\n :return: RDD of Vector with vectors containing i.i.d. samples ~ Gamma(shape, scale).\n\n >>> import numpy as np\n >>> from math import sqrt\n >>> shape = 1.0\n >>> scale = 2.0\n >>> expMean = shape * scale\n >>> expStd = sqrt(shape * scale * scale)\n >>> mat = np.matrix(RandomRDDs.gammaVectorRDD(sc, shape, scale, 100, 100, seed=1).collect())\n >>> mat.shape\n (100, 100)\n >>> abs(mat.mean() - expMean) < 0.1\n True\n >>> abs(mat.std() - expStd) < 0.1\n True\n \"\"\"\n return callMLlibFunc(\"gammaVectorRDD\", sc._jsc, float(shape), float(scale),\n numRows, numCols, numPartitions, seed)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef getActiveSession(cls):\n from pyspark import SparkContext\n sc = SparkContext._active_spark_context\n if sc is None:\n return None\n else:\n if sc._jvm.SparkSession.getActiveSession().isDefined():\n SparkSession(sc, sc._jvm.SparkSession.getActiveSession().get())\n return SparkSession._activeSession\n else:\n return None", "response": "Returns the active SparkSession for the current thread."} {"instruction": "Can you create a Python 3 function that\ninterfaces through which the user may create drop alter or query underlying databases tables functions etc.", "response": "def catalog(self):\n \"\"\"Interface through which the user may create, drop, alter or query underlying\n databases, tables, functions etc.\n\n :return: :class:`Catalog`\n \"\"\"\n from pyspark.sql.catalog import Catalog\n if not hasattr(self, \"_catalog\"):\n self._catalog = 
Catalog(self)\n return self._catalog"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef range(self, start, end=None, step=1, numPartitions=None):\n if numPartitions is None:\n numPartitions = self._sc.defaultParallelism\n\n if end is None:\n jdf = self._jsparkSession.range(0, int(start), int(step), int(numPartitions))\n else:\n jdf = self._jsparkSession.range(int(start), int(end), int(step), int(numPartitions))\n\n return DataFrame(jdf, self._wrapped)", "response": "Create a DataFrame with a single column named\n containing elements in a range from start to end."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _inferSchemaFromList(self, data, names=None):\n if not data:\n raise ValueError(\"can not infer schema from empty dataset\")\n first = data[0]\n if type(first) is dict:\n warnings.warn(\"inferring schema from dict is deprecated,\"\n \"please use pyspark.sql.Row instead\")\n schema = reduce(_merge_type, (_infer_schema(row, names) for row in data))\n if _has_nulltype(schema):\n raise ValueError(\"Some of types cannot be determined after inferring\")\n return schema", "response": "Infer schema from list of Row or tuple."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _inferSchema(self, rdd, samplingRatio=None, names=None):\n first = rdd.first()\n if not first:\n raise ValueError(\"The first row in RDD is empty, \"\n \"can not infer schema\")\n if type(first) is dict:\n warnings.warn(\"Using RDD of dict to inferSchema is deprecated. 
\"\n \"Use pyspark.sql.Row instead\")\n\n if samplingRatio is None:\n schema = _infer_schema(first, names=names)\n if _has_nulltype(schema):\n for row in rdd.take(100)[1:]:\n schema = _merge_type(schema, _infer_schema(row, names=names))\n if not _has_nulltype(schema):\n break\n else:\n raise ValueError(\"Some of types cannot be determined by the \"\n \"first 100 rows, please try again with sampling\")\n else:\n if samplingRatio < 0.99:\n rdd = rdd.sample(False, float(samplingRatio))\n schema = rdd.map(lambda row: _infer_schema(row, names)).reduce(_merge_type)\n return schema", "response": "Infer schema from an RDD of Row or tuple."} {"instruction": "Implement a Python 3 function for\ncreating an RDD for DataFrame from an existing RDD returns the RDD and schema.", "response": "def _createFromRDD(self, rdd, schema, samplingRatio):\n \"\"\"\n Create an RDD for DataFrame from an existing RDD, returns the RDD and schema.\n \"\"\"\n if schema is None or isinstance(schema, (list, tuple)):\n struct = self._inferSchema(rdd, samplingRatio, names=schema)\n converter = _create_converter(struct)\n rdd = rdd.map(converter)\n if isinstance(schema, (list, tuple)):\n for i, name in enumerate(schema):\n struct.fields[i].name = name\n struct.names[i] = name\n schema = struct\n\n elif not isinstance(schema, StructType):\n raise TypeError(\"schema should be StructType or list or None, but got: %s\" % schema)\n\n # convert python objects to sql data\n rdd = rdd.map(schema.toInternal)\n return rdd, schema"} {"instruction": "Create a Python 3 function for\ncreating an RDD for DataFrame from a list or pandas. 
DataFrame returns the RDD and schema.", "response": "def _createFromLocal(self, data, schema):\n \"\"\"\n Create an RDD for DataFrame from a list or pandas.DataFrame, returns\n the RDD and schema.\n \"\"\"\n # make sure data could consumed multiple times\n if not isinstance(data, list):\n data = list(data)\n\n if schema is None or isinstance(schema, (list, tuple)):\n struct = self._inferSchemaFromList(data, names=schema)\n converter = _create_converter(struct)\n data = map(converter, data)\n if isinstance(schema, (list, tuple)):\n for i, name in enumerate(schema):\n struct.fields[i].name = name\n struct.names[i] = name\n schema = struct\n\n elif not isinstance(schema, StructType):\n raise TypeError(\"schema should be StructType or list or None, but got: %s\" % schema)\n\n # convert python objects to sql data\n data = [schema.toInternal(row) for row in data]\n return self._sc.parallelize(data), schema"} {"instruction": "Explain what the following Python 3 code does\ndef _get_numpy_record_dtype(self, rec):\n import numpy as np\n cur_dtypes = rec.dtype\n col_names = cur_dtypes.names\n record_type_list = []\n has_rec_fix = False\n for i in xrange(len(cur_dtypes)):\n curr_type = cur_dtypes[i]\n # If type is a datetime64 timestamp, convert to microseconds\n # NOTE: if dtype is datetime[ns] then np.record.tolist() will output values as longs,\n # conversion from [us] or lower will lead to py datetime objects, see SPARK-22417\n if curr_type == np.dtype('datetime64[ns]'):\n curr_type = 'datetime64[us]'\n has_rec_fix = True\n record_type_list.append((str(col_names[i]), curr_type))\n return np.dtype(record_type_list) if has_rec_fix else None", "response": "Returns a numpy. 
record dtype for a record."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _convert_from_pandas(self, pdf, schema, timezone):\n if timezone is not None:\n from pyspark.sql.types import _check_series_convert_timestamps_tz_local\n copied = False\n if isinstance(schema, StructType):\n for field in schema:\n # TODO: handle nested timestamps, such as ArrayType(TimestampType())?\n if isinstance(field.dataType, TimestampType):\n s = _check_series_convert_timestamps_tz_local(pdf[field.name], timezone)\n if s is not pdf[field.name]:\n if not copied:\n # Copy once if the series is modified to prevent the original\n # Pandas DataFrame from being updated\n pdf = pdf.copy()\n copied = True\n pdf[field.name] = s\n else:\n for column, series in pdf.iteritems():\n s = _check_series_convert_timestamps_tz_local(series, timezone)\n if s is not series:\n if not copied:\n # Copy once if the series is modified to prevent the original\n # Pandas DataFrame from being updated\n pdf = pdf.copy()\n copied = True\n pdf[column] = s\n\n # Convert pandas.DataFrame to list of numpy records\n np_records = pdf.to_records(index=False)\n\n # Check if any columns need to be fixed for Spark to infer properly\n if len(np_records) > 0:\n record_dtype = self._get_numpy_record_dtype(np_records[0])\n if record_dtype is not None:\n return [r.astype(record_dtype).tolist() for r in np_records]\n\n # Convert list of numpy records to python lists\n return [r.tolist() for r in np_records]", "response": "Convert a pandas. 
DataFrame to list of records that can be used to make a DataFrame\n "} {"instruction": "Explain what the following Python 3 code does\ndef _create_from_pandas_with_arrow(self, pdf, schema, timezone):\n from pyspark.serializers import ArrowStreamPandasSerializer\n from pyspark.sql.types import from_arrow_type, to_arrow_type, TimestampType\n from pyspark.sql.utils import require_minimum_pandas_version, \\\n require_minimum_pyarrow_version\n\n require_minimum_pandas_version()\n require_minimum_pyarrow_version()\n\n from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype\n import pyarrow as pa\n\n # Create the Spark schema from list of names passed in with Arrow types\n if isinstance(schema, (list, tuple)):\n arrow_schema = pa.Schema.from_pandas(pdf, preserve_index=False)\n struct = StructType()\n for name, field in zip(schema, arrow_schema):\n struct.add(name, from_arrow_type(field.type), nullable=field.nullable)\n schema = struct\n\n # Determine arrow types to coerce data when creating batches\n if isinstance(schema, StructType):\n arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]\n elif isinstance(schema, DataType):\n raise ValueError(\"Single data type %s is not supported with Arrow\" % str(schema))\n else:\n # Any timestamps must be coerced to be compatible with Spark\n arrow_types = [to_arrow_type(TimestampType())\n if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None\n for t in pdf.dtypes]\n\n # Slice the DataFrame to be batched\n step = -(-len(pdf) // self.sparkContext.defaultParallelism) # round int up\n pdf_slices = (pdf[start:start + step] for start in xrange(0, len(pdf), step))\n\n # Create list of Arrow (columns, type) for serializer dump_stream\n arrow_data = [[(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]\n for pdf_slice in pdf_slices]\n\n jsqlContext = self._wrapped._jsqlContext\n\n safecheck = self._wrapped._conf.arrowSafeTypeConversion()\n col_by_name = True # col by name only applies to 
StructType columns, can't happen here\n ser = ArrowStreamPandasSerializer(timezone, safecheck, col_by_name)\n\n def reader_func(temp_filename):\n return self._jvm.PythonSQLUtils.readArrowStreamFromFile(jsqlContext, temp_filename)\n\n def create_RDD_server():\n return self._jvm.ArrowRDDServer(jsqlContext)\n\n # Create Spark DataFrame from Arrow stream file, using one batch per partition\n jrdd = self._sc._serialize_to_jvm(arrow_data, ser, reader_func, create_RDD_server)\n jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(), jsqlContext)\n df = DataFrame(jdf, self._wrapped)\n df._schema = schema\n return df", "response": "Create a DataFrame from a given pandas. DataFrame by slicing it into partitions converting it into Arrow data then sending to the JVM to parallelize."} {"instruction": "Write a Python 3 script for\ninitializing a SparkSession for a pyspark shell session.", "response": "def _create_shell_session():\n \"\"\"\n Initialize a SparkSession for a pyspark shell session. 
This is called from shell.py\n to make error handling simpler without needing to declare local variables in that\n script, which would expose those to users.\n \"\"\"\n import py4j\n from pyspark.conf import SparkConf\n from pyspark.context import SparkContext\n try:\n # Try to access HiveConf, it will raise exception if Hive is not added\n conf = SparkConf()\n if conf.get('spark.sql.catalogImplementation', 'hive').lower() == 'hive':\n SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()\n return SparkSession.builder\\\n .enableHiveSupport()\\\n .getOrCreate()\n else:\n return SparkSession.builder.getOrCreate()\n except (py4j.protocol.Py4JError, TypeError):\n if conf.get('spark.sql.catalogImplementation', '').lower() == 'hive':\n warnings.warn(\"Fall back to non-hive support because failing to access HiveConf, \"\n \"please make sure you build spark with hive\")\n\n return SparkSession.builder.getOrCreate()"} {"instruction": "Explain what the following Python 3 code does\ndef createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True):\n SparkSession._activeSession = self\n self._jvm.SparkSession.setActiveSession(self._jsparkSession)\n if isinstance(data, DataFrame):\n raise TypeError(\"data is already a DataFrame\")\n\n if isinstance(schema, basestring):\n schema = _parse_datatype_string(schema)\n elif isinstance(schema, (list, tuple)):\n # Must re-encode any unicode strings to be consistent with StructField names\n schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema]\n\n try:\n import pandas\n has_pandas = True\n except Exception:\n has_pandas = False\n if has_pandas and isinstance(data, pandas.DataFrame):\n from pyspark.sql.utils import require_minimum_pandas_version\n require_minimum_pandas_version()\n\n if self._wrapped._conf.pandasRespectSessionTimeZone():\n timezone = self._wrapped._conf.sessionLocalTimeZone()\n else:\n timezone = None\n\n # If no schema supplied by user then get the names of columns only\n if 
schema is None:\n schema = [str(x) if not isinstance(x, basestring) else\n (x.encode('utf-8') if not isinstance(x, str) else x)\n for x in data.columns]\n\n if self._wrapped._conf.arrowEnabled() and len(data) > 0:\n try:\n return self._create_from_pandas_with_arrow(data, schema, timezone)\n except Exception as e:\n from pyspark.util import _exception_message\n\n if self._wrapped._conf.arrowFallbackEnabled():\n msg = (\n \"createDataFrame attempted Arrow optimization because \"\n \"'spark.sql.execution.arrow.enabled' is set to true; however, \"\n \"failed by the reason below:\\n %s\\n\"\n \"Attempting non-optimization as \"\n \"'spark.sql.execution.arrow.fallback.enabled' is set to \"\n \"true.\" % _exception_message(e))\n warnings.warn(msg)\n else:\n msg = (\n \"createDataFrame attempted Arrow optimization because \"\n \"'spark.sql.execution.arrow.enabled' is set to true, but has reached \"\n \"the error below and will not continue because automatic fallback \"\n \"with 'spark.sql.execution.arrow.fallback.enabled' has been set to \"\n \"false.\\n %s\" % _exception_message(e))\n warnings.warn(msg)\n raise\n data = self._convert_from_pandas(data, schema, timezone)\n\n if isinstance(schema, StructType):\n verify_func = _make_type_verifier(schema) if verifySchema else lambda _: True\n\n def prepare(obj):\n verify_func(obj)\n return obj\n elif isinstance(schema, DataType):\n dataType = schema\n schema = StructType().add(\"value\", schema)\n\n verify_func = _make_type_verifier(\n dataType, name=\"field value\") if verifySchema else lambda _: True\n\n def prepare(obj):\n verify_func(obj)\n return obj,\n else:\n prepare = lambda obj: obj\n\n if isinstance(data, RDD):\n rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)\n else:\n rdd, schema = self._createFromLocal(map(prepare, data), schema)\n jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())\n jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), schema.json())\n df = 
DataFrame(jdf, self._wrapped)\n df._schema = schema\n return df", "response": "Creates a DataFrame from an RDD of data."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef sql(self, sqlQuery):\n return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)", "response": "Returns a DataFrame representing the result of the given SQL query."} {"instruction": "Make a summary of the following Python 3 code\ndef table(self, tableName):\n return DataFrame(self._jsparkSession.table(tableName), self._wrapped)", "response": "Returns the specified table as a DataFrame."} {"instruction": "Write a Python 3 function that can\nreturn a new StreamingQueryManager that allows managing all the : class:`StreamingQuery` StreamingQueries active on this context.", "response": "def streams(self):\n \"\"\"Returns a :class:`StreamingQueryManager` that allows managing all the\n :class:`StreamingQuery` StreamingQueries active on `this` context.\n\n .. note:: Evolving.\n\n :return: :class:`StreamingQueryManager`\n \"\"\"\n from pyspark.sql.streaming import StreamingQueryManager\n return StreamingQueryManager(self._jsparkSession.streams())"} {"instruction": "Given the following Python 3 function, write the documentation\ndef stop(self):\n self._sc.stop()\n # We should clean the default session up. 
See SPARK-23228.\n self._jvm.SparkSession.clearDefaultSession()\n self._jvm.SparkSession.clearActiveSession()\n SparkSession._instantiatedSession = None\n SparkSession._activeSession = None", "response": "Stop the underlying SparkContext."} {"instruction": "Can you implement a function in Python 3 that\nreturns a SparkJobInfo object for the given job id.", "response": "def getJobInfo(self, jobId):\n \"\"\"\n Returns a :class:`SparkJobInfo` object, or None if the job info\n could not be found or was garbage collected.\n \"\"\"\n job = self._jtracker.getJobInfo(jobId)\n if job is not None:\n return SparkJobInfo(jobId, job.stageIds(), str(job.status()))"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef getStageInfo(self, stageId):\n stage = self._jtracker.getStageInfo(stageId)\n if stage is not None:\n # TODO: fetch them in batch for better performance\n attrs = [getattr(stage, f)() for f in SparkStageInfo._fields[1:]]\n return SparkStageInfo(stageId, *attrs)", "response": "Returns a SparkStageInfo object for the given stage."} {"instruction": "Can you implement a function in Python 3 that\nrestores an object of namedtuple", "response": "def _restore(name, fields, value):\n \"\"\" Restore an object of namedtuple\"\"\"\n k = (name, fields)\n cls = __cls.get(k)\n if cls is None:\n cls = collections.namedtuple(name, fields)\n __cls[k] = cls\n return cls(*value)"} {"instruction": "Write a Python 3 script to\nmake class generated by namedtuple picklable", "response": "def _hack_namedtuple(cls):\n \"\"\" Make class generated by namedtuple picklable \"\"\"\n name = cls.__name__\n fields = cls._fields\n\n def __reduce__(self):\n return (_restore, (name, fields, tuple(self)))\n cls.__reduce__ = __reduce__\n cls._is_namedtuple_ = True\n return cls"} {"instruction": "How would you code a function in Python 3 to\nhijack a namedtuple function to make it picklable", "response": "def _hijack_namedtuple():\n \"\"\" Hack namedtuple() to make it 
picklable \"\"\"\n # hijack only one time\n if hasattr(collections.namedtuple, \"__hijack\"):\n return\n\n global _old_namedtuple # or it will put in closure\n global _old_namedtuple_kwdefaults # or it will put in closure too\n\n def _copy_func(f):\n return types.FunctionType(f.__code__, f.__globals__, f.__name__,\n f.__defaults__, f.__closure__)\n\n def _kwdefaults(f):\n # __kwdefaults__ contains the default values of keyword-only arguments which are\n # introduced from Python 3. The possible cases for __kwdefaults__ in namedtuple\n # are as below:\n #\n # - Does not exist in Python 2.\n # - Returns None in <= Python 3.5.x.\n # - Returns a dictionary containing the default values to the keys from Python 3.6.x\n # (See https://bugs.python.org/issue25628).\n kargs = getattr(f, \"__kwdefaults__\", None)\n if kargs is None:\n return {}\n else:\n return kargs\n\n _old_namedtuple = _copy_func(collections.namedtuple)\n _old_namedtuple_kwdefaults = _kwdefaults(collections.namedtuple)\n\n def namedtuple(*args, **kwargs):\n for k, v in _old_namedtuple_kwdefaults.items():\n kwargs[k] = kwargs.get(k, v)\n cls = _old_namedtuple(*args, **kwargs)\n return _hack_namedtuple(cls)\n\n # replace namedtuple with the new one\n collections.namedtuple.__globals__[\"_old_namedtuple_kwdefaults\"] = _old_namedtuple_kwdefaults\n collections.namedtuple.__globals__[\"_old_namedtuple\"] = _old_namedtuple\n collections.namedtuple.__globals__[\"_hack_namedtuple\"] = _hack_namedtuple\n collections.namedtuple.__code__ = namedtuple.__code__\n collections.namedtuple.__hijack = 1\n\n # hack the cls already generated by namedtuple.\n # Those created in other modules can be pickled as normal,\n # so only hack those in __main__ module\n for n, o in sys.modules[\"__main__\"].__dict__.items():\n if (type(o) is type and o.__base__ is tuple\n and hasattr(o, \"_fields\")\n and \"__reduce__\" not in o.__dict__):\n _hack_namedtuple(o)"} {"instruction": "Can you tell what is the following Python 3 function 
doing\ndef load_stream(self, stream):\n # load the batches\n for batch in self.serializer.load_stream(stream):\n yield batch\n\n # load the batch order indices\n num = read_int(stream)\n batch_order = []\n for i in xrange(num):\n index = read_int(stream)\n batch_order.append(index)\n yield batch_order", "response": "Load a stream of un - ordered Arrow RecordBatches and yield the list of indices that can be used to put the RecordBatches in the correct order."} {"instruction": "Explain what the following Python 3 code does\ndef _create_batch(self, series):\n import pandas as pd\n import pyarrow as pa\n from pyspark.sql.types import _check_series_convert_timestamps_internal\n # Make input conform to [(series1, type1), (series2, type2), ...]\n if not isinstance(series, (list, tuple)) or \\\n (len(series) == 2 and isinstance(series[1], pa.DataType)):\n series = [series]\n series = ((s, None) if not isinstance(s, (list, tuple)) else s for s in series)\n\n def create_array(s, t):\n mask = s.isnull()\n # Ensure timestamp series are in expected form for Spark internal representation\n if t is not None and pa.types.is_timestamp(t):\n s = _check_series_convert_timestamps_internal(s.fillna(0), self._timezone)\n # TODO: need cast after Arrow conversion, ns values cause error with pandas 0.19.2\n return pa.Array.from_pandas(s, mask=mask).cast(t, safe=False)\n\n try:\n array = pa.Array.from_pandas(s, mask=mask, type=t, safe=self._safecheck)\n except pa.ArrowException as e:\n error_msg = \"Exception thrown when converting pandas.Series (%s) to Arrow \" + \\\n \"Array (%s). It can be caused by overflows or other unsafe \" + \\\n \"conversions warned by Arrow. 
Arrow safe type check can be \" + \\\n \"disabled by using SQL config \" + \\\n \"`spark.sql.execution.pandas.arrowSafeTypeConversion`.\"\n raise RuntimeError(error_msg % (s.dtype, t), e)\n return array\n\n arrs = []\n for s, t in series:\n if t is not None and pa.types.is_struct(t):\n if not isinstance(s, pd.DataFrame):\n raise ValueError(\"A field of type StructType expects a pandas.DataFrame, \"\n \"but got: %s\" % str(type(s)))\n\n # Input partition and result pandas.DataFrame empty, make empty Arrays with struct\n if len(s) == 0 and len(s.columns) == 0:\n arrs_names = [(pa.array([], type=field.type), field.name) for field in t]\n # Assign result columns by schema name if user labeled with strings\n elif self._assign_cols_by_name and any(isinstance(name, basestring)\n for name in s.columns):\n arrs_names = [(create_array(s[field.name], field.type), field.name)\n for field in t]\n # Assign result columns by position\n else:\n arrs_names = [(create_array(s[s.columns[i]], field.type), field.name)\n for i, field in enumerate(t)]\n\n struct_arrs, struct_names = zip(*arrs_names)\n arrs.append(pa.StructArray.from_arrays(struct_arrs, struct_names))\n else:\n arrs.append(create_array(s, t))\n\n return pa.RecordBatch.from_arrays(arrs, [\"_%d\" % i for i in xrange(len(arrs))])", "response": "Create an Arrow record batch from the given pandas. 
Series or list of Series or list of ArrowRecordBatches."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef dump_stream(self, iterator, stream):\n batches = (self._create_batch(series) for series in iterator)\n super(ArrowStreamPandasSerializer, self).dump_stream(batches, stream)", "response": "Make ArrowRecordBatches from Pandas Series and serialize."} {"instruction": "Write a Python 3 script for\ndumping a series to a stream.", "response": "def dump_stream(self, iterator, stream):\n \"\"\"\n Override because Pandas UDFs require a START_ARROW_STREAM before the Arrow stream is sent.\n This should be sent after creating the first record batch so in case of an error, it can\n be sent back to the JVM before the Arrow stream starts.\n \"\"\"\n\n def init_stream_yield_batches():\n should_write_start_length = True\n for series in iterator:\n batch = self._create_batch(series)\n if should_write_start_length:\n write_int(SpecialLengths.START_ARROW_STREAM, stream)\n should_write_start_length = False\n yield batch\n\n return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream)"} {"instruction": "Make a summary of the following Python 3 code\ndef awaitTermination(self, timeout=None):\n if timeout is not None:\n if not isinstance(timeout, (int, float)) or timeout < 0:\n raise ValueError(\"timeout must be a positive integer or float. 
Got %s\" % timeout)\n return self._jsq.awaitTermination(int(timeout * 1000))\n else:\n return self._jsq.awaitTermination()", "response": "Waits for the termination of this query or by an exception."} {"instruction": "Implement a function in Python 3 to\nreturn an array of the most recent [[ StreamingQueryProgress ]] updates for this query.", "response": "def recentProgress(self):\n \"\"\"Returns an array of the most recent [[StreamingQueryProgress]] updates for this query.\n The number of progress updates retained for each stream is configured by Spark session\n configuration `spark.sql.streaming.numRecentProgressUpdates`.\n \"\"\"\n return [json.loads(p.json()) for p in self._jsq.recentProgress()]"} {"instruction": "How would you implement a function in Python 3 that\nreturns the most recent streaming query progress update or None if there were no progress updates", "response": "def lastProgress(self):\n \"\"\"\n Returns the most recent :class:`StreamingQueryProgress` update of this streaming query or\n None if there were no progress updates\n :return: a map\n \"\"\"\n lastProgress = self._jsq.lastProgress()\n if lastProgress:\n return json.loads(lastProgress.json())\n else:\n return None"} {"instruction": "Explain what the following Python 3 code does\ndef exception(self):\n if self._jsq.exception().isDefined():\n je = self._jsq.exception().get()\n msg = je.toString().split(': ', 1)[1] # Drop the Java StreamingQueryException type info\n stackTrace = '\\n\\t at '.join(map(lambda x: x.toString(), je.getStackTrace()))\n return StreamingQueryException(msg, stackTrace, je.getCause())\n else:\n return None", "response": "Returns the StreamingQueryException object if the query was terminated by an exception or None."} {"instruction": "Given the following Python 3 function, write the documentation\ndef awaitAnyTermination(self, timeout=None):\n if timeout is not None:\n if not isinstance(timeout, (int, float)) or timeout < 0:\n raise ValueError(\"timeout must be a 
positive integer or float. Got %s\" % timeout)\n return self._jsqm.awaitAnyTermination(int(timeout * 1000))\n else:\n return self._jsqm.awaitAnyTermination()", "response": "Wait until any of the queries on the associated SQLContext has terminated or until the timeout is reached."} {"instruction": "Can you write a function in Python 3 where it\nloads a data stream from a file - system backed data source and returns it as a : class : DataFrame.", "response": "def load(self, path=None, format=None, schema=None, **options):\n \"\"\"Loads a data stream from a data source and returns it as a :class`DataFrame`.\n\n .. note:: Evolving.\n\n :param path: optional string for file-system backed data sources.\n :param format: optional string for format of the data source. Default to 'parquet'.\n :param schema: optional :class:`pyspark.sql.types.StructType` for the input schema\n or a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``).\n :param options: all other string options\n\n >>> json_sdf = spark.readStream.format(\"json\") \\\\\n ... .schema(sdf_schema) \\\\\n ... .load(tempfile.mkdtemp())\n >>> json_sdf.isStreaming\n True\n >>> json_sdf.schema == sdf_schema\n True\n \"\"\"\n if format is not None:\n self.format(format)\n if schema is not None:\n self.schema(schema)\n self.options(**options)\n if path is not None:\n if type(path) != str or len(path.strip()) == 0:\n raise ValueError(\"If the path is provided for stream, it needs to be a \" +\n \"non-empty string. 
List of paths are not supported.\")\n return self._df(self._jreader.load(path))\n else:\n return self._df(self._jreader.load())"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,\n allowComments=None, allowUnquotedFieldNames=None, allowSingleQuotes=None,\n allowNumericLeadingZero=None, allowBackslashEscapingAnyCharacter=None,\n mode=None, columnNameOfCorruptRecord=None, dateFormat=None, timestampFormat=None,\n multiLine=None, allowUnquotedControlChars=None, lineSep=None, locale=None,\n dropFieldIfAllNull=None, encoding=None):\n \"\"\"\n Loads a JSON file stream and returns the results as a :class:`DataFrame`.\n\n `JSON Lines `_ (newline-delimited JSON) is supported by default.\n For JSON (one record per file), set the ``multiLine`` parameter to ``true``.\n\n If the ``schema`` parameter is not specified, this function goes\n through the input once to determine the input schema.\n\n .. note:: Evolving.\n\n :param path: string represents path to the JSON dataset,\n or RDD of Strings storing JSON objects.\n :param schema: an optional :class:`pyspark.sql.types.StructType` for the input schema\n or a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``).\n :param primitivesAsString: infers all primitive values as a string type. If None is set,\n it uses the default value, ``false``.\n :param prefersDecimal: infers all floating-point values as a decimal type. If the values\n do not fit in decimal, then it infers them as doubles. If None is\n set, it uses the default value, ``false``.\n :param allowComments: ignores Java/C++ style comment in JSON records. If None is set,\n it uses the default value, ``false``.\n :param allowUnquotedFieldNames: allows unquoted JSON field names. If None is set,\n it uses the default value, ``false``.\n :param allowSingleQuotes: allows single quotes in addition to double quotes. 
If None is\n set, it uses the default value, ``true``.\n :param allowNumericLeadingZero: allows leading zeros in numbers (e.g. 00012). If None is\n set, it uses the default value, ``false``.\n :param allowBackslashEscapingAnyCharacter: allows accepting quoting of all character\n using backslash quoting mechanism. If None is\n set, it uses the default value, ``false``.\n :param mode: allows a mode for dealing with corrupt records during parsing. If None is\n set, it uses the default value, ``PERMISSIVE``.\n\n * ``PERMISSIVE`` : when it meets a corrupted record, puts the malformed string \\\n into a field configured by ``columnNameOfCorruptRecord``, and sets malformed \\\n fields to ``null``. To keep corrupt records, an user can set a string type \\\n field named ``columnNameOfCorruptRecord`` in an user-defined schema. If a \\\n schema does not have the field, it drops corrupt records during parsing. \\\n When inferring a schema, it implicitly adds a ``columnNameOfCorruptRecord`` \\\n field in an output schema.\n * ``DROPMALFORMED`` : ignores the whole corrupted records.\n * ``FAILFAST`` : throws an exception when it meets corrupted records.\n\n :param columnNameOfCorruptRecord: allows renaming the new field having malformed string\n created by ``PERMISSIVE`` mode. This overrides\n ``spark.sql.columnNameOfCorruptRecord``. If None is set,\n it uses the value specified in\n ``spark.sql.columnNameOfCorruptRecord``.\n :param dateFormat: sets the string that indicates a date format. Custom date formats\n follow the formats at ``java.time.format.DateTimeFormatter``. This\n applies to date type. If None is set, it uses the\n default value, ``yyyy-MM-dd``.\n :param timestampFormat: sets the string that indicates a timestamp format.\n Custom date formats follow the formats at\n ``java.time.format.DateTimeFormatter``.\n This applies to timestamp type. 
If None is set, it uses the\n default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSXXX``.\n :param multiLine: parse one record, which may span multiple lines, per file. If None is\n set, it uses the default value, ``false``.\n :param allowUnquotedControlChars: allows JSON Strings to contain unquoted control\n characters (ASCII characters with value less than 32,\n including tab and line feed characters) or not.\n :param lineSep: defines the line separator that should be used for parsing. If None is\n set, it covers all ``\\\\r``, ``\\\\r\\\\n`` and ``\\\\n``.\n :param locale: sets a locale as language tag in IETF BCP 47 format. If None is set,\n it uses the default value, ``en-US``. For instance, ``locale`` is used while\n parsing dates and timestamps.\n :param dropFieldIfAllNull: whether to ignore column of all null values or empty\n array/struct during schema inference. If None is set, it\n uses the default value, ``false``.\n :param encoding: allows to forcibly set one of standard basic or extended encoding for\n the JSON files. For example UTF-16BE, UTF-32LE. 
If None is set,\n the encoding of input JSON will be detected automatically\n when the multiLine option is set to ``true``.\n\n >>> json_sdf = spark.readStream.json(tempfile.mkdtemp(), schema = sdf_schema)\n >>> json_sdf.isStreaming\n True\n >>> json_sdf.schema == sdf_schema\n True\n \"\"\"\n self._set_opts(\n schema=schema, primitivesAsString=primitivesAsString, prefersDecimal=prefersDecimal,\n allowComments=allowComments, allowUnquotedFieldNames=allowUnquotedFieldNames,\n allowSingleQuotes=allowSingleQuotes, allowNumericLeadingZero=allowNumericLeadingZero,\n allowBackslashEscapingAnyCharacter=allowBackslashEscapingAnyCharacter,\n mode=mode, columnNameOfCorruptRecord=columnNameOfCorruptRecord, dateFormat=dateFormat,\n timestampFormat=timestampFormat, multiLine=multiLine,\n allowUnquotedControlChars=allowUnquotedControlChars, lineSep=lineSep, locale=locale,\n dropFieldIfAllNull=dropFieldIfAllNull, encoding=encoding)\n if isinstance(path, basestring):\n return self._df(self._jreader.json(path))\n else:\n raise TypeError(\"path can be only a single string\")", "response": "Load a JSON file and return a DataFrame."} {"instruction": "Write a Python 3 function for\nloading an ORC file stream returning the result as a : class : DataFrame.", "response": "def orc(self, path):\n \"\"\"Loads a ORC file stream, returning the result as a :class:`DataFrame`.\n\n .. 
note:: Evolving.\n\n >>> orc_sdf = spark.readStream.schema(sdf_schema).orc(tempfile.mkdtemp())\n >>> orc_sdf.isStreaming\n True\n >>> orc_sdf.schema == sdf_schema\n True\n \"\"\"\n if isinstance(path, basestring):\n return self._df(self._jreader.orc(path))\n else:\n raise TypeError(\"path can be only a single string\")"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef parquet(self, path):\n if isinstance(path, basestring):\n return self._df(self._jreader.parquet(path))\n else:\n raise TypeError(\"path can be only a single string\")", "response": "Loads a Parquet file stream returning the result as a DataFrame."} {"instruction": "How would you code a function in Python 3 to\nload a text file stream and returns a DataFrame containing the data for the specified key.", "response": "def text(self, path, wholetext=False, lineSep=None):\n \"\"\"\n Loads a text file stream and returns a :class:`DataFrame` whose schema starts with a\n string column named \"value\", and followed by partitioned columns if there\n are any.\n The text files must be encoded as UTF-8.\n\n By default, each line in the text file is a new row in the resulting DataFrame.\n\n .. note:: Evolving.\n\n :param paths: string, or list of strings, for input path(s).\n :param wholetext: if true, read each file from input path(s) as a single row.\n :param lineSep: defines the line separator that should be used for parsing. 
If None is\n set, it covers all ``\\\\r``, ``\\\\r\\\\n`` and ``\\\\n``.\n\n >>> text_sdf = spark.readStream.text(tempfile.mkdtemp())\n >>> text_sdf.isStreaming\n True\n >>> \"value\" in str(text_sdf.schema)\n True\n \"\"\"\n self._set_opts(wholetext=wholetext, lineSep=lineSep)\n if isinstance(path, basestring):\n return self._df(self._jreader.text(path))\n else:\n raise TypeError(\"path can be only a single string\")"} {"instruction": "Can you write a function in Python 3 where it\nloads a CSV file stream and returns a DataFrame.", "response": "def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=None,\n comment=None, header=None, inferSchema=None, ignoreLeadingWhiteSpace=None,\n ignoreTrailingWhiteSpace=None, nullValue=None, nanValue=None, positiveInf=None,\n negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None,\n maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None,\n columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None,\n enforceSchema=None, emptyValue=None, locale=None, lineSep=None):\n r\"\"\"Loads a CSV file stream and returns the result as a :class:`DataFrame`.\n\n This function will go through the input once to determine the input schema if\n ``inferSchema`` is enabled. To avoid going through the entire data once, disable\n ``inferSchema`` option or specify the schema explicitly using ``schema``.\n\n .. note:: Evolving.\n\n :param path: string, or list of strings, for input path(s).\n :param schema: an optional :class:`pyspark.sql.types.StructType` for the input schema\n or a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``).\n :param sep: sets a single character as a separator for each field and value.\n If None is set, it uses the default value, ``,``.\n :param encoding: decodes the CSV files by the given encoding type. 
If None is set,\n it uses the default value, ``UTF-8``.\n :param quote: sets a single character used for escaping quoted values where the\n separator can be part of the value. If None is set, it uses the default\n value, ``\"``. If you would like to turn off quotations, you need to set an\n empty string.\n :param escape: sets a single character used for escaping quotes inside an already\n quoted value. If None is set, it uses the default value, ``\\``.\n :param comment: sets a single character used for skipping lines beginning with this\n character. By default (None), it is disabled.\n :param header: uses the first line as names of columns. If None is set, it uses the\n default value, ``false``.\n :param inferSchema: infers the input schema automatically from data. It requires one extra\n pass over the data. If None is set, it uses the default value, ``false``.\n :param enforceSchema: If it is set to ``true``, the specified or inferred schema will be\n forcibly applied to datasource files, and headers in CSV files will be\n ignored. If the option is set to ``false``, the schema will be\n validated against all headers in CSV files or the first header in RDD\n if the ``header`` option is set to ``true``. Field names in the schema\n and column names in CSV headers are checked by their positions\n taking into account ``spark.sql.caseSensitive``. If None is set,\n ``true`` is used by default. Though the default value is ``true``,\n it is recommended to disable the ``enforceSchema`` option\n to avoid incorrect results.\n :param ignoreLeadingWhiteSpace: a flag indicating whether or not leading whitespaces from\n values being read should be skipped. If None is set, it\n uses the default value, ``false``.\n :param ignoreTrailingWhiteSpace: a flag indicating whether or not trailing whitespaces from\n values being read should be skipped. If None is set, it\n uses the default value, ``false``.\n :param nullValue: sets the string representation of a null value. 
If None is set, it uses\n the default value, empty string. Since 2.0.1, this ``nullValue`` param\n applies to all supported types including the string type.\n :param nanValue: sets the string representation of a non-number value. If None is set, it\n uses the default value, ``NaN``.\n :param positiveInf: sets the string representation of a positive infinity value. If None\n is set, it uses the default value, ``Inf``.\n :param negativeInf: sets the string representation of a negative infinity value. If None\n is set, it uses the default value, ``Inf``.\n :param dateFormat: sets the string that indicates a date format. Custom date formats\n follow the formats at ``java.time.format.DateTimeFormatter``. This\n applies to date type. If None is set, it uses the\n default value, ``yyyy-MM-dd``.\n :param timestampFormat: sets the string that indicates a timestamp format.\n Custom date formats follow the formats at\n ``java.time.format.DateTimeFormatter``.\n This applies to timestamp type. If None is set, it uses the\n default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSXXX``.\n :param maxColumns: defines a hard limit of how many columns a record can have. If None is\n set, it uses the default value, ``20480``.\n :param maxCharsPerColumn: defines the maximum number of characters allowed for any given\n value being read. If None is set, it uses the default value,\n ``-1`` meaning unlimited length.\n :param maxMalformedLogPerPartition: this parameter is no longer used since Spark 2.2.0.\n If specified, it is ignored.\n :param mode: allows a mode for dealing with corrupt records during parsing. If None is\n set, it uses the default value, ``PERMISSIVE``.\n\n * ``PERMISSIVE`` : when it meets a corrupted record, puts the malformed string \\\n into a field configured by ``columnNameOfCorruptRecord``, and sets malformed \\\n fields to ``null``. To keep corrupt records, an user can set a string type \\\n field named ``columnNameOfCorruptRecord`` in an user-defined schema. 
If a \\\n schema does not have the field, it drops corrupt records during parsing. \\\n A record with less/more tokens than schema is not a corrupted record to CSV. \\\n When it meets a record having fewer tokens than the length of the schema, \\\n sets ``null`` to extra fields. When the record has more tokens than the \\\n length of the schema, it drops extra tokens.\n * ``DROPMALFORMED`` : ignores the whole corrupted records.\n * ``FAILFAST`` : throws an exception when it meets corrupted records.\n\n :param columnNameOfCorruptRecord: allows renaming the new field having malformed string\n created by ``PERMISSIVE`` mode. This overrides\n ``spark.sql.columnNameOfCorruptRecord``. If None is set,\n it uses the value specified in\n ``spark.sql.columnNameOfCorruptRecord``.\n :param multiLine: parse one record, which may span multiple lines. If None is\n set, it uses the default value, ``false``.\n :param charToEscapeQuoteEscaping: sets a single character used for escaping the escape for\n the quote character. If None is set, the default value is\n escape character when escape and quote characters are\n different, ``\\0`` otherwise..\n :param emptyValue: sets the string representation of an empty value. If None is set, it uses\n the default value, empty string.\n :param locale: sets a locale as language tag in IETF BCP 47 format. If None is set,\n it uses the default value, ``en-US``. For instance, ``locale`` is used while\n parsing dates and timestamps.\n :param lineSep: defines the line separator that should be used for parsing. 
If None is\n set, it covers all ``\\\\r``, ``\\\\r\\\\n`` and ``\\\\n``.\n Maximum length is 1 character.\n\n >>> csv_sdf = spark.readStream.csv(tempfile.mkdtemp(), schema = sdf_schema)\n >>> csv_sdf.isStreaming\n True\n >>> csv_sdf.schema == sdf_schema\n True\n \"\"\"\n self._set_opts(\n schema=schema, sep=sep, encoding=encoding, quote=quote, escape=escape, comment=comment,\n header=header, inferSchema=inferSchema, ignoreLeadingWhiteSpace=ignoreLeadingWhiteSpace,\n ignoreTrailingWhiteSpace=ignoreTrailingWhiteSpace, nullValue=nullValue,\n nanValue=nanValue, positiveInf=positiveInf, negativeInf=negativeInf,\n dateFormat=dateFormat, timestampFormat=timestampFormat, maxColumns=maxColumns,\n maxCharsPerColumn=maxCharsPerColumn,\n maxMalformedLogPerPartition=maxMalformedLogPerPartition, mode=mode,\n columnNameOfCorruptRecord=columnNameOfCorruptRecord, multiLine=multiLine,\n charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, enforceSchema=enforceSchema,\n emptyValue=emptyValue, locale=locale, lineSep=lineSep)\n if isinstance(path, basestring):\n return self._df(self._jreader.csv(path))\n else:\n raise TypeError(\"path can be only a single string\")"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef outputMode(self, outputMode):\n if not outputMode or type(outputMode) != str or len(outputMode.strip()) == 0:\n raise ValueError('The output mode must be a non-empty string. Got: %s' % outputMode)\n self._jwrite = self._jwrite.outputMode(outputMode)\n return self", "response": "Set the output mode of the data for a specific entry in the stream."} {"instruction": "Write a Python 3 function that can\nspecify the name of the streaming query that can be started with a specific name.", "response": "def queryName(self, queryName):\n \"\"\"Specifies the name of the :class:`StreamingQuery` that can be started with\n :func:`start`. This name must be unique among all the currently active queries\n in the associated SparkSession.\n\n .. 
note:: Evolving.\n\n :param queryName: unique name for the query\n\n >>> writer = sdf.writeStream.queryName('streaming_query')\n \"\"\"\n if not queryName or type(queryName) != str or len(queryName.strip()) == 0:\n raise ValueError('The queryName must be a non-empty string. Got: %s' % queryName)\n self._jwrite = self._jwrite.queryName(queryName)\n return self"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef trigger(self, processingTime=None, once=None, continuous=None):\n params = [processingTime, once, continuous]\n\n if params.count(None) == 3:\n raise ValueError('No trigger provided')\n elif params.count(None) < 2:\n raise ValueError('Multiple triggers not allowed.')\n\n jTrigger = None\n if processingTime is not None:\n if type(processingTime) != str or len(processingTime.strip()) == 0:\n raise ValueError('Value for processingTime must be a non empty string. Got: %s' %\n processingTime)\n interval = processingTime.strip()\n jTrigger = self._spark._sc._jvm.org.apache.spark.sql.streaming.Trigger.ProcessingTime(\n interval)\n\n elif once is not None:\n if once is not True:\n raise ValueError('Value for once must be True. Got: %s' % once)\n jTrigger = self._spark._sc._jvm.org.apache.spark.sql.streaming.Trigger.Once()\n\n else:\n if type(continuous) != str or len(continuous.strip()) == 0:\n raise ValueError('Value for continuous must be a non empty string. Got: %s' %\n continuous)\n interval = continuous.strip()\n jTrigger = self._spark._sc._jvm.org.apache.spark.sql.streaming.Trigger.Continuous(\n interval)\n\n self._jwrite = self._jwrite.trigger(jTrigger)\n return self", "response": "Sets the trigger for the stream query."} {"instruction": "Implement a Python 3 function for\nsetting the output of the streaming query to be processed using the provided writer ``f``. This is often used to write the output of a streaming query to arbitrary storage systems. The processing logic can be specified in two ways. #. 
A **function** that takes a row as input. This is a simple way to express your processing logic. Note that this does not allow you to deduplicate generated data when failures cause reprocessing of some input data. That would require you to specify the processing logic in the next way. #. An **object** with a ``process`` method and optional ``open`` and ``close`` methods. The object can have the following methods. * ``open(partition_id, epoch_id)``: *Optional* method that initializes the processing (for example, open a connection, start a transaction, etc). Additionally, you can use the `partition_id` and `epoch_id` to deduplicate regenerated data (discussed later). * ``process(row)``: *Non-optional* method that processes each :class:`Row`. * ``close(error)``: *Optional* method that finalizes and cleans up (for example, close connection, commit transaction, etc.) after all rows have been processed. The object will be used by Spark in the following way. * A single copy of this object is responsible of all the data generated by a single task in a query. In other words, one instance is responsible for processing one partition of the data generated in a distributed manner. * This object must be serializable because each task will get a fresh serialized-deserialized copy of the provided object. Hence, it is strongly recommended that any initialization for writing data (e.g. opening a connection or starting a transaction) is done after the `open(...)` method has been called, which signifies that the task is ready to generate data. * The lifecycle of the methods are as follows. For each partition with ``partition_id``: ... For each batch/epoch of streaming data with ``epoch_id``: ....... Method ``open(partitionId, epochId)`` is called. ....... If ``open(...)`` returns true, for each row in the partition and batch/epoch, method ``process(row)`` is called. ....... Method ``close(errorOrNull)`` is called with error (if any) seen while processing rows. 
Important points to note: * The `partitionId` and `epochId` can be used to deduplicate generated data when failures cause reprocessing of some input data. This depends on the execution mode of the query. If the streaming query is being executed in the micro-batch mode, then every partition represented by a unique tuple (partition_id, epoch_id) is guaranteed to have the same data. Hence, (partition_id, epoch_id) can be used to deduplicate and/or transactionally commit data and achieve exactly-once guarantees. However, if the streaming query is being executed in the continuous mode, then this guarantee does not hold and therefore should not be used for deduplication. * The ``close()`` method (if exists) will be called if `open()` method exists and returns successfully (irrespective of the return value), except if the Python crashes in the middle. .. note:: Evolving. >>> # Print every row using a function >>> def print_row(row): ... print(row) ... >>> writer = sdf.writeStream.foreach(print_row) >>> # Print every row using a object with process() method >>> class RowPrinter: ... def open(self, partition_id, epoch_id): ... print(\"Opened %d, %d\" % (partition_id, epoch_id)) ... return True ... def process(self, row): ... print(row) ... def close(self, error): ... print(\"Closed with error: %s\" % str(error)) ... >>> writer = sdf.writeStream.foreach(RowPrinter())", "response": "def foreach(self, f):\n \"\"\"\n Sets the output of the streaming query to be processed using the provided writer ``f``.\n This is often used to write the output of a streaming query to arbitrary storage systems.\n The processing logic can be specified in two ways.\n\n #. A **function** that takes a row as input.\n This is a simple way to express your processing logic. Note that this does\n not allow you to deduplicate generated data when failures cause reprocessing of\n some input data. That would require you to specify the processing logic in the next\n way.\n\n #. 
An **object** with a ``process`` method and optional ``open`` and ``close`` methods.\n The object can have the following methods.\n\n * ``open(partition_id, epoch_id)``: *Optional* method that initializes the processing\n (for example, open a connection, start a transaction, etc). Additionally, you can\n use the `partition_id` and `epoch_id` to deduplicate regenerated data\n (discussed later).\n\n * ``process(row)``: *Non-optional* method that processes each :class:`Row`.\n\n * ``close(error)``: *Optional* method that finalizes and cleans up (for example,\n close connection, commit transaction, etc.) after all rows have been processed.\n\n The object will be used by Spark in the following way.\n\n * A single copy of this object is responsible of all the data generated by a\n single task in a query. In other words, one instance is responsible for\n processing one partition of the data generated in a distributed manner.\n\n * This object must be serializable because each task will get a fresh\n serialized-deserialized copy of the provided object. Hence, it is strongly\n recommended that any initialization for writing data (e.g. opening a\n connection or starting a transaction) is done after the `open(...)`\n method has been called, which signifies that the task is ready to generate data.\n\n * The lifecycle of the methods are as follows.\n\n For each partition with ``partition_id``:\n\n ... For each batch/epoch of streaming data with ``epoch_id``:\n\n ....... Method ``open(partitionId, epochId)`` is called.\n\n ....... If ``open(...)`` returns true, for each row in the partition and\n batch/epoch, method ``process(row)`` is called.\n\n ....... Method ``close(errorOrNull)`` is called with error (if any) seen while\n processing rows.\n\n Important points to note:\n\n * The `partitionId` and `epochId` can be used to deduplicate generated data when\n failures cause reprocessing of some input data. This depends on the execution\n mode of the query. 
If the streaming query is being executed in the micro-batch\n mode, then every partition represented by a unique tuple (partition_id, epoch_id)\n is guaranteed to have the same data. Hence, (partition_id, epoch_id) can be used\n to deduplicate and/or transactionally commit data and achieve exactly-once\n guarantees. However, if the streaming query is being executed in the continuous\n mode, then this guarantee does not hold and therefore should not be used for\n deduplication.\n\n * The ``close()`` method (if exists) will be called if `open()` method exists and\n returns successfully (irrespective of the return value), except if the Python\n crashes in the middle.\n\n .. note:: Evolving.\n\n >>> # Print every row using a function\n >>> def print_row(row):\n ... print(row)\n ...\n >>> writer = sdf.writeStream.foreach(print_row)\n >>> # Print every row using a object with process() method\n >>> class RowPrinter:\n ... def open(self, partition_id, epoch_id):\n ... print(\"Opened %d, %d\" % (partition_id, epoch_id))\n ... return True\n ... def process(self, row):\n ... print(row)\n ... def close(self, error):\n ... print(\"Closed with error: %s\" % str(error))\n ...\n >>> writer = sdf.writeStream.foreach(RowPrinter())\n \"\"\"\n\n from pyspark.rdd import _wrap_function\n from pyspark.serializers import PickleSerializer, AutoBatchedSerializer\n from pyspark.taskcontext import TaskContext\n\n if callable(f):\n # The provided object is a callable function that is supposed to be called on each row.\n # Construct a function that takes an iterator and calls the provided function on each\n # row.\n def func_without_process(_, iterator):\n for x in iterator:\n f(x)\n return iter([])\n\n func = func_without_process\n\n else:\n # The provided object is not a callable function. 
Then it is expected to have a\n # 'process(row)' method, and optional 'open(partition_id, epoch_id)' and\n # 'close(error)' methods.\n\n if not hasattr(f, 'process'):\n raise Exception(\"Provided object does not have a 'process' method\")\n\n if not callable(getattr(f, 'process')):\n raise Exception(\"Attribute 'process' in provided object is not callable\")\n\n def doesMethodExist(method_name):\n exists = hasattr(f, method_name)\n if exists and not callable(getattr(f, method_name)):\n raise Exception(\n \"Attribute '%s' in provided object is not callable\" % method_name)\n return exists\n\n open_exists = doesMethodExist('open')\n close_exists = doesMethodExist('close')\n\n def func_with_open_process_close(partition_id, iterator):\n epoch_id = TaskContext.get().getLocalProperty('streaming.sql.batchId')\n if epoch_id:\n epoch_id = int(epoch_id)\n else:\n raise Exception(\"Could not get batch id from TaskContext\")\n\n # Check if the data should be processed\n should_process = True\n if open_exists:\n should_process = f.open(partition_id, epoch_id)\n\n error = None\n\n try:\n if should_process:\n for x in iterator:\n f.process(x)\n except Exception as ex:\n error = ex\n finally:\n if close_exists:\n f.close(error)\n if error:\n raise error\n\n return iter([])\n\n func = func_with_open_process_close\n\n serializer = AutoBatchedSerializer(PickleSerializer())\n wrapped_func = _wrap_function(self._spark._sc, func, serializer, serializer)\n jForeachWriter = \\\n self._spark._sc._jvm.org.apache.spark.sql.execution.python.PythonForeachWriter(\n wrapped_func, self._df._jdf.schema())\n self._jwrite.foreach(jForeachWriter)\n return self"} {"instruction": "Can you implement a function in Python 3 that\ncalls the provided function for each batch of data in the current Spark context.", "response": "def foreachBatch(self, func):\n \"\"\"\n Sets the output of the streaming query to be processed using the provided\n function. 
This is supported only in the micro-batch execution modes (that is, when the\n trigger is not continuous). In every micro-batch, the provided function will be called in\n every micro-batch with (i) the output rows as a DataFrame and (ii) the batch identifier.\n The batchId can be used to deduplicate and transactionally write the output\n (that is, the provided Dataset) to external systems. The output DataFrame is guaranteed\n to be exactly the same for the same batchId (assuming all operations are deterministic in the\n query).\n\n .. note:: Evolving.\n\n >>> def func(batch_df, batch_id):\n ... batch_df.collect()\n ...\n >>> writer = sdf.writeStream.foreachBatch(func)\n \"\"\"\n\n from pyspark.java_gateway import ensure_callback_server_started\n gw = self._spark._sc._gateway\n java_import(gw.jvm, \"org.apache.spark.sql.execution.streaming.sources.*\")\n\n wrapped_func = ForeachBatchFunction(self._spark, func)\n gw.jvm.PythonForeachBatchHelper.callForeachBatch(self._jwrite, wrapped_func)\n ensure_callback_server_started(gw)\n return self"} {"instruction": "Write a Python 3 script for\nstreaming the contents of the DataFrame to a data source.", "response": "def start(self, path=None, format=None, outputMode=None, partitionBy=None, queryName=None,\n **options):\n \"\"\"Streams the contents of the :class:`DataFrame` to a data source.\n\n The data source is specified by the ``format`` and a set of ``options``.\n If ``format`` is not specified, the default data source configured by\n ``spark.sql.sources.default`` will be used.\n\n .. 
note:: Evolving.\n\n :param path: the path in a Hadoop supported file system\n :param format: the format used to save\n :param outputMode: specifies how data of a streaming DataFrame/Dataset is written to a\n streaming sink.\n\n * `append`:Only the new rows in the streaming DataFrame/Dataset will be written to the\n sink\n * `complete`:All the rows in the streaming DataFrame/Dataset will be written to the sink\n every time there are some updates\n * `update`:only the rows that were updated in the streaming DataFrame/Dataset will be\n written to the sink every time there are some updates. If the query doesn't contain\n aggregations, it will be equivalent to `append` mode.\n :param partitionBy: names of partitioning columns\n :param queryName: unique name for the query\n :param options: All other string options. You may want to provide a `checkpointLocation`\n for most streams, however it is not required for a `memory` stream.\n\n >>> sq = sdf.writeStream.format('memory').queryName('this_query').start()\n >>> sq.isActive\n True\n >>> sq.name\n u'this_query'\n >>> sq.stop()\n >>> sq.isActive\n False\n >>> sq = sdf.writeStream.trigger(processingTime='5 seconds').start(\n ... queryName='that_query', outputMode=\"append\", format='memory')\n >>> sq.name\n u'that_query'\n >>> sq.isActive\n True\n >>> sq.stop()\n \"\"\"\n self.options(**options)\n if outputMode is not None:\n self.outputMode(outputMode)\n if partitionBy is not None:\n self.partitionBy(partitionBy)\n if format is not None:\n self.format(format)\n if queryName is not None:\n self.queryName(queryName)\n if path is None:\n return self._sq(self._jwrite.start())\n else:\n return self._sq(self._jwrite.start(path))"} {"instruction": "Write a Python 3 script to\nreturn the code for the cell set template.", "response": "def _make_cell_set_template_code():\n \"\"\"Get the Python compiler to emit LOAD_FAST(arg); STORE_DEREF\n\n Notes\n -----\n In Python 3, we could use an easier function:\n\n .. 
code-block:: python\n\n def f():\n cell = None\n\n def _stub(value):\n nonlocal cell\n cell = value\n\n return _stub\n\n _cell_set_template_code = f().__code__\n\n This function is _only_ a LOAD_FAST(arg); STORE_DEREF, but that is\n invalid syntax on Python 2. If we use this function we also don't need\n to do the weird freevars/cellvars swap below\n \"\"\"\n def inner(value):\n lambda: cell # make ``cell`` a closure so that we get a STORE_DEREF\n cell = value\n\n co = inner.__code__\n\n # NOTE: we are marking the cell variable as a free variable intentionally\n # so that we simulate an inner function instead of the outer function. This\n # is what gives us the ``nonlocal`` behavior in a Python 2 compatible way.\n if not PY3: # pragma: no branch\n return types.CodeType(\n co.co_argcount,\n co.co_nlocals,\n co.co_stacksize,\n co.co_flags,\n co.co_code,\n co.co_consts,\n co.co_names,\n co.co_varnames,\n co.co_filename,\n co.co_name,\n co.co_firstlineno,\n co.co_lnotab,\n co.co_cellvars, # this is the trickery\n (),\n )\n else:\n return types.CodeType(\n co.co_argcount,\n co.co_kwonlyargcount,\n co.co_nlocals,\n co.co_stacksize,\n co.co_flags,\n co.co_code,\n co.co_consts,\n co.co_names,\n co.co_varnames,\n co.co_filename,\n co.co_name,\n co.co_firstlineno,\n co.co_lnotab,\n co.co_cellvars, # this is the trickery\n (),\n )"} {"instruction": "Make a summary of the following Python 3 code\ndef is_tornado_coroutine(func):\n if 'tornado.gen' not in sys.modules:\n return False\n gen = sys.modules['tornado.gen']\n if not hasattr(gen, \"is_coroutine_function\"):\n # Tornado version is too old\n return False\n return gen.is_coroutine_function(func)", "response": "Return whether *func* is a Tornado coroutine function."} {"instruction": "Create a Python 3 function for\nserializing obj as bytes streamed into file.", "response": "def dump(obj, file, protocol=None):\n \"\"\"Serialize obj as bytes streamed into file\n\n protocol defaults to cloudpickle.DEFAULT_PROTOCOL which is an 
alias to\n pickle.HIGHEST_PROTOCOL. This setting favors maximum communication speed\n between processes running the same Python version.\n\n Set protocol=pickle.DEFAULT_PROTOCOL instead if you need to ensure\n compatibility with older versions of Python.\n \"\"\"\n CloudPickler(file, protocol=protocol).dump(obj)"} {"instruction": "Write a Python 3 function that can\nserialize obj as a string of bytes allocated in memory", "response": "def dumps(obj, protocol=None):\n \"\"\"Serialize obj as a string of bytes allocated in memory\n\n protocol defaults to cloudpickle.DEFAULT_PROTOCOL which is an alias to\n pickle.HIGHEST_PROTOCOL. This setting favors maximum communication speed\n between processes running the same Python version.\n\n Set protocol=pickle.DEFAULT_PROTOCOL instead if you need to ensure\n compatibility with older versions of Python.\n \"\"\"\n file = StringIO()\n try:\n cp = CloudPickler(file, protocol=protocol)\n cp.dump(obj)\n return file.getvalue()\n finally:\n file.close()"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _fill_function(*args):\n if len(args) == 2:\n func = args[0]\n state = args[1]\n elif len(args) == 5:\n # Backwards compat for cloudpickle v0.4.0, after which the `module`\n # argument was introduced\n func = args[0]\n keys = ['globals', 'defaults', 'dict', 'closure_values']\n state = dict(zip(keys, args[1:]))\n elif len(args) == 6:\n # Backwards compat for cloudpickle v0.4.1, after which the function\n # state was passed as a dict to the _fill_function it-self.\n func = args[0]\n keys = ['globals', 'defaults', 'dict', 'module', 'closure_values']\n state = dict(zip(keys, args[1:]))\n else:\n raise ValueError('Unexpected _fill_value arguments: %r' % (args,))\n\n # - At pickling time, any dynamic global variable used by func is\n # serialized by value (in state['globals']).\n # - At unpickling time, func's __globals__ attribute is initialized by\n # first retrieving an empty isolated namespace that will 
be shared\n # with other functions pickled from the same original module\n # by the same CloudPickler instance and then updated with the\n # content of state['globals'] to populate the shared isolated\n # namespace with all the global variables that are specifically\n # referenced for this function.\n func.__globals__.update(state['globals'])\n\n func.__defaults__ = state['defaults']\n func.__dict__ = state['dict']\n if 'annotations' in state:\n func.__annotations__ = state['annotations']\n if 'doc' in state:\n func.__doc__ = state['doc']\n if 'name' in state:\n func.__name__ = state['name']\n if 'module' in state:\n func.__module__ = state['module']\n if 'qualname' in state:\n func.__qualname__ = state['qualname']\n\n cells = func.__closure__\n if cells is not None:\n for cell, value in zip(cells, state['closure_values']):\n if value is not _empty_cell_value:\n cell_set(cell, value)\n\n return func", "response": "Fills in the rest of function data into the skeleton function object\n "} {"instruction": "Given the following Python 3 function, write the documentation\ndef _rehydrate_skeleton_class(skeleton_class, class_dict):\n registry = None\n for attrname, attr in class_dict.items():\n if attrname == \"_abc_impl\":\n registry = attr\n else:\n setattr(skeleton_class, attrname, attr)\n if registry is not None:\n for subclass in registry:\n skeleton_class.register(subclass)\n\n return skeleton_class", "response": "Rehydrate the skeleton_class from class_dict into class_dict."} {"instruction": "Explain what the following Python 3 code does\ndef _is_dynamic(module):\n # Quick check: module that have __file__ attribute are not dynamic modules.\n if hasattr(module, '__file__'):\n return False\n\n if hasattr(module, '__spec__'):\n return module.__spec__ is None\n else:\n # Backward compat for Python 2\n import imp\n try:\n path = None\n for part in module.__name__.split('.'):\n if path is not None:\n path = [path]\n f, path, description = imp.find_module(part, path)\n if 
f is not None:\n f.close()\n except ImportError:\n return True\n return False", "response": "Return True if the module is dynamic."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef save_codeobject(self, obj):\n if PY3: # pragma: no branch\n args = (\n obj.co_argcount, obj.co_kwonlyargcount, obj.co_nlocals, obj.co_stacksize,\n obj.co_flags, obj.co_code, obj.co_consts, obj.co_names, obj.co_varnames,\n obj.co_filename, obj.co_name, obj.co_firstlineno, obj.co_lnotab, obj.co_freevars,\n obj.co_cellvars\n )\n else:\n args = (\n obj.co_argcount, obj.co_nlocals, obj.co_stacksize, obj.co_flags, obj.co_code,\n obj.co_consts, obj.co_names, obj.co_varnames, obj.co_filename, obj.co_name,\n obj.co_firstlineno, obj.co_lnotab, obj.co_freevars, obj.co_cellvars\n )\n self.save_reduce(types.CodeType, args, obj=obj)", "response": "Save a code object in the file."} {"instruction": "Write a Python 3 script for\nsaving a function object to the cache.", "response": "def save_function(self, obj, name=None):\n \"\"\" Registered with the dispatch to handle all function types.\n\n Determines what kind of function obj is (e.g. 
lambda, defined at\n interactive prompt, etc) and handles the pickling appropriately.\n \"\"\"\n try:\n should_special_case = obj in _BUILTIN_TYPE_CONSTRUCTORS\n except TypeError:\n # Methods of builtin types aren't hashable in python 2.\n should_special_case = False\n\n if should_special_case:\n # We keep a special-cased cache of built-in type constructors at\n # global scope, because these functions are structured very\n # differently in different python versions and implementations (for\n # example, they're instances of types.BuiltinFunctionType in\n # CPython, but they're ordinary types.FunctionType instances in\n # PyPy).\n #\n # If the function we've received is in that cache, we just\n # serialize it as a lookup into the cache.\n return self.save_reduce(_BUILTIN_TYPE_CONSTRUCTORS[obj], (), obj=obj)\n\n write = self.write\n\n if name is None:\n name = obj.__name__\n try:\n # whichmodule() could fail, see\n # https://bitbucket.org/gutworth/six/issues/63/importing-six-breaks-pickling\n modname = pickle.whichmodule(obj, name)\n except Exception:\n modname = None\n # print('which gives %s %s %s' % (modname, obj, name))\n try:\n themodule = sys.modules[modname]\n except KeyError:\n # eval'd items such as namedtuple give invalid items for their function __module__\n modname = '__main__'\n\n if modname == '__main__':\n themodule = None\n\n try:\n lookedup_by_name = getattr(themodule, name, None)\n except Exception:\n lookedup_by_name = None\n\n if themodule:\n if lookedup_by_name is obj:\n return self.save_global(obj, name)\n\n # a builtin_function_or_method which comes in as an attribute of some\n # object (e.g., itertools.chain.from_iterable) will end\n # up with modname \"__main__\" and so end up here. 
But these functions\n # have no __code__ attribute in CPython, so the handling for\n # user-defined functions below will fail.\n # So we pickle them here using save_reduce; have to do it differently\n # for different python versions.\n if not hasattr(obj, '__code__'):\n if PY3: # pragma: no branch\n rv = obj.__reduce_ex__(self.proto)\n else:\n if hasattr(obj, '__self__'):\n rv = (getattr, (obj.__self__, name))\n else:\n raise pickle.PicklingError(\"Can't pickle %r\" % obj)\n return self.save_reduce(obj=obj, *rv)\n\n # if func is lambda, def'ed at prompt, is in main, or is nested, then\n # we'll pickle the actual function object rather than simply saving a\n # reference (as is done in default pickler), via save_function_tuple.\n if (islambda(obj)\n or getattr(obj.__code__, 'co_filename', None) == ''\n or themodule is None):\n self.save_function_tuple(obj)\n return\n else:\n # func is nested\n if lookedup_by_name is None or lookedup_by_name is not obj:\n self.save_function_tuple(obj)\n return\n\n if obj.__dict__:\n # essentially save_reduce, but workaround needed to avoid recursion\n self.save(_restore_attr)\n write(pickle.MARK + pickle.GLOBAL + modname + '\\n' + name + '\\n')\n self.memoize(obj)\n self.save(obj.__dict__)\n write(pickle.TUPLE + pickle.REDUCE)\n else:\n write(pickle.GLOBAL + modname + '\\n' + name + '\\n')\n self.memoize(obj)"} {"instruction": "Can you implement a function in Python 3 that\nsaves a dynamic class that can t be stored as module global.", "response": "def save_dynamic_class(self, obj):\n \"\"\"\n Save a class that can't be stored as module global.\n\n This method is used to serialize classes that are defined inside\n functions, or that otherwise can't be serialized as attribute lookups\n from global modules.\n \"\"\"\n clsdict = dict(obj.__dict__) # copy dict proxy to a dict\n clsdict.pop('__weakref__', None)\n\n # For ABCMeta in python3.7+, remove _abc_impl as it is not picklable.\n # This is a fix which breaks the cache but this only 
makes the first\n # calls to issubclass slower.\n if \"_abc_impl\" in clsdict:\n import abc\n (registry, _, _, _) = abc._get_dump(obj)\n clsdict[\"_abc_impl\"] = [subclass_weakref()\n for subclass_weakref in registry]\n\n # On PyPy, __doc__ is a readonly attribute, so we need to include it in\n # the initial skeleton class. This is safe because we know that the\n # doc can't participate in a cycle with the original class.\n type_kwargs = {'__doc__': clsdict.pop('__doc__', None)}\n\n if hasattr(obj, \"__slots__\"):\n type_kwargs['__slots__'] = obj.__slots__\n # pickle string length optimization: member descriptors of obj are\n # created automatically from obj's __slots__ attribute, no need to\n # save them in obj's state\n if isinstance(obj.__slots__, string_types):\n clsdict.pop(obj.__slots__)\n else:\n for k in obj.__slots__:\n clsdict.pop(k, None)\n\n # If type overrides __dict__ as a property, include it in the type kwargs.\n # In Python 2, we can't set this attribute after construction.\n __dict__ = clsdict.pop('__dict__', None)\n if isinstance(__dict__, property):\n type_kwargs['__dict__'] = __dict__\n\n save = self.save\n write = self.write\n\n # We write pickle instructions explicitly here to handle the\n # possibility that the type object participates in a cycle with its own\n # __dict__. We first write an empty \"skeleton\" version of the class and\n # memoize it before writing the class' __dict__ itself. 
We then write\n # instructions to \"rehydrate\" the skeleton class by restoring the\n # attributes from the __dict__.\n #\n # A type can appear in a cycle with its __dict__ if an instance of the\n # type appears in the type's __dict__ (which happens for the stdlib\n # Enum class), or if the type defines methods that close over the name\n # of the type, (which is common for Python 2-style super() calls).\n\n # Push the rehydration function.\n save(_rehydrate_skeleton_class)\n\n # Mark the start of the args tuple for the rehydration function.\n write(pickle.MARK)\n\n # Create and memoize an skeleton class with obj's name and bases.\n tp = type(obj)\n self.save_reduce(tp, (obj.__name__, obj.__bases__, type_kwargs), obj=obj)\n\n # Now save the rest of obj's __dict__. Any references to obj\n # encountered while saving will point to the skeleton class.\n save(clsdict)\n\n # Write a tuple of (skeleton_class, clsdict).\n write(pickle.TUPLE)\n\n # Call _rehydrate_skeleton_class(skeleton_class, clsdict)\n write(pickle.REDUCE)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef save_function_tuple(self, func):\n if is_tornado_coroutine(func):\n self.save_reduce(_rebuild_tornado_coroutine, (func.__wrapped__,),\n obj=func)\n return\n\n save = self.save\n write = self.write\n\n code, f_globals, defaults, closure_values, dct, base_globals = self.extract_func_data(func)\n\n save(_fill_function) # skeleton function updater\n write(pickle.MARK) # beginning of tuple that _fill_function expects\n\n self._save_subimports(\n code,\n itertools.chain(f_globals.values(), closure_values or ()),\n )\n\n # create a skeleton function object and memoize it\n save(_make_skel_func)\n save((\n code,\n len(closure_values) if closure_values is not None else -1,\n base_globals,\n ))\n write(pickle.REDUCE)\n self.memoize(func)\n\n # save the rest of the func data needed by _fill_function\n state = {\n 'globals': f_globals,\n 'defaults': defaults,\n 'dict': dct,\n 
'closure_values': closure_values,\n 'module': func.__module__,\n 'name': func.__name__,\n 'doc': func.__doc__,\n }\n if hasattr(func, '__annotations__') and sys.version_info >= (3, 7):\n state['annotations'] = func.__annotations__\n if hasattr(func, '__qualname__'):\n state['qualname'] = func.__qualname__\n save(state)\n write(pickle.TUPLE)\n write(pickle.REDUCE)", "response": "Save a function object in the cache."} {"instruction": "Create a Python 3 function to\nsave a global object.", "response": "def save_global(self, obj, name=None, pack=struct.pack):\n \"\"\"\n Save a \"global\".\n\n The name of this method is somewhat misleading: all types get\n dispatched here.\n \"\"\"\n if obj is type(None):\n return self.save_reduce(type, (None,), obj=obj)\n elif obj is type(Ellipsis):\n return self.save_reduce(type, (Ellipsis,), obj=obj)\n elif obj is type(NotImplemented):\n return self.save_reduce(type, (NotImplemented,), obj=obj)\n\n if obj.__module__ == \"__main__\":\n return self.save_dynamic_class(obj)\n\n try:\n return Pickler.save_global(self, obj, name=name)\n except Exception:\n if obj.__module__ == \"__builtin__\" or obj.__module__ == \"builtins\":\n if obj in _BUILTIN_TYPE_NAMES:\n return self.save_reduce(\n _builtin_type, (_BUILTIN_TYPE_NAMES[obj],), obj=obj)\n\n typ = type(obj)\n if typ is not obj and isinstance(obj, (type, types.ClassType)):\n return self.save_dynamic_class(obj)\n\n raise"} {"instruction": "Can you implement a function in Python 3 that\nsaves an instance of the object obj to the pickle file.", "response": "def save_inst(self, obj):\n \"\"\"Inner logic to save instance. 
Based off pickle.save_inst\"\"\"\n cls = obj.__class__\n\n # Try the dispatch table (pickle module doesn't do it)\n f = self.dispatch.get(cls)\n if f:\n f(self, obj) # Call unbound method with explicit self\n return\n\n memo = self.memo\n write = self.write\n save = self.save\n\n if hasattr(obj, '__getinitargs__'):\n args = obj.__getinitargs__()\n len(args) # XXX Assert it's a sequence\n pickle._keep_alive(args, memo)\n else:\n args = ()\n\n write(pickle.MARK)\n\n if self.bin:\n save(cls)\n for arg in args:\n save(arg)\n write(pickle.OBJ)\n else:\n for arg in args:\n save(arg)\n write(pickle.INST + cls.__module__ + '\\n' + cls.__name__ + '\\n')\n\n self.memoize(obj)\n\n try:\n getstate = obj.__getstate__\n except AttributeError:\n stuff = obj.__dict__\n else:\n stuff = getstate()\n pickle._keep_alive(stuff, memo)\n save(stuff)\n write(pickle.BUILD)"} {"instruction": "How would you explain what the following Python 3 function does\ndef save_itemgetter(self, obj):\n class Dummy:\n def __getitem__(self, item):\n return item\n items = obj(Dummy())\n if not isinstance(items, tuple):\n items = (items,)\n return self.save_reduce(operator.itemgetter, items)", "response": "save_itemgetter serializer that returns a tuple of the items of the object"} {"instruction": "How would you explain what the following Python 3 function does\ndef _copy_new_parent(self, parent):\n if self.parent == \"undefined\":\n param = copy.copy(self)\n param.parent = parent.uid\n return param\n else:\n raise ValueError(\"Cannot copy from non-dummy parent %s.\" % parent)", "response": "Copy the current param to a new parent."} {"instruction": "Implement a function in Python 3 to\nconvert a value to a list if possible.", "response": "def toList(value):\n \"\"\"\n Convert a value to a list, if possible.\n \"\"\"\n if type(value) == list:\n return value\n elif type(value) in [np.ndarray, tuple, xrange, array.array]:\n return list(value)\n elif isinstance(value, Vector):\n return list(value.toArray())\n 
else:\n raise TypeError(\"Could not convert %s to list\" % value)"} {"instruction": "Explain what the following Python 3 code does\ndef toListFloat(value):\n if TypeConverters._can_convert_to_list(value):\n value = TypeConverters.toList(value)\n if all(map(lambda v: TypeConverters._is_numeric(v), value)):\n return [float(v) for v in value]\n raise TypeError(\"Could not convert %s to list of floats\" % value)", "response": "Convert a value to list of floats if possible."} {"instruction": "Write a Python 3 script for\nconverting a value to list of ints if possible.", "response": "def toListInt(value):\n \"\"\"\n Convert a value to list of ints, if possible.\n \"\"\"\n if TypeConverters._can_convert_to_list(value):\n value = TypeConverters.toList(value)\n if all(map(lambda v: TypeConverters._is_integer(v), value)):\n return [int(v) for v in value]\n raise TypeError(\"Could not convert %s to list of ints\" % value)"} {"instruction": "Implement a function in Python 3 to\nconvert a value to list of strings if possible.", "response": "def toListString(value):\n \"\"\"\n Convert a value to list of strings, if possible.\n \"\"\"\n if TypeConverters._can_convert_to_list(value):\n value = TypeConverters.toList(value)\n if all(map(lambda v: TypeConverters._can_convert_to_string(v), value)):\n return [TypeConverters.toString(v) for v in value]\n raise TypeError(\"Could not convert %s to list of strings\" % value)"} {"instruction": "Make a summary of the following Python 3 code\ndef toVector(value):\n if isinstance(value, Vector):\n return value\n elif TypeConverters._can_convert_to_list(value):\n value = TypeConverters.toList(value)\n if all(map(lambda v: TypeConverters._is_numeric(v), value)):\n return DenseVector(value)\n raise TypeError(\"Could not convert %s to vector\" % value)", "response": "Converts a value to a MLlib Vector if possible."} {"instruction": "How would you code a function in Python 3 to\nconvert a value to a string if possible.", "response": "def 
toString(value):\n \"\"\"\n Convert a value to a string, if possible.\n \"\"\"\n if isinstance(value, basestring):\n return value\n elif type(value) in [np.string_, np.str_]:\n return str(value)\n elif type(value) == np.unicode_:\n return unicode(value)\n else:\n raise TypeError(\"Could not convert %s to string type\" % type(value))"} {"instruction": "Make a summary of the following Python 3 code\ndef _copy_params(self):\n cls = type(self)\n src_name_attrs = [(x, getattr(cls, x)) for x in dir(cls)]\n src_params = list(filter(lambda nameAttr: isinstance(nameAttr[1], Param), src_name_attrs))\n for name, param in src_params:\n setattr(self, name, param._copy_new_parent(self))", "response": "Copy all params defined on the class to current object."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef params(self):\n if self._params is None:\n self._params = list(filter(lambda attr: isinstance(attr, Param),\n [getattr(self, x) for x in dir(self) if x != \"params\" and\n not isinstance(getattr(type(self), x, None), property)]))\n return self._params", "response": "Returns all params ordered by name."} {"instruction": "Write a Python 3 script for\nreturning a string representation of a single parameter.", "response": "def explainParam(self, param):\n \"\"\"\n Explains a single param and returns its name, doc, and optional\n default value and user-supplied value in a string.\n \"\"\"\n param = self._resolveParam(param)\n values = []\n if self.isDefined(param):\n if param in self._defaultParamMap:\n values.append(\"default: %s\" % self._defaultParamMap[param])\n if param in self._paramMap:\n values.append(\"current: %s\" % self._paramMap[param])\n else:\n values.append(\"undefined\")\n valueStr = \"(\" + \", \".join(values) + \")\"\n return \"%s: %s %s\" % (param.name, param.doc, valueStr)"} {"instruction": "Explain what the following Python 3 code does\ndef getParam(self, paramName):\n param = getattr(self, paramName)\n if 
isinstance(param, Param):\n return param\n else:\n raise ValueError(\"Cannot find param with name %s.\" % paramName)", "response": "Gets a param by its name."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef isSet(self, param):\n param = self._resolveParam(param)\n return param in self._paramMap", "response": "Checks whether a param is explicitly set by user."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef hasDefault(self, param):\n param = self._resolveParam(param)\n return param in self._defaultParamMap", "response": "Checks whether a param has a default value."} {"instruction": "Create a Python 3 function to\ntest whether this instance contains a param with a given name.", "response": "def hasParam(self, paramName):\n \"\"\"\n Tests whether this instance contains a param with a given\n (string) name.\n \"\"\"\n if isinstance(paramName, basestring):\n p = getattr(self, paramName, None)\n return isinstance(p, Param)\n else:\n raise TypeError(\"hasParam(): paramName must be a string\")"} {"instruction": "How would you explain what the following Python 3 function does\ndef getOrDefault(self, param):\n param = self._resolveParam(param)\n if param in self._paramMap:\n return self._paramMap[param]\n else:\n return self._defaultParamMap[param]", "response": "Gets the value of a param in the user - supplied param map or its\n default value. 
Raises an error if neither is set."} {"instruction": "Write a Python 3 function that can\nextract the embedded default param values and user - supplied param values and merges them with extra values into the param map.", "response": "def extractParamMap(self, extra=None):\n \"\"\"\n Extracts the embedded default param values and user-supplied\n values, and then merges them with extra values from input into\n a flat param map, where the latter value is used if there exist\n conflicts, i.e., with ordering: default param values <\n user-supplied values < extra.\n\n :param extra: extra param values\n :return: merged param map\n \"\"\"\n if extra is None:\n extra = dict()\n paramMap = self._defaultParamMap.copy()\n paramMap.update(self._paramMap)\n paramMap.update(extra)\n return paramMap"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef copy(self, extra=None):\n if extra is None:\n extra = dict()\n that = copy.copy(self)\n that._paramMap = {}\n that._defaultParamMap = {}\n return self._copyValues(that, extra)", "response": "Creates a shallow copy of this instance with the same uid and extra params."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef set(self, param, value):\n self._shouldOwn(param)\n try:\n value = param.typeConverter(value)\n except ValueError as e:\n raise ValueError('Invalid param value given for param \"%s\". 
%s' % (param.name, e))\n self._paramMap[param] = value", "response": "Sets a parameter in the embedded param map."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _shouldOwn(self, param):\n if not (self.uid == param.parent and self.hasParam(param.name)):\n raise ValueError(\"Param %r does not belong to %r.\" % (param, self))", "response": "Validates that the input param belongs to this Params instance."} {"instruction": "Write a Python 3 function that can\nresolve a param and validates the ownership.", "response": "def _resolveParam(self, param):\n \"\"\"\n Resolves a param and validates the ownership.\n\n :param param: param name or the param instance, which must\n belong to this Params instance\n :return: resolved param instance\n \"\"\"\n if isinstance(param, Param):\n self._shouldOwn(param)\n return param\n elif isinstance(param, basestring):\n return self.getParam(param)\n else:\n raise ValueError(\"Cannot resolve %r as a param.\" % param)"} {"instruction": "Implement a function in Python 3 to\nset user - supplied params.", "response": "def _set(self, **kwargs):\n \"\"\"\n Sets user-supplied params.\n \"\"\"\n for param, value in kwargs.items():\n p = getattr(self, param)\n if value is not None:\n try:\n value = p.typeConverter(value)\n except TypeError as e:\n raise TypeError('Invalid param value given for param \"%s\". %s' % (p.name, e))\n self._paramMap[p] = value\n return self"} {"instruction": "Implement a function in Python 3 to\nset the default parameters for the current object.", "response": "def _setDefault(self, **kwargs):\n \"\"\"\n Sets default params.\n \"\"\"\n for param, value in kwargs.items():\n p = getattr(self, param)\n if value is not None and not isinstance(value, JavaObject):\n try:\n value = p.typeConverter(value)\n except TypeError as e:\n raise TypeError('Invalid default param value given for param \"%s\". 
%s'\n % (p.name, e))\n self._defaultParamMap[p] = value\n return self"} {"instruction": "Implement a function in Python 3 to\ncopy the values from this instance to another instance.", "response": "def _copyValues(self, to, extra=None):\n \"\"\"\n Copies param values from this instance to another instance for\n params shared by them.\n\n :param to: the target instance\n :param extra: extra params to be copied\n :return: the target instance with param values copied\n \"\"\"\n paramMap = self._paramMap.copy()\n if extra is not None:\n paramMap.update(extra)\n for param in self.params:\n # copy default params\n if param in self._defaultParamMap and to.hasParam(param.name):\n to._defaultParamMap[to.getParam(param.name)] = self._defaultParamMap[param]\n # copy explicitly set params\n if param in paramMap and to.hasParam(param.name):\n to._set(**{param.name: paramMap[param]})\n return to"} {"instruction": "Implement a Python 3 function for\nchanging the uid of this instance. This updates both the stored uid and the parent uid of params and param maps. This is used by persistence (loading). :param newUid: new uid to use, which is converted to unicode :return: same instance, but with the uid and Param.parent values updated, including within param maps", "response": "def _resetUid(self, newUid):\n \"\"\"\n Changes the uid of this instance. 
This updates both\n the stored uid and the parent uid of params and param maps.\n This is used by persistence (loading).\n :param newUid: new uid to use, which is converted to unicode\n :return: same instance, but with the uid and Param.parent values\n updated, including within param maps\n \"\"\"\n newUid = unicode(newUid)\n self.uid = newUid\n newDefaultParamMap = dict()\n newParamMap = dict()\n for param in self.params:\n newParam = copy.copy(param)\n newParam.parent = newUid\n if param in self._defaultParamMap:\n newDefaultParamMap[newParam] = self._defaultParamMap[param]\n if param in self._paramMap:\n newParamMap[newParam] = self._paramMap[param]\n param.parent = newUid\n self._defaultParamMap = newDefaultParamMap\n self._paramMap = newParamMap\n return self"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _to_java_object_rdd(rdd):\n rdd = rdd._reserialize(AutoBatchedSerializer(PickleSerializer()))\n return rdd.ctx._jvm.org.apache.spark.ml.python.MLSerDe.pythonToJava(rdd._jrdd, True)", "response": "Return an JavaRDD of Object by unpickling\n "} {"instruction": "Given the following Python 3 function, write the documentation\ndef value(self):\n if not hasattr(self, \"_value\") and self._path is not None:\n # we only need to decrypt it here when encryption is enabled and\n # if its on the driver, since executor decryption is handled already\n if self._sc is not None and self._sc._encryption_enabled:\n port, auth_secret = self._python_broadcast.setupDecryptionServer()\n (decrypted_sock_file, _) = local_connect_and_auth(port, auth_secret)\n self._python_broadcast.waitTillBroadcastDataSent()\n return self.load(decrypted_sock_file)\n else:\n self._value = self.load_from_path(self._path)\n return self._value", "response": "Return the broadcasted value of the current object."} {"instruction": "Create a Python 3 function to\ndelete cached copies of this broadcast on the executors. 
If the broadcast is used after this is called, it will need to be re-sent to each executor. :param blocking: Whether to block until unpersisting has completed", "response": "def unpersist(self, blocking=False):\n \"\"\"\n Delete cached copies of this broadcast on the executors. If the\n broadcast is used after this is called, it will need to be\n re-sent to each executor.\n\n :param blocking: Whether to block until unpersisting has completed\n \"\"\"\n if self._jbroadcast is None:\n raise Exception(\"Broadcast can only be unpersisted in driver\")\n self._jbroadcast.unpersist(blocking)"} {"instruction": "Can you create a Python 3 function that\ndestroys all data and metadata related to this broadcast variable.", "response": "def destroy(self, blocking=False):\n \"\"\"\n Destroy all data and metadata related to this broadcast variable.\n Use this with caution; once a broadcast variable has been destroyed,\n it cannot be used again.\n\n .. versionchanged:: 3.0.0\n Added optional argument `blocking` to specify whether to block until all\n blocks are deleted.\n \"\"\"\n if self._jbroadcast is None:\n raise Exception(\"Broadcast can only be destroyed in driver\")\n self._jbroadcast.destroy(blocking)\n os.unlink(self._path)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _wrapped(self):\n\n # It is possible for a callable instance without __name__ attribute or/and\n # __module__ attribute to be wrapped here. For example, functools.partial. In this case,\n # we should avoid wrapping the attributes from the wrapped function to the wrapper\n # function. 
So, we take out these attribute names from the default names to set and\n # then manually assign it after being wrapped.\n assignments = tuple(\n a for a in functools.WRAPPER_ASSIGNMENTS if a != '__name__' and a != '__module__')\n\n @functools.wraps(self.func, assigned=assignments)\n def wrapper(*args):\n return self(*args)\n\n wrapper.__name__ = self._name\n wrapper.__module__ = (self.func.__module__ if hasattr(self.func, '__module__')\n else self.func.__class__.__module__)\n\n wrapper.func = self.func\n wrapper.returnType = self.returnType\n wrapper.evalType = self.evalType\n wrapper.deterministic = self.deterministic\n wrapper.asNondeterministic = functools.wraps(\n self.asNondeterministic)(lambda: self.asNondeterministic()._wrapped())\n return wrapper", "response": "Wrap this udf with a function and attach docstring from func\n "} {"instruction": "Can you tell what is the following Python 3 function doing\ndef register(self, name, f, returnType=None):\n\n # This is to check whether the input function is from a user-defined function or\n # Python function.\n if hasattr(f, 'asNondeterministic'):\n if returnType is not None:\n raise TypeError(\n \"Invalid returnType: data type can not be specified when f is\"\n \"a user-defined function, but got %s.\" % returnType)\n if f.evalType not in [PythonEvalType.SQL_BATCHED_UDF,\n PythonEvalType.SQL_SCALAR_PANDAS_UDF,\n PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF]:\n raise ValueError(\n \"Invalid f: f must be SQL_BATCHED_UDF, SQL_SCALAR_PANDAS_UDF or \"\n \"SQL_GROUPED_AGG_PANDAS_UDF\")\n register_udf = UserDefinedFunction(f.func, returnType=f.returnType, name=name,\n evalType=f.evalType,\n deterministic=f.deterministic)\n return_udf = f\n else:\n if returnType is None:\n returnType = StringType()\n register_udf = UserDefinedFunction(f, returnType=returnType, name=name,\n evalType=PythonEvalType.SQL_BATCHED_UDF)\n return_udf = register_udf._wrapped()\n self.sparkSession._jsparkSession.udf().registerPython(name, 
register_udf._judf)\n return return_udf", "response": "Registers a Python function or a user - defined function with the specified name."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef registerJavaFunction(self, name, javaClassName, returnType=None):\n\n jdt = None\n if returnType is not None:\n if not isinstance(returnType, DataType):\n returnType = _parse_datatype_string(returnType)\n jdt = self.sparkSession._jsparkSession.parseDataType(returnType.json())\n self.sparkSession._jsparkSession.udf().registerJava(name, javaClassName, jdt)", "response": "Register a Java user - defined function as a SQL function."} {"instruction": "Create a Python 3 function for\nregistering a Java user - defined aggregate function as a SQL function.", "response": "def registerJavaUDAF(self, name, javaClassName):\n \"\"\"Register a Java user-defined aggregate function as a SQL function.\n\n :param name: name of the user-defined aggregate function\n :param javaClassName: fully qualified name of java class\n\n >>> spark.udf.registerJavaUDAF(\"javaUDAF\", \"test.org.apache.spark.sql.MyDoubleAvg\")\n >>> df = spark.createDataFrame([(1, \"a\"),(2, \"b\"), (3, \"a\")],[\"id\", \"name\"])\n >>> df.createOrReplaceTempView(\"df\")\n >>> spark.sql(\"SELECT name, javaUDAF(id) as avg from df group by name\").collect()\n [Row(name=u'b', avg=102.0), Row(name=u'a', avg=102.0)]\n \"\"\"\n\n self.sparkSession._jsparkSession.udf().registerJavaUDAF(name, javaClassName)"} {"instruction": "How would you explain what the following Python 3 function does\ndef getOrCreate(cls, checkpointPath, setupFunc):\n cls._ensure_initialized()\n gw = SparkContext._gateway\n\n # Check whether valid checkpoint information exists in the given path\n ssc_option = gw.jvm.StreamingContextPythonHelper().tryRecoverFromCheckpoint(checkpointPath)\n if ssc_option.isEmpty():\n ssc = setupFunc()\n ssc.checkpoint(checkpointPath)\n return ssc\n\n jssc = gw.jvm.JavaStreamingContext(ssc_option.get())\n\n 
# If there is already an active instance of Python SparkContext use it, or create a new one\n if not SparkContext._active_spark_context:\n jsc = jssc.sparkContext()\n conf = SparkConf(_jconf=jsc.getConf())\n SparkContext(conf=conf, gateway=gw, jsc=jsc)\n\n sc = SparkContext._active_spark_context\n\n # update ctx in serializer\n cls._transformerSerializer.ctx = sc\n return StreamingContext(sc, None, jssc)", "response": "Create a new StreamingContext from checkpoint data."} {"instruction": "Write a Python 3 function that can\nreturn either the currently active StreamingContext or None.", "response": "def getActive(cls):\n \"\"\"\n Return either the currently active StreamingContext (i.e., if there is a context started\n but not stopped) or None.\n \"\"\"\n activePythonContext = cls._activeContext\n if activePythonContext is not None:\n # Verify that the current running Java StreamingContext is active and is the same one\n # backing the supposedly active Python context\n activePythonContextJavaId = activePythonContext._jssc.ssc().hashCode()\n activeJvmContextOption = activePythonContext._jvm.StreamingContext.getActive()\n\n if activeJvmContextOption.isEmpty():\n cls._activeContext = None\n elif activeJvmContextOption.get().hashCode() != activePythonContextJavaId:\n cls._activeContext = None\n raise Exception(\"JVM's active JavaStreamingContext is not the JavaStreamingContext \"\n \"backing the action Python StreamingContext. 
This is unexpected.\")\n return cls._activeContext"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef getActiveOrCreate(cls, checkpointPath, setupFunc):\n\n if setupFunc is None:\n raise Exception(\"setupFunc cannot be None\")\n activeContext = cls.getActive()\n if activeContext is not None:\n return activeContext\n elif checkpointPath is not None:\n return cls.getOrCreate(checkpointPath, setupFunc)\n else:\n return setupFunc()", "response": "Returns the active StreamingContext or creates a new StreamingContext if it does not exist."} {"instruction": "Write a Python 3 function that can\nwait for the execution to stop.", "response": "def awaitTermination(self, timeout=None):\n \"\"\"\n Wait for the execution to stop.\n\n @param timeout: time to wait in seconds\n \"\"\"\n if timeout is None:\n self._jssc.awaitTermination()\n else:\n self._jssc.awaitTerminationOrTimeout(int(timeout * 1000))"} {"instruction": "How would you implement a function in Python 3 that\nstops the execution of the streams.", "response": "def stop(self, stopSparkContext=True, stopGraceFully=False):\n \"\"\"\n Stop the execution of the streams, with option of ensuring all\n received data has been processed.\n\n @param stopSparkContext: Stop the associated SparkContext or not\n @param stopGracefully: Stop gracefully by waiting for the processing\n of all received data to be completed\n \"\"\"\n self._jssc.stop(stopSparkContext, stopGraceFully)\n StreamingContext._activeContext = None\n if stopSparkContext:\n self._sc.stop()"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef socketTextStream(self, hostname, port, storageLevel=StorageLevel.MEMORY_AND_DISK_2):\n jlevel = self._sc._getJavaStorageLevel(storageLevel)\n return DStream(self._jssc.socketTextStream(hostname, port, jlevel), self,\n UTF8Deserializer())", "response": "Create a TCP socket with text data received using the specified hostname and port."} {"instruction": 
"Create a Python 3 function to\ncreate an input stream that monitors a Hadoop - compatible file system for new files and reads them as text files.", "response": "def textFileStream(self, directory):\n \"\"\"\n Create an input stream that monitors a Hadoop-compatible file system\n for new files and reads them as text files. Files must be wrriten to the\n monitored directory by \"moving\" them from another location within the same\n file system. File names starting with . are ignored.\n The text files must be encoded as UTF-8.\n \"\"\"\n return DStream(self._jssc.textFileStream(directory), self, UTF8Deserializer())"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef binaryRecordsStream(self, directory, recordLength):\n return DStream(self._jssc.binaryRecordsStream(directory, recordLength), self,\n NoOpSerializer())", "response": "Create an input stream that monitors a Hadoop - compatible file system\n for new files and reads them as flat binary files with records of recordLength fixed length."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef queueStream(self, rdds, oneAtATime=True, default=None):\n if default and not isinstance(default, RDD):\n default = self._sc.parallelize(default)\n\n if not rdds and default:\n rdds = [rdds]\n\n if rdds and not isinstance(rdds[0], RDD):\n rdds = [self._sc.parallelize(input) for input in rdds]\n self._check_serializers(rdds)\n\n queue = self._jvm.PythonDStream.toRDDQueue([r._jrdd for r in rdds])\n if default:\n default = default._reserialize(rdds[0]._jrdd_deserializer)\n jdstream = self._jssc.queueStream(queue, oneAtATime, default._jrdd)\n else:\n jdstream = self._jssc.queueStream(queue, oneAtATime)\n return DStream(jdstream, self, rdds[0]._jrdd_deserializer)", "response": "Create an input stream from a queue of RDDs or list of RDDs."} {"instruction": "Can you create a Python 3 function that\ncreates a new DStream in which each RDD is generated by applying a 
function on each RDD of the DStreams in dstreams.", "response": "def transform(self, dstreams, transformFunc):\n \"\"\"\n Create a new DStream in which each RDD is generated by applying\n a function on RDDs of the DStreams. The order of the JavaRDDs in\n the transform function parameter will be the same as the order\n of corresponding DStreams in the list.\n \"\"\"\n jdstreams = [d._jdstream for d in dstreams]\n # change the final serializer to sc.serializer\n func = TransformFunction(self._sc,\n lambda t, *rdds: transformFunc(rdds),\n *[d._jrdd_deserializer for d in dstreams])\n jfunc = self._jvm.TransformFunction(func)\n jdstream = self._jssc.transform(jdstreams, jfunc)\n return DStream(jdstream, self, self._sc.serializer)"} {"instruction": "How would you explain what the following Python 3 function does\ndef union(self, *dstreams):\n if not dstreams:\n raise ValueError(\"should have at least one DStream to union\")\n if len(dstreams) == 1:\n return dstreams[0]\n if len(set(s._jrdd_deserializer for s in dstreams)) > 1:\n raise ValueError(\"All DStreams should have same serializer\")\n if len(set(s._slideDuration for s in dstreams)) > 1:\n raise ValueError(\"All DStreams should have same slide duration\")\n cls = SparkContext._jvm.org.apache.spark.streaming.api.java.JavaDStream\n jdstreams = SparkContext._gateway.new_array(cls, len(dstreams))\n for i in range(0, len(dstreams)):\n jdstreams[i] = dstreams[i]._jdstream\n return DStream(self._jssc.union(jdstreams), self, dstreams[0]._jrdd_deserializer)", "response": "Create a unified DStream from multiple DStreams of the sameMimeType and same slide duration."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef addStreamingListener(self, streamingListener):\n self._jssc.addStreamingListener(self._jvm.JavaStreamingListenerWrapper(\n self._jvm.PythonStreamingListenerWrapper(streamingListener)))", "response": "Adds a StreamingListener object for receiving system events related to the given 
stream."} {"instruction": "How would you implement a function in Python 3 that\nloads weights from a TensorFlow model in a PyTorch model.", "response": "def load_tf_weights_in_gpt2(model, gpt2_checkpoint_path):\n \"\"\" Load tf checkpoints in a pytorch model\n \"\"\"\n try:\n import re\n import numpy as np\n import tensorflow as tf\n except ImportError:\n print(\"Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see \"\n \"https://www.tensorflow.org/install/ for installation instructions.\")\n raise\n tf_path = os.path.abspath(gpt2_checkpoint_path)\n print(\"Converting TensorFlow checkpoint from {}\".format(tf_path))\n # Load weights from TF model\n init_vars = tf.train.list_variables(tf_path)\n names = []\n arrays = []\n for name, shape in init_vars:\n print(\"Loading TF weight {} with shape {}\".format(name, shape))\n array = tf.train.load_variable(tf_path, name)\n names.append(name)\n arrays.append(array.squeeze())\n\n for name, array in zip(names, arrays):\n name = name[6:] # skip \"model/\"\n name = name.split('/')\n pointer = model\n for m_name in name:\n if re.fullmatch(r'[A-Za-z]+\\d+', m_name):\n l = re.split(r'(\\d+)', m_name)\n else:\n l = [m_name]\n if l[0] == 'w' or l[0] == 'g':\n pointer = getattr(pointer, 'weight')\n elif l[0] == 'b':\n pointer = getattr(pointer, 'bias')\n elif l[0] == 'wpe' or l[0] == 'wte':\n pointer = getattr(pointer, l[0])\n pointer = getattr(pointer, 'weight')\n else:\n pointer = getattr(pointer, l[0])\n if len(l) >= 2:\n num = int(l[1])\n pointer = pointer[num]\n try:\n assert pointer.shape == array.shape\n except AssertionError as e:\n e.args += (pointer.shape, array.shape)\n raise\n print(\"Initialize PyTorch weight {}\".format(name))\n pointer.data = torch.from_numpy(array)\n return model"} {"instruction": "Make a summary of the following Python 3 code\ndef from_json_file(cls, json_file):\n with open(json_file, \"r\", encoding=\"utf-8\") as reader:\n text = reader.read()\n return 
cls.from_dict(json.loads(text))", "response": "Constructs a GPT2Config from a json file of parameters."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef to_json_file(self, json_file_path):\n with open(json_file_path, \"w\", encoding='utf-8') as writer:\n writer.write(self.to_json_string())", "response": "Save this instance to a json file."} {"instruction": "How would you explain what the following Python 3 function does\ndef convert_examples_to_features(examples, seq_length, tokenizer):\n\n features = []\n for (ex_index, example) in enumerate(examples):\n tokens_a = tokenizer.tokenize(example.text_a)\n\n tokens_b = None\n if example.text_b:\n tokens_b = tokenizer.tokenize(example.text_b)\n\n if tokens_b:\n # Modifies `tokens_a` and `tokens_b` in place so that the total\n # length is less than the specified length.\n # Account for [CLS], [SEP], [SEP] with \"- 3\"\n _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)\n else:\n # Account for [CLS] and [SEP] with \"- 2\"\n if len(tokens_a) > seq_length - 2:\n tokens_a = tokens_a[0:(seq_length - 2)]\n\n # The convention in BERT is:\n # (a) For sequence pairs:\n # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]\n # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1\n # (b) For single sequences:\n # tokens: [CLS] the dog is hairy . [SEP]\n # type_ids: 0 0 0 0 0 0 0\n #\n # Where \"type_ids\" are used to indicate whether this is the first\n # sequence or the second sequence. The embedding vectors for `type=0` and\n # `type=1` were learned during pre-training and are added to the wordpiece\n # embedding vector (and position vector). This is not *strictly* necessary\n # since the [SEP] token unambigiously separates the sequences, but it makes\n # it easier for the model to learn the concept of sequences.\n #\n # For classification tasks, the first vector (corresponding to [CLS]) is\n # used as as the \"sentence vector\". 
Note that this only makes sense because\n # the entire model is fine-tuned.\n tokens = []\n input_type_ids = []\n tokens.append(\"[CLS]\")\n input_type_ids.append(0)\n for token in tokens_a:\n tokens.append(token)\n input_type_ids.append(0)\n tokens.append(\"[SEP]\")\n input_type_ids.append(0)\n\n if tokens_b:\n for token in tokens_b:\n tokens.append(token)\n input_type_ids.append(1)\n tokens.append(\"[SEP]\")\n input_type_ids.append(1)\n\n input_ids = tokenizer.convert_tokens_to_ids(tokens)\n\n # The mask has 1 for real tokens and 0 for padding tokens. Only real\n # tokens are attended to.\n input_mask = [1] * len(input_ids)\n\n # Zero-pad up to the sequence length.\n while len(input_ids) < seq_length:\n input_ids.append(0)\n input_mask.append(0)\n input_type_ids.append(0)\n\n assert len(input_ids) == seq_length\n assert len(input_mask) == seq_length\n assert len(input_type_ids) == seq_length\n\n if ex_index < 5:\n logger.info(\"*** Example ***\")\n logger.info(\"unique_id: %s\" % (example.unique_id))\n logger.info(\"tokens: %s\" % \" \".join([str(x) for x in tokens]))\n logger.info(\"input_ids: %s\" % \" \".join([str(x) for x in input_ids]))\n logger.info(\"input_mask: %s\" % \" \".join([str(x) for x in input_mask]))\n logger.info(\n \"input_type_ids: %s\" % \" \".join([str(x) for x in input_type_ids]))\n\n features.append(\n InputFeatures(\n unique_id=example.unique_id,\n tokens=tokens,\n input_ids=input_ids,\n input_mask=input_mask,\n input_type_ids=input_type_ids))\n return features", "response": "Loads a data file into a list of InputFeature s."} {"instruction": "Make a summary of the following Python 3 code\ndef read_examples(input_file):\n examples = []\n unique_id = 0\n with open(input_file, \"r\", encoding='utf-8') as reader:\n while True:\n line = reader.readline()\n if not line:\n break\n line = line.strip()\n text_a = None\n text_b = None\n m = re.match(r\"^(.*) \\|\\|\\| (.*)$\", line)\n if m is None:\n text_a = line\n else:\n text_a = m.group(1)\n 
text_b = m.group(2)\n examples.append(\n InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))\n unique_id += 1\n return examples", "response": "Read a list of InputExample s from an input file."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef read_squad_examples(input_file, is_training, version_2_with_negative):\n with open(input_file, \"r\", encoding='utf-8') as reader:\n input_data = json.load(reader)[\"data\"]\n\n def is_whitespace(c):\n if c == \" \" or c == \"\\t\" or c == \"\\r\" or c == \"\\n\" or ord(c) == 0x202F:\n return True\n return False\n\n examples = []\n for entry in input_data:\n for paragraph in entry[\"paragraphs\"]:\n paragraph_text = paragraph[\"context\"]\n doc_tokens = []\n char_to_word_offset = []\n prev_is_whitespace = True\n for c in paragraph_text:\n if is_whitespace(c):\n prev_is_whitespace = True\n else:\n if prev_is_whitespace:\n doc_tokens.append(c)\n else:\n doc_tokens[-1] += c\n prev_is_whitespace = False\n char_to_word_offset.append(len(doc_tokens) - 1)\n\n for qa in paragraph[\"qas\"]:\n qas_id = qa[\"id\"]\n question_text = qa[\"question\"]\n start_position = None\n end_position = None\n orig_answer_text = None\n is_impossible = False\n if is_training:\n if version_2_with_negative:\n is_impossible = qa[\"is_impossible\"]\n if (len(qa[\"answers\"]) != 1) and (not is_impossible):\n raise ValueError(\n \"For training, each question should have exactly 1 answer.\")\n if not is_impossible:\n answer = qa[\"answers\"][0]\n orig_answer_text = answer[\"text\"]\n answer_offset = answer[\"answer_start\"]\n answer_length = len(orig_answer_text)\n start_position = char_to_word_offset[answer_offset]\n end_position = char_to_word_offset[answer_offset + answer_length - 1]\n # Only add answers where the text can be exactly recovered from the\n # document. 
If this CAN'T happen it's likely due to weird Unicode\n # stuff so we will just skip the example.\n #\n # Note that this means for training mode, every example is NOT\n # guaranteed to be preserved.\n actual_text = \" \".join(doc_tokens[start_position:(end_position + 1)])\n cleaned_answer_text = \" \".join(\n whitespace_tokenize(orig_answer_text))\n if actual_text.find(cleaned_answer_text) == -1:\n logger.warning(\"Could not find answer: '%s' vs. '%s'\",\n actual_text, cleaned_answer_text)\n continue\n else:\n start_position = -1\n end_position = -1\n orig_answer_text = \"\"\n\n example = SquadExample(\n qas_id=qas_id,\n question_text=question_text,\n doc_tokens=doc_tokens,\n orig_answer_text=orig_answer_text,\n start_position=start_position,\n end_position=end_position,\n is_impossible=is_impossible)\n examples.append(example)\n return examples", "response": "Read a SQuAD json file into a list of SquadExample objects."} {"instruction": "Given the following Python 3 function, write the documentation\ndef convert_examples_to_features(examples, tokenizer, max_seq_length,\n doc_stride, max_query_length, is_training):\n \"\"\"Loads a data file into a list of `InputBatch`s.\"\"\"\n\n unique_id = 1000000000\n\n features = []\n for (example_index, example) in enumerate(examples):\n query_tokens = tokenizer.tokenize(example.question_text)\n\n if len(query_tokens) > max_query_length:\n query_tokens = query_tokens[0:max_query_length]\n\n tok_to_orig_index = []\n orig_to_tok_index = []\n all_doc_tokens = []\n for (i, token) in enumerate(example.doc_tokens):\n orig_to_tok_index.append(len(all_doc_tokens))\n sub_tokens = tokenizer.tokenize(token)\n for sub_token in sub_tokens:\n tok_to_orig_index.append(i)\n all_doc_tokens.append(sub_token)\n\n tok_start_position = None\n tok_end_position = None\n if is_training and example.is_impossible:\n tok_start_position = -1\n tok_end_position = -1\n if is_training and not example.is_impossible:\n tok_start_position = 
orig_to_tok_index[example.start_position]\n if example.end_position < len(example.doc_tokens) - 1:\n tok_end_position = orig_to_tok_index[example.end_position + 1] - 1\n else:\n tok_end_position = len(all_doc_tokens) - 1\n (tok_start_position, tok_end_position) = _improve_answer_span(\n all_doc_tokens, tok_start_position, tok_end_position, tokenizer,\n example.orig_answer_text)\n\n # The -3 accounts for [CLS], [SEP] and [SEP]\n max_tokens_for_doc = max_seq_length - len(query_tokens) - 3\n\n # We can have documents that are longer than the maximum sequence length.\n # To deal with this we do a sliding window approach, where we take chunks\n # of the up to our max length with a stride of `doc_stride`.\n _DocSpan = collections.namedtuple( # pylint: disable=invalid-name\n \"DocSpan\", [\"start\", \"length\"])\n doc_spans = []\n start_offset = 0\n while start_offset < len(all_doc_tokens):\n length = len(all_doc_tokens) - start_offset\n if length > max_tokens_for_doc:\n length = max_tokens_for_doc\n doc_spans.append(_DocSpan(start=start_offset, length=length))\n if start_offset + length == len(all_doc_tokens):\n break\n start_offset += min(length, doc_stride)\n\n for (doc_span_index, doc_span) in enumerate(doc_spans):\n tokens = []\n token_to_orig_map = {}\n token_is_max_context = {}\n segment_ids = []\n tokens.append(\"[CLS]\")\n segment_ids.append(0)\n for token in query_tokens:\n tokens.append(token)\n segment_ids.append(0)\n tokens.append(\"[SEP]\")\n segment_ids.append(0)\n\n for i in range(doc_span.length):\n split_token_index = doc_span.start + i\n token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]\n\n is_max_context = _check_is_max_context(doc_spans, doc_span_index,\n split_token_index)\n token_is_max_context[len(tokens)] = is_max_context\n tokens.append(all_doc_tokens[split_token_index])\n segment_ids.append(1)\n tokens.append(\"[SEP]\")\n segment_ids.append(1)\n\n input_ids = tokenizer.convert_tokens_to_ids(tokens)\n\n # The mask has 1 for 
real tokens and 0 for padding tokens. Only real\n # tokens are attended to.\n input_mask = [1] * len(input_ids)\n\n # Zero-pad up to the sequence length.\n while len(input_ids) < max_seq_length:\n input_ids.append(0)\n input_mask.append(0)\n segment_ids.append(0)\n\n assert len(input_ids) == max_seq_length\n assert len(input_mask) == max_seq_length\n assert len(segment_ids) == max_seq_length\n\n start_position = None\n end_position = None\n if is_training and not example.is_impossible:\n # For training, if our document chunk does not contain an annotation\n # we throw it out, since there is nothing to predict.\n doc_start = doc_span.start\n doc_end = doc_span.start + doc_span.length - 1\n out_of_span = False\n if not (tok_start_position >= doc_start and\n tok_end_position <= doc_end):\n out_of_span = True\n if out_of_span:\n start_position = 0\n end_position = 0\n else:\n doc_offset = len(query_tokens) + 2\n start_position = tok_start_position - doc_start + doc_offset\n end_position = tok_end_position - doc_start + doc_offset\n if is_training and example.is_impossible:\n start_position = 0\n end_position = 0\n if example_index < 20:\n logger.info(\"*** Example ***\")\n logger.info(\"unique_id: %s\" % (unique_id))\n logger.info(\"example_index: %s\" % (example_index))\n logger.info(\"doc_span_index: %s\" % (doc_span_index))\n logger.info(\"tokens: %s\" % \" \".join(tokens))\n logger.info(\"token_to_orig_map: %s\" % \" \".join([\n \"%d:%d\" % (x, y) for (x, y) in token_to_orig_map.items()]))\n logger.info(\"token_is_max_context: %s\" % \" \".join([\n \"%d:%s\" % (x, y) for (x, y) in token_is_max_context.items()\n ]))\n logger.info(\"input_ids: %s\" % \" \".join([str(x) for x in input_ids]))\n logger.info(\n \"input_mask: %s\" % \" \".join([str(x) for x in input_mask]))\n logger.info(\n \"segment_ids: %s\" % \" \".join([str(x) for x in segment_ids]))\n if is_training and example.is_impossible:\n logger.info(\"impossible example\")\n if is_training and not 
example.is_impossible:\n answer_text = \" \".join(tokens[start_position:(end_position + 1)])\n logger.info(\"start_position: %d\" % (start_position))\n logger.info(\"end_position: %d\" % (end_position))\n logger.info(\n \"answer: %s\" % (answer_text))\n\n features.append(\n InputFeatures(\n unique_id=unique_id,\n example_index=example_index,\n doc_span_index=doc_span_index,\n tokens=tokens,\n token_to_orig_map=token_to_orig_map,\n token_is_max_context=token_is_max_context,\n input_ids=input_ids,\n input_mask=input_mask,\n segment_ids=segment_ids,\n start_position=start_position,\n end_position=end_position,\n is_impossible=example.is_impossible))\n unique_id += 1\n\n return features", "response": "Loads a data file into a list of InputBatches."} {"instruction": "Can you create a Python 3 function that\nreturns tokenized answer spans that better match the annotated answer.", "response": "def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,\n orig_answer_text):\n \"\"\"Returns tokenized answer spans that better match the annotated answer.\"\"\"\n\n # The SQuAD annotations are character based. We first project them to\n # whitespace-tokenized words. But then after WordPiece tokenization, we can\n # often find a \"better match\". For example:\n #\n # Question: What year was John Smith born?\n # Context: The leader was John Smith (1895-1943).\n # Answer: 1895\n #\n # The original whitespace-tokenized answer will be \"(1895-1943).\". However\n # after tokenization, our tokens will be \"( 1895 - 1943 ) .\". So we can match\n # the exact answer, 1895.\n #\n # However, this is not always possible. Consider the following:\n #\n # Question: What country is the top exporter of electornics?\n # Context: The Japanese electronics industry is the lagest in the world.\n # Answer: Japan\n #\n # In this case, the annotator chose \"Japan\" as a character sub-span of\n # the word \"Japanese\". 
Since our WordPiece tokenizer does not split\n # \"Japanese\", we just use \"Japanese\" as the annotation. This is fairly rare\n # in SQuAD, but does happen.\n tok_answer_text = \" \".join(tokenizer.tokenize(orig_answer_text))\n\n for new_start in range(input_start, input_end + 1):\n for new_end in range(input_end, new_start - 1, -1):\n text_span = \" \".join(doc_tokens[new_start:(new_end + 1)])\n if text_span == tok_answer_text:\n return (new_start, new_end)\n\n return (input_start, input_end)"} {"instruction": "Can you implement a function in Python 3 that\nchecks if this token is the max context for the token.", "response": "def _check_is_max_context(doc_spans, cur_span_index, position):\n \"\"\"Check if this is the 'max context' doc span for the token.\"\"\"\n\n # Because of the sliding window approach taken to scoring documents, a single\n # token can appear in multiple documents. E.g.\n # Doc: the man went to the store and bought a gallon of milk\n # Span A: the man went to the\n # Span B: to the store and bought\n # Span C: and bought a gallon of\n # ...\n #\n # Now the word 'bought' will have two scores from spans B and C. 
We only\n # want to consider the score with \"maximum context\", which we define as\n # the *minimum* of its left and right context (the *sum* of left and\n # right context will always be the same, of course).\n #\n # In the example the maximum context for 'bought' would be span C since\n # it has 1 left context and 3 right context, while span B has 4 left context\n # and 0 right context.\n best_score = None\n best_span_index = None\n for (span_index, doc_span) in enumerate(doc_spans):\n end = doc_span.start + doc_span.length - 1\n if position < doc_span.start:\n continue\n if position > end:\n continue\n num_left_context = position - doc_span.start\n num_right_context = end - position\n score = min(num_left_context, num_right_context) + 0.01 * doc_span.length\n if best_score is None or score > best_score:\n best_score = score\n best_span_index = span_index\n\n return cur_span_index == best_span_index"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef write_predictions(all_examples, all_features, all_results, n_best_size,\n max_answer_length, do_lower_case, output_prediction_file,\n output_nbest_file, output_null_log_odds_file, verbose_logging,\n version_2_with_negative, null_score_diff_threshold):\n \"\"\"Write final predictions to the json file and log-odds of null if needed.\"\"\"\n logger.info(\"Writing predictions to: %s\" % (output_prediction_file))\n logger.info(\"Writing nbest to: %s\" % (output_nbest_file))\n\n example_index_to_features = collections.defaultdict(list)\n for feature in all_features:\n example_index_to_features[feature.example_index].append(feature)\n\n unique_id_to_result = {}\n for result in all_results:\n unique_id_to_result[result.unique_id] = result\n\n _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name\n \"PrelimPrediction\",\n [\"feature_index\", \"start_index\", \"end_index\", \"start_logit\", \"end_logit\"])\n\n all_predictions = collections.OrderedDict()\n all_nbest_json = 
collections.OrderedDict()\n scores_diff_json = collections.OrderedDict()\n\n for (example_index, example) in enumerate(all_examples):\n features = example_index_to_features[example_index]\n\n prelim_predictions = []\n # keep track of the minimum score of null start+end of position 0\n score_null = 1000000 # large and positive\n min_null_feature_index = 0 # the paragraph slice with min null score\n null_start_logit = 0 # the start logit at the slice with min null score\n null_end_logit = 0 # the end logit at the slice with min null score\n for (feature_index, feature) in enumerate(features):\n result = unique_id_to_result[feature.unique_id]\n start_indexes = _get_best_indexes(result.start_logits, n_best_size)\n end_indexes = _get_best_indexes(result.end_logits, n_best_size)\n # if we could have irrelevant answers, get the min score of irrelevant\n if version_2_with_negative:\n feature_null_score = result.start_logits[0] + result.end_logits[0]\n if feature_null_score < score_null:\n score_null = feature_null_score\n min_null_feature_index = feature_index\n null_start_logit = result.start_logits[0]\n null_end_logit = result.end_logits[0]\n for start_index in start_indexes:\n for end_index in end_indexes:\n # We could hypothetically create invalid predictions, e.g., predict\n # that the start of the span is in the question. 
We throw out all\n # invalid predictions.\n if start_index >= len(feature.tokens):\n continue\n if end_index >= len(feature.tokens):\n continue\n if start_index not in feature.token_to_orig_map:\n continue\n if end_index not in feature.token_to_orig_map:\n continue\n if not feature.token_is_max_context.get(start_index, False):\n continue\n if end_index < start_index:\n continue\n length = end_index - start_index + 1\n if length > max_answer_length:\n continue\n prelim_predictions.append(\n _PrelimPrediction(\n feature_index=feature_index,\n start_index=start_index,\n end_index=end_index,\n start_logit=result.start_logits[start_index],\n end_logit=result.end_logits[end_index]))\n if version_2_with_negative:\n prelim_predictions.append(\n _PrelimPrediction(\n feature_index=min_null_feature_index,\n start_index=0,\n end_index=0,\n start_logit=null_start_logit,\n end_logit=null_end_logit))\n prelim_predictions = sorted(\n prelim_predictions,\n key=lambda x: (x.start_logit + x.end_logit),\n reverse=True)\n\n _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name\n \"NbestPrediction\", [\"text\", \"start_logit\", \"end_logit\"])\n\n seen_predictions = {}\n nbest = []\n for pred in prelim_predictions:\n if len(nbest) >= n_best_size:\n break\n feature = features[pred.feature_index]\n if pred.start_index > 0: # this is a non-null prediction\n tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]\n orig_doc_start = feature.token_to_orig_map[pred.start_index]\n orig_doc_end = feature.token_to_orig_map[pred.end_index]\n orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]\n tok_text = \" \".join(tok_tokens)\n\n # De-tokenize WordPieces that have been split off.\n tok_text = tok_text.replace(\" ##\", \"\")\n tok_text = tok_text.replace(\"##\", \"\")\n\n # Clean whitespace\n tok_text = tok_text.strip()\n tok_text = \" \".join(tok_text.split())\n orig_text = \" \".join(orig_tokens)\n\n final_text = get_final_text(tok_text, orig_text, 
do_lower_case, verbose_logging)\n if final_text in seen_predictions:\n continue\n\n seen_predictions[final_text] = True\n else:\n final_text = \"\"\n seen_predictions[final_text] = True\n\n nbest.append(\n _NbestPrediction(\n text=final_text,\n start_logit=pred.start_logit,\n end_logit=pred.end_logit))\n # if we didn't include the empty option in the n-best, include it\n if version_2_with_negative:\n if \"\" not in seen_predictions:\n nbest.append(\n _NbestPrediction(\n text=\"\",\n start_logit=null_start_logit,\n end_logit=null_end_logit))\n \n # In very rare edge cases we could only have single null prediction.\n # So we just create a nonce prediction in this case to avoid failure.\n if len(nbest)==1:\n nbest.insert(0,\n _NbestPrediction(text=\"empty\", start_logit=0.0, end_logit=0.0))\n \n # In very rare edge cases we could have no valid predictions. So we\n # just create a nonce prediction in this case to avoid failure.\n if not nbest:\n nbest.append(\n _NbestPrediction(text=\"empty\", start_logit=0.0, end_logit=0.0))\n\n assert len(nbest) >= 1\n\n total_scores = []\n best_non_null_entry = None\n for entry in nbest:\n total_scores.append(entry.start_logit + entry.end_logit)\n if not best_non_null_entry:\n if entry.text:\n best_non_null_entry = entry\n\n probs = _compute_softmax(total_scores)\n\n nbest_json = []\n for (i, entry) in enumerate(nbest):\n output = collections.OrderedDict()\n output[\"text\"] = entry.text\n output[\"probability\"] = probs[i]\n output[\"start_logit\"] = entry.start_logit\n output[\"end_logit\"] = entry.end_logit\n nbest_json.append(output)\n\n assert len(nbest_json) >= 1\n\n if not version_2_with_negative:\n all_predictions[example.qas_id] = nbest_json[0][\"text\"]\n else:\n # predict \"\" iff the null score - the score of best non-null > threshold\n score_diff = score_null - best_non_null_entry.start_logit - (\n best_non_null_entry.end_logit)\n scores_diff_json[example.qas_id] = score_diff\n if score_diff > 
null_score_diff_threshold:\n all_predictions[example.qas_id] = \"\"\n else:\n all_predictions[example.qas_id] = best_non_null_entry.text\n all_nbest_json[example.qas_id] = nbest_json\n\n with open(output_prediction_file, \"w\") as writer:\n writer.write(json.dumps(all_predictions, indent=4) + \"\\n\")\n\n with open(output_nbest_file, \"w\") as writer:\n writer.write(json.dumps(all_nbest_json, indent=4) + \"\\n\")\n\n if version_2_with_negative:\n with open(output_null_log_odds_file, \"w\") as writer:\n writer.write(json.dumps(scores_diff_json, indent=4) + \"\\n\")", "response": "Write predictions and nbest to the json file and log -odds of null if needed."} {"instruction": "Can you write a function in Python 3 where it\nprojects the tokenized prediction back to the original text.", "response": "def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):\n \"\"\"Project the tokenized prediction back to the original text.\"\"\"\n\n # When we created the data, we kept track of the alignment between original\n # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So\n # now `orig_text` contains the span of our original text corresponding to the\n # span that we predicted.\n #\n # However, `orig_text` may contain extra characters that we don't want in\n # our prediction.\n #\n # For example, let's say:\n # pred_text = steve smith\n # orig_text = Steve Smith's\n #\n # We don't want to return `orig_text` because it contains the extra \"'s\".\n #\n # We don't want to return `pred_text` because it's already been normalized\n # (the SQuAD eval script also does punctuation stripping/lower casing but\n # our tokenizer does additional normalization like stripping accent\n # characters).\n #\n # What we really want to return is \"Steve Smith\".\n #\n # Therefore, we have to apply a semi-complicated alignment heuristic between\n # `pred_text` and `orig_text` to get a character-to-character alignment. 
This\n # can fail in certain cases in which case we just return `orig_text`.\n\n def _strip_spaces(text):\n ns_chars = []\n ns_to_s_map = collections.OrderedDict()\n for (i, c) in enumerate(text):\n if c == \" \":\n continue\n ns_to_s_map[len(ns_chars)] = i\n ns_chars.append(c)\n ns_text = \"\".join(ns_chars)\n return (ns_text, ns_to_s_map)\n\n # We first tokenize `orig_text`, strip whitespace from the result\n # and `pred_text`, and check if they are the same length. If they are\n # NOT the same length, the heuristic has failed. If they are the same\n # length, we assume the characters are one-to-one aligned.\n tokenizer = BasicTokenizer(do_lower_case=do_lower_case)\n\n tok_text = \" \".join(tokenizer.tokenize(orig_text))\n\n start_position = tok_text.find(pred_text)\n if start_position == -1:\n if verbose_logging:\n logger.info(\n \"Unable to find text: '%s' in '%s'\" % (pred_text, orig_text))\n return orig_text\n end_position = start_position + len(pred_text) - 1\n\n (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)\n (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)\n\n if len(orig_ns_text) != len(tok_ns_text):\n if verbose_logging:\n logger.info(\"Length not equal after stripping spaces: '%s' vs '%s'\",\n orig_ns_text, tok_ns_text)\n return orig_text\n\n # We then project the characters in `pred_text` back to `orig_text` using\n # the character-to-character alignment.\n tok_s_to_ns_map = {}\n for (i, tok_index) in tok_ns_to_s_map.items():\n tok_s_to_ns_map[tok_index] = i\n\n orig_start_position = None\n if start_position in tok_s_to_ns_map:\n ns_start_position = tok_s_to_ns_map[start_position]\n if ns_start_position in orig_ns_to_s_map:\n orig_start_position = orig_ns_to_s_map[ns_start_position]\n\n if orig_start_position is None:\n if verbose_logging:\n logger.info(\"Couldn't map start position\")\n return orig_text\n\n orig_end_position = None\n if end_position in tok_s_to_ns_map:\n ns_end_position = tok_s_to_ns_map[end_position]\n if 
ns_end_position in orig_ns_to_s_map:\n orig_end_position = orig_ns_to_s_map[ns_end_position]\n\n if orig_end_position is None:\n if verbose_logging:\n logger.info(\"Couldn't map end position\")\n return orig_text\n\n output_text = orig_text[orig_start_position:(orig_end_position + 1)]\n return output_text"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _get_best_indexes(logits, n_best_size):\n index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)\n\n best_indexes = []\n for i in range(len(index_and_score)):\n if i >= n_best_size:\n break\n best_indexes.append(index_and_score[i][0])\n return best_indexes", "response": "Get the n - best logits from a list."} {"instruction": "How would you code a function in Python 3 to\ncompute softmax probability over raw logits.", "response": "def _compute_softmax(scores):\n \"\"\"Compute softmax probability over raw logits.\"\"\"\n if not scores:\n return []\n\n max_score = None\n for score in scores:\n if max_score is None or score > max_score:\n max_score = score\n\n exp_scores = []\n total_sum = 0.0\n for score in scores:\n x = math.exp(score - max_score)\n exp_scores.append(x)\n total_sum += x\n\n probs = []\n for score in exp_scores:\n probs.append(score / total_sum)\n return probs"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef convert_examples_to_features(examples, tokenizer, max_seq_length,\n is_training):\n \"\"\"Loads a data file into a list of `InputBatch`s.\"\"\"\n\n # Swag is a multiple choice task. To perform this task using Bert,\n # we will use the formatting proposed in \"Improving Language\n # Understanding by Generative Pre-Training\" and suggested by\n # @jacobdevlin-google in this issue\n # https://github.com/google-research/bert/issues/38.\n #\n # Each choice will correspond to a sample on which we run the\n # inference. 
For a given Swag example, we will create the 4\n # following inputs:\n # - [CLS] context [SEP] choice_1 [SEP]\n # - [CLS] context [SEP] choice_2 [SEP]\n # - [CLS] context [SEP] choice_3 [SEP]\n # - [CLS] context [SEP] choice_4 [SEP]\n # The model will output a single value for each input. To get the\n # final decision of the model, we will run a softmax over these 4\n # outputs.\n features = []\n for example_index, example in enumerate(examples):\n context_tokens = tokenizer.tokenize(example.context_sentence)\n start_ending_tokens = tokenizer.tokenize(example.start_ending)\n\n choices_features = []\n for ending_index, ending in enumerate(example.endings):\n # We create a copy of the context tokens in order to be\n # able to shrink it according to ending_tokens\n context_tokens_choice = context_tokens[:]\n ending_tokens = start_ending_tokens + tokenizer.tokenize(ending)\n # Modifies `context_tokens_choice` and `ending_tokens` in\n # place so that the total length is less than the\n # specified length. 
Account for [CLS], [SEP], [SEP] with\n # \"- 3\"\n _truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3)\n\n tokens = [\"[CLS]\"] + context_tokens_choice + [\"[SEP]\"] + ending_tokens + [\"[SEP]\"]\n segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * (len(ending_tokens) + 1)\n\n input_ids = tokenizer.convert_tokens_to_ids(tokens)\n input_mask = [1] * len(input_ids)\n\n # Zero-pad up to the sequence length.\n padding = [0] * (max_seq_length - len(input_ids))\n input_ids += padding\n input_mask += padding\n segment_ids += padding\n\n assert len(input_ids) == max_seq_length\n assert len(input_mask) == max_seq_length\n assert len(segment_ids) == max_seq_length\n\n choices_features.append((tokens, input_ids, input_mask, segment_ids))\n\n label = example.label\n if example_index < 5:\n logger.info(\"*** Example ***\")\n logger.info(\"swag_id: {}\".format(example.swag_id))\n for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):\n logger.info(\"choice: {}\".format(choice_idx))\n logger.info(\"tokens: {}\".format(' '.join(tokens)))\n logger.info(\"input_ids: {}\".format(' '.join(map(str, input_ids))))\n logger.info(\"input_mask: {}\".format(' '.join(map(str, input_mask))))\n logger.info(\"segment_ids: {}\".format(' '.join(map(str, segment_ids))))\n if is_training:\n logger.info(\"label: {}\".format(label))\n\n features.append(\n InputFeatures(\n example_id = example.swag_id,\n choices_features = choices_features,\n label = label\n )\n )\n\n return features", "response": "Loads a data file into a list of InputBatch s."} {"instruction": "Explain what the following Python 3 code does\ndef convert_examples_to_features(examples, label_list, max_seq_length,\n tokenizer, output_mode):\n \"\"\"Loads a data file into a list of `InputBatch`s.\"\"\"\n\n label_map = {label : i for i, label in enumerate(label_list)}\n\n features = []\n for (ex_index, example) in enumerate(examples):\n if ex_index % 10000 == 0:\n 
logger.info(\"Writing example %d of %d\" % (ex_index, len(examples)))\n\n tokens_a = tokenizer.tokenize(example.text_a)\n\n tokens_b = None\n if example.text_b:\n tokens_b = tokenizer.tokenize(example.text_b)\n # Modifies `tokens_a` and `tokens_b` in place so that the total\n # length is less than the specified length.\n # Account for [CLS], [SEP], [SEP] with \"- 3\"\n _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)\n else:\n # Account for [CLS] and [SEP] with \"- 2\"\n if len(tokens_a) > max_seq_length - 2:\n tokens_a = tokens_a[:(max_seq_length - 2)]\n\n # The convention in BERT is:\n # (a) For sequence pairs:\n # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]\n # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1\n # (b) For single sequences:\n # tokens: [CLS] the dog is hairy . [SEP]\n # type_ids: 0 0 0 0 0 0 0\n #\n # Where \"type_ids\" are used to indicate whether this is the first\n # sequence or the second sequence. The embedding vectors for `type=0` and\n # `type=1` were learned during pre-training and are added to the wordpiece\n # embedding vector (and position vector). This is not *strictly* necessary\n # since the [SEP] token unambiguously separates the sequences, but it makes\n # it easier for the model to learn the concept of sequences.\n #\n # For classification tasks, the first vector (corresponding to [CLS]) is\n # used as as the \"sentence vector\". Note that this only makes sense because\n # the entire model is fine-tuned.\n tokens = [\"[CLS]\"] + tokens_a + [\"[SEP]\"]\n segment_ids = [0] * len(tokens)\n\n if tokens_b:\n tokens += tokens_b + [\"[SEP]\"]\n segment_ids += [1] * (len(tokens_b) + 1)\n\n input_ids = tokenizer.convert_tokens_to_ids(tokens)\n\n # The mask has 1 for real tokens and 0 for padding tokens. 
Only real\n # tokens are attended to.\n input_mask = [1] * len(input_ids)\n\n # Zero-pad up to the sequence length.\n padding = [0] * (max_seq_length - len(input_ids))\n input_ids += padding\n input_mask += padding\n segment_ids += padding\n\n assert len(input_ids) == max_seq_length\n assert len(input_mask) == max_seq_length\n assert len(segment_ids) == max_seq_length\n\n if output_mode == \"classification\":\n label_id = label_map[example.label]\n elif output_mode == \"regression\":\n label_id = float(example.label)\n else:\n raise KeyError(output_mode)\n\n if ex_index < 5:\n logger.info(\"*** Example ***\")\n logger.info(\"guid: %s\" % (example.guid))\n logger.info(\"tokens: %s\" % \" \".join(\n [str(x) for x in tokens]))\n logger.info(\"input_ids: %s\" % \" \".join([str(x) for x in input_ids]))\n logger.info(\"input_mask: %s\" % \" \".join([str(x) for x in input_mask]))\n logger.info(\n \"segment_ids: %s\" % \" \".join([str(x) for x in segment_ids]))\n logger.info(\"label: %s (id = %d)\" % (example.label, label_id))\n\n features.append(\n InputFeatures(input_ids=input_ids,\n input_mask=input_mask,\n segment_ids=segment_ids,\n label_id=label_id))\n return features", "response": "Converts a list of examples into a list of InputBatches."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _read_tsv(cls, input_file, quotechar=None):\n with open(input_file, \"r\", encoding=\"utf-8\") as f:\n reader = csv.reader(f, delimiter=\"\\t\", quotechar=quotechar)\n lines = []\n for line in reader:\n if sys.version_info[0] == 2:\n line = list(unicode(cell, 'utf-8') for cell in line)\n lines.append(line)\n return lines", "response": "Reads a tab separated value file."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef get_train_examples(self, data_dir):\n logger.info(\"LOOKING AT {}\".format(os.path.join(data_dir, \"train.tsv\")))\n return self._create_examples(\n self._read_tsv(os.path.join(data_dir, \"train.tsv\")), 
\"train\")", "response": "Get the training set of all examples."} {"instruction": "Write a Python 3 script to\ncreate examples for the training and dev sets.", "response": "def _create_examples(self, lines, set_type):\n \"\"\"Creates examples for the training and dev sets.\"\"\"\n examples = []\n for (i, line) in enumerate(lines):\n if i == 0:\n continue\n guid = \"%s-%s\" % (set_type, i)\n text_a = line[3]\n text_b = line[4]\n label = line[0]\n examples.append(\n InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))\n return examples"} {"instruction": "Create a Python 3 function for\ngetting the train. tsv file and create the examples.", "response": "def get_train_examples(self, data_dir):\n \"\"\"See base class.\"\"\"\n return self._create_examples(\n self._read_tsv(os.path.join(data_dir, \"train.tsv\")), \"train\")"} {"instruction": "Implement a Python 3 function for\ngetting examples from the dev_matched. tsv file.", "response": "def get_dev_examples(self, data_dir):\n \"\"\"See base class.\"\"\"\n return self._create_examples(\n self._read_tsv(os.path.join(data_dir, \"dev_matched.tsv\")),\n \"dev_matched\")"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef top_k_logits(logits, k):\n if k == 0:\n return logits\n else:\n values = torch.topk(logits, k)[0]\n batch_mins = values[:, -1].view(-1, 1).expand_as(logits)\n return torch.where(logits < batch_mins, torch.ones_like(logits) * -1e10, logits)", "response": "Returns logits for the top k top entries."} {"instruction": "How would you explain what the following Python 3 function does\ndef load_tf_weights_in_bert(model, tf_checkpoint_path):\n try:\n import re\n import numpy as np\n import tensorflow as tf\n except ImportError:\n print(\"Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. 
Please see \"\n \"https://www.tensorflow.org/install/ for installation instructions.\")\n raise\n tf_path = os.path.abspath(tf_checkpoint_path)\n print(\"Converting TensorFlow checkpoint from {}\".format(tf_path))\n # Load weights from TF model\n init_vars = tf.train.list_variables(tf_path)\n names = []\n arrays = []\n for name, shape in init_vars:\n print(\"Loading TF weight {} with shape {}\".format(name, shape))\n array = tf.train.load_variable(tf_path, name)\n names.append(name)\n arrays.append(array)\n\n for name, array in zip(names, arrays):\n name = name.split('/')\n # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v\n # which are not required for using pretrained model\n if any(n in [\"adam_v\", \"adam_m\", \"global_step\"] for n in name):\n print(\"Skipping {}\".format(\"/\".join(name)))\n continue\n pointer = model\n for m_name in name:\n if re.fullmatch(r'[A-Za-z]+_\\d+', m_name):\n l = re.split(r'_(\\d+)', m_name)\n else:\n l = [m_name]\n if l[0] == 'kernel' or l[0] == 'gamma':\n pointer = getattr(pointer, 'weight')\n elif l[0] == 'output_bias' or l[0] == 'beta':\n pointer = getattr(pointer, 'bias')\n elif l[0] == 'output_weights':\n pointer = getattr(pointer, 'weight')\n elif l[0] == 'squad':\n pointer = getattr(pointer, 'classifier')\n else:\n try:\n pointer = getattr(pointer, l[0])\n except AttributeError:\n print(\"Skipping {}\".format(\"/\".join(name)))\n continue\n if len(l) >= 2:\n num = int(l[1])\n pointer = pointer[num]\n if m_name[-11:] == '_embeddings':\n pointer = getattr(pointer, 'weight')\n elif m_name == 'kernel':\n array = np.transpose(array)\n try:\n assert pointer.shape == array.shape\n except AssertionError as e:\n e.args += (pointer.shape, array.shape)\n raise\n print(\"Initialize PyTorch weight {}\".format(name))\n pointer.data = torch.from_numpy(array)\n return model", "response": "Load weights from a TensorFlow model and a checkpoint file."} {"instruction": "Here you have a function in Python 
3, explain what it does\ndef load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path):\n import re\n import numpy as np\n print(\"Loading weights...\")\n names = json.load(open(openai_checkpoint_folder_path + '/parameters_names.json', \"r\", encoding='utf-8'))\n shapes = json.load(open(openai_checkpoint_folder_path + '/params_shapes.json', \"r\", encoding='utf-8'))\n offsets = np.cumsum([np.prod(shape) for shape in shapes])\n init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)]\n init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]\n init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]\n\n # This was used when we had a single embedding matrix for positions and tokens\n # init_params[0] = np.concatenate([init_params[1], init_params[0]], 0)\n # del init_params[1]\n init_params = [arr.squeeze() for arr in init_params]\n\n try:\n assert model.tokens_embed.weight.shape == init_params[1].shape\n assert model.positions_embed.weight.shape == init_params[0].shape\n except AssertionError as e:\n e.args += (model.tokens_embed.weight.shape, init_params[1].shape)\n e.args += (model.positions_embed.weight.shape, init_params[0].shape)\n raise\n\n model.tokens_embed.weight.data = torch.from_numpy(init_params[1])\n model.positions_embed.weight.data = torch.from_numpy(init_params[0])\n names.pop(0)\n # Pop position and token embedding arrays\n init_params.pop(0)\n init_params.pop(0)\n\n for name, array in zip(names, init_params): # names[1:n_transfer], init_params[1:n_transfer]):\n name = name[6:] # skip \"model/\"\n assert name[-2:] == \":0\"\n name = name[:-2]\n name = name.split('/')\n pointer = model\n for m_name in name:\n if re.fullmatch(r'[A-Za-z]+\\d+', m_name):\n l = re.split(r'(\\d+)', m_name)\n else:\n l = [m_name]\n if l[0] == 'g':\n pointer = getattr(pointer, 'weight')\n elif l[0] == 'b':\n pointer = getattr(pointer, 'bias')\n elif l[0] == 'w':\n pointer = 
getattr(pointer, 'weight')\n else:\n pointer = getattr(pointer, l[0])\n if len(l) >= 2:\n num = int(l[1])\n pointer = pointer[num]\n try:\n assert pointer.shape == array.shape\n except AssertionError as e:\n e.args += (pointer.shape, array.shape)\n raise\n try:\n assert pointer.shape == array.shape\n except AssertionError as e:\n e.args += (pointer.shape, array.shape)\n raise\n print(\"Initialize PyTorch weight {}\".format(name))\n pointer.data = torch.from_numpy(array)\n return model", "response": "Load tf pre - trained weights in a pytorch model."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef from_dict(cls, json_object):\n config = OpenAIGPTConfig(vocab_size_or_config_json_file=-1)\n for key, value in json_object.items():\n config.__dict__[key] = value\n return config", "response": "Constructs a OpenAIGPTConfig from a Python dictionary of parameters."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef set_num_special_tokens(self, num_special_tokens):\n \" Update input embeddings with new embedding matrice if needed \"\n if self.config.n_special == num_special_tokens:\n return\n # Update config\n self.config.n_special = num_special_tokens\n # Build new embeddings and initialize all new embeddings (in particular the special tokens)\n old_embed = self.tokens_embed\n self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)\n self.tokens_embed.to(old_embed.weight.device)\n self.init_weights(self.tokens_embed)\n # Copy word embeddings from the previous weights\n self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]", "response": "Update input embeddings with new embedding matrice if needed"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef set_num_special_tokens(self, num_special_tokens):\n self.transformer.set_num_special_tokens(num_special_tokens)\n 
self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight)", "response": "Update input and output embeddings with new embedding"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef step(self, closure=None):\n loss = None\n if closure is not None:\n loss = closure()\n\n for group in self.param_groups:\n for p in group['params']:\n if p.grad is None:\n continue\n grad = p.grad.data\n if grad.is_sparse:\n raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')\n\n state = self.state[p]\n\n # State initialization\n if len(state) == 0:\n state['step'] = 0\n # Exponential moving average of gradient values\n state['exp_avg'] = torch.zeros_like(p.data)\n # Exponential moving average of squared gradient values\n state['exp_avg_sq'] = torch.zeros_like(p.data)\n\n exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']\n beta1, beta2 = group['b1'], group['b2']\n\n state['step'] += 1\n\n # Add grad clipping\n if group['max_grad_norm'] > 0:\n clip_grad_norm_(p, group['max_grad_norm'])\n\n # Decay the first and second moment running average coefficient\n exp_avg.mul_(beta1).add_(1 - beta1, grad)\n exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)\n denom = exp_avg_sq.sqrt().add_(group['e'])\n\n bias_correction1 = 1 - beta1 ** state['step']\n bias_correction2 = 1 - beta2 ** state['step']\n\n lr_scheduled = group['lr']\n lr_scheduled *= group['schedule'].get_lr(state['step'])\n\n step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1\n\n p.data.addcdiv_(-step_size, exp_avg, denom)\n\n # Add weight decay at the end (fixed version)\n if (len(p.size()) > 1 or group['vector_l2']) and group['weight_decay'] > 0:\n p.data.add_(-lr_scheduled * group['weight_decay'], p.data)\n\n return loss", "response": "Performs a single optimization step."} {"instruction": "How would you code a function in Python 3 to\nreturn the learning rate multiplier for the current update.", 
"response": "def get_lr(self, step, nowarn=False):\n \"\"\"\n :param step: which of t_total steps we're on\n :param nowarn: set to True to suppress warning regarding training beyond specified 't_total' steps\n :return: learning rate multiplier for current update\n \"\"\"\n if self.t_total < 0:\n return 1.\n progress = float(step) / self.t_total\n ret = self.get_lr_(progress)\n # warning for exceeding t_total (only active with warmup_linear\n if not nowarn and self.warn_t_total and progress > 1. and progress > self.warned_for_t_total_at_progress:\n logger.warning(\n \"Training beyond specified 't_total'. Learning rate multiplier set to {}. Please set 't_total' of {} correctly.\"\n .format(ret, self.__class__.__name__))\n self.warned_for_t_total_at_progress = progress\n # end warning\n return ret"} {"instruction": "How would you implement a function in Python 3 that\nperforms a single optimization step.", "response": "def step(self, closure=None):\n \"\"\"Performs a single optimization step.\n\n Arguments:\n closure (callable, optional): A closure that reevaluates the model\n and returns the loss.\n \"\"\"\n loss = None\n if closure is not None:\n loss = closure()\n\n for group in self.param_groups:\n for p in group['params']:\n if p.grad is None:\n continue\n grad = p.grad.data\n if grad.is_sparse:\n raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')\n\n state = self.state[p]\n\n # State initialization\n if len(state) == 0:\n state['step'] = 0\n # Exponential moving average of gradient values\n state['next_m'] = torch.zeros_like(p.data)\n # Exponential moving average of squared gradient values\n state['next_v'] = torch.zeros_like(p.data)\n\n next_m, next_v = state['next_m'], state['next_v']\n beta1, beta2 = group['b1'], group['b2']\n\n # Add grad clipping\n if group['max_grad_norm'] > 0:\n clip_grad_norm_(p, group['max_grad_norm'])\n\n # Decay the first and second moment running average coefficient\n # In-place 
operations to update the averages at the same time\n next_m.mul_(beta1).add_(1 - beta1, grad)\n next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad)\n update = next_m / (next_v.sqrt() + group['e'])\n\n # Just adding the square of the weights to the loss function is *not*\n # the correct way of using L2 regularization/weight decay with Adam,\n # since that will interact with the m and v parameters in strange ways.\n #\n # Instead we want to decay the weights in a manner that doesn't interact\n # with the m/v parameters. This is equivalent to adding the square\n # of the weights to the loss with plain (non-momentum) SGD.\n if group['weight_decay'] > 0.0:\n update += group['weight_decay'] * p.data\n\n lr_scheduled = group['lr']\n lr_scheduled *= group['schedule'].get_lr(state['step'])\n\n update_with_lr = lr_scheduled * update\n p.data.add_(-update_with_lr)\n\n state['step'] += 1\n\n # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1\n # No bias correction\n # bias_correction1 = 1 - beta1 ** state['step']\n # bias_correction2 = 1 - beta2 ** state['step']\n\n return loss"} {"instruction": "Explain what the following Python 3 code does\ndef whitespace_tokenize(text):\n text = text.strip()\n if not text:\n return []\n tokens = text.split()\n return tokens", "response": "Runs basic whitespace cleaning and splitting on a piece of text."} {"instruction": "Write a Python 3 function that can\ncheck whether chars is a punctuation character.", "response": "def _is_punctuation(char):\n \"\"\"Checks whether `chars` is a punctuation character.\"\"\"\n cp = ord(char)\n # We treat all non-letter/number ASCII as punctuation.\n # Characters such as \"^\", \"$\", and \"`\" are not in the Unicode\n # Punctuation class but we treat them as punctuation anyways, for\n # consistency.\n if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or\n (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):\n return True\n cat = unicodedata.category(char)\n if 
cat.startswith(\"P\"):\n return True\n return False"} {"instruction": "Write a Python 3 script to\nconvert a sequence of tokens into ids using the vocab.", "response": "def convert_tokens_to_ids(self, tokens):\n \"\"\"Converts a sequence of tokens into ids using the vocab.\"\"\"\n ids = []\n for token in tokens:\n ids.append(self.vocab[token])\n if len(ids) > self.max_len:\n logger.warning(\n \"Token indices sequence length is longer than the specified maximum \"\n \" sequence length for this BERT model ({} > {}). Running this\"\n \" sequence through BERT will result in indexing errors\".format(len(ids), self.max_len)\n )\n return ids"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef convert_ids_to_tokens(self, ids):\n tokens = []\n for i in ids:\n tokens.append(self.ids_to_tokens[i])\n return tokens", "response": "Converts a sequence of ids in wordpiece tokens using the vocab."} {"instruction": "How would you code a function in Python 3 to\nsave the tokenizer vocabulary to a file or directory.", "response": "def save_vocabulary(self, vocab_path):\n \"\"\"Save the tokenizer vocabulary to a directory or file.\"\"\"\n index = 0\n if os.path.isdir(vocab_path):\n vocab_file = os.path.join(vocab_path, VOCAB_NAME)\n with open(vocab_file, \"w\", encoding=\"utf-8\") as writer:\n for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):\n if index != token_index:\n logger.warning(\"Saving vocabulary to {}: vocabulary indices are not consecutive.\"\n \" Please check that the vocabulary is not corrupted!\".format(vocab_file))\n index = token_index\n writer.write(token + u'\\n')\n index += 1\n return vocab_file"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):\n if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:\n vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]\n if 
'-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True):\n logger.warning(\"The pre-trained model you are loading is a cased model but you have not set \"\n \"`do_lower_case` to False. We are setting `do_lower_case=False` for you but \"\n \"you may want to check this behavior.\")\n kwargs['do_lower_case'] = False\n elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True):\n logger.warning(\"The pre-trained model you are loading is an uncased model but you have set \"\n \"`do_lower_case` to False. We are setting `do_lower_case=True` for you \"\n \"but you may want to check this behavior.\")\n kwargs['do_lower_case'] = True\n else:\n vocab_file = pretrained_model_name_or_path\n if os.path.isdir(vocab_file):\n vocab_file = os.path.join(vocab_file, VOCAB_NAME)\n # redirect to the cache, if necessary\n try:\n resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)\n except EnvironmentError:\n logger.error(\n \"Model name '{}' was not found in model name list ({}). 
\"\n \"We assumed '{}' was a path or url but couldn't find any file \"\n \"associated to this path or url.\".format(\n pretrained_model_name_or_path,\n ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),\n vocab_file))\n return None\n if resolved_vocab_file == vocab_file:\n logger.info(\"loading vocabulary file {}\".format(vocab_file))\n else:\n logger.info(\"loading vocabulary file {} from cache at {}\".format(\n vocab_file, resolved_vocab_file))\n if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:\n # if we're using a pretrained model, ensure the tokenizer wont index sequences longer\n # than the number of positional embeddings\n max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]\n kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)\n # Instantiate tokenizer.\n tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)\n return tokenizer", "response": "Instantiate a PreTrainedBertModel from a pre - trained model file."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef tokenize(self, text):\n text = self._clean_text(text)\n # This was added on November 1st, 2018 for the multilingual and Chinese\n # models. 
This is also applied to the English models now, but it doesn't\n # matter since the English models were not trained on any Chinese data\n # and generally don't have any Chinese data in them (there are Chinese\n # characters in the vocabulary because Wikipedia does have some Chinese\n # words in the English Wikipedia.).\n text = self._tokenize_chinese_chars(text)\n orig_tokens = whitespace_tokenize(text)\n split_tokens = []\n for token in orig_tokens:\n if self.do_lower_case and token not in self.never_split:\n token = token.lower()\n token = self._run_strip_accents(token)\n split_tokens.extend(self._run_split_on_punc(token))\n\n output_tokens = whitespace_tokenize(\" \".join(split_tokens))\n return output_tokens", "response": "Tokenizes a piece of text."} {"instruction": "Given the following Python 3 function, write the documentation\ndef _run_strip_accents(self, text):\n text = unicodedata.normalize(\"NFD\", text)\n output = []\n for char in text:\n cat = unicodedata.category(char)\n if cat == \"Mn\":\n continue\n output.append(char)\n return \"\".join(output)", "response": "Strips accents from a piece of text."} {"instruction": "Can you write a function in Python 3 where it\nadds whitespace around any CJK character.", "response": "def _tokenize_chinese_chars(self, text):\n \"\"\"Adds whitespace around any CJK character.\"\"\"\n output = []\n for char in text:\n cp = ord(char)\n if self._is_chinese_char(cp):\n output.append(\" \")\n output.append(char)\n output.append(\" \")\n else:\n output.append(char)\n return \"\".join(output)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _is_chinese_char(self, cp):\n # This defines a \"chinese character\" as anything in the CJK Unicode block:\n # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)\n #\n # Note that the CJK Unicode block is NOT all Japanese and Korean characters,\n # despite its name. 
The modern Korean Hangul alphabet is a different block,\n # as is Japanese Hiragana and Katakana. Those alphabets are used to write\n # space-separated words, so they are not treated specially and handled\n # like the all of the other languages.\n if ((cp >= 0x4E00 and cp <= 0x9FFF) or #\n (cp >= 0x3400 and cp <= 0x4DBF) or #\n (cp >= 0x20000 and cp <= 0x2A6DF) or #\n (cp >= 0x2A700 and cp <= 0x2B73F) or #\n (cp >= 0x2B740 and cp <= 0x2B81F) or #\n (cp >= 0x2B820 and cp <= 0x2CEAF) or\n (cp >= 0xF900 and cp <= 0xFAFF) or #\n (cp >= 0x2F800 and cp <= 0x2FA1F)): #\n return True\n\n return False", "response": "Checks whether the codepoint of a CJK character is a Chinese character."} {"instruction": "Make a summary of the following Python 3 code\ndef tokenize(self, text):\n\n output_tokens = []\n for token in whitespace_tokenize(text):\n chars = list(token)\n if len(chars) > self.max_input_chars_per_word:\n output_tokens.append(self.unk_token)\n continue\n\n is_bad = False\n start = 0\n sub_tokens = []\n while start < len(chars):\n end = len(chars)\n cur_substr = None\n while start < end:\n substr = \"\".join(chars[start:end])\n if start > 0:\n substr = \"##\" + substr\n if substr in self.vocab:\n cur_substr = substr\n break\n end -= 1\n if cur_substr is None:\n is_bad = True\n break\n sub_tokens.append(cur_substr)\n start = end\n\n if is_bad:\n output_tokens.append(self.unk_token)\n else:\n output_tokens.extend(sub_tokens)\n return output_tokens", "response": "Tokenizes a piece of text into its word pieces."} {"instruction": "Write a Python 3 script for\nloading the ROCSTORY dataset.", "response": "def load_rocstories_dataset(dataset_path):\n \"\"\" Output a list of tuples(story, 1st continuation, 2nd continuation, label) \"\"\"\n with open(dataset_path, encoding='utf_8') as f:\n f = csv.reader(f)\n output = []\n next(f) # skip the first line\n for line in tqdm(f):\n output.append((' '.join(line[1:5]), line[5], line[6], int(line[-1])-1))\n return output"} 
{"instruction": "Explain what the following Python 3 code does\ndef random_word(tokens, tokenizer):\n output_label = []\n\n for i, token in enumerate(tokens):\n prob = random.random()\n # mask token with 15% probability\n if prob < 0.15:\n prob /= 0.15\n\n # 80% randomly change token to mask token\n if prob < 0.8:\n tokens[i] = \"[MASK]\"\n\n # 10% randomly change token to random token\n elif prob < 0.9:\n tokens[i] = random.choice(list(tokenizer.vocab.items()))[0]\n\n # -> rest 10% randomly keep current token\n\n # append current token to output (we will predict these later)\n try:\n output_label.append(tokenizer.vocab[token])\n except KeyError:\n # For unknown words (should not occur with BPE vocab)\n output_label.append(tokenizer.vocab[\"[UNK]\"])\n logger.warning(\"Cannot find token '{}' in vocab. Using [UNK] insetad\".format(token))\n else:\n # no masking token (will be ignored by loss function later)\n output_label.append(-1)\n\n return tokens, output_label", "response": "Mask some random tokens for Language Model task with probabilities as in the original BERT paper."} {"instruction": "Can you implement a function in Python 3 that\nconverts a raw sample into a proper training sample with sequence pairs LM labels input_mask CLS and SEP tokens etc.", "response": "def convert_example_to_features(example, max_seq_length, tokenizer):\n \"\"\"\n Convert a raw sample (pair of sentences as tokenized strings) into a proper training sample with\n IDs, LM labels, input_mask, CLS and SEP tokens etc.\n :param example: InputExample, containing sentence input as strings and is_next label\n :param max_seq_length: int, maximum length of sequence.\n :param tokenizer: Tokenizer\n :return: InputFeatures, containing all inputs and labels of one sample as IDs (as used for model training)\n \"\"\"\n tokens_a = example.tokens_a\n tokens_b = example.tokens_b\n # Modifies `tokens_a` and `tokens_b` in place so that the total\n # length is less than the specified length.\n # Account 
for [CLS], [SEP], [SEP] with \"- 3\"\n _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)\n\n tokens_a, t1_label = random_word(tokens_a, tokenizer)\n tokens_b, t2_label = random_word(tokens_b, tokenizer)\n # concatenate lm labels and account for CLS, SEP, SEP\n lm_label_ids = ([-1] + t1_label + [-1] + t2_label + [-1])\n\n # The convention in BERT is:\n # (a) For sequence pairs:\n # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]\n # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1\n # (b) For single sequences:\n # tokens: [CLS] the dog is hairy . [SEP]\n # type_ids: 0 0 0 0 0 0 0\n #\n # Where \"type_ids\" are used to indicate whether this is the first\n # sequence or the second sequence. The embedding vectors for `type=0` and\n # `type=1` were learned during pre-training and are added to the wordpiece\n # embedding vector (and position vector). This is not *strictly* necessary\n # since the [SEP] token unambigiously separates the sequences, but it makes\n # it easier for the model to learn the concept of sequences.\n #\n # For classification tasks, the first vector (corresponding to [CLS]) is\n # used as as the \"sentence vector\". Note that this only makes sense because\n # the entire model is fine-tuned.\n tokens = []\n segment_ids = []\n tokens.append(\"[CLS]\")\n segment_ids.append(0)\n for token in tokens_a:\n tokens.append(token)\n segment_ids.append(0)\n tokens.append(\"[SEP]\")\n segment_ids.append(0)\n\n assert len(tokens_b) > 0\n for token in tokens_b:\n tokens.append(token)\n segment_ids.append(1)\n tokens.append(\"[SEP]\")\n segment_ids.append(1)\n\n input_ids = tokenizer.convert_tokens_to_ids(tokens)\n\n # The mask has 1 for real tokens and 0 for padding tokens. 
Only real\n # tokens are attended to.\n input_mask = [1] * len(input_ids)\n\n # Zero-pad up to the sequence length.\n while len(input_ids) < max_seq_length:\n input_ids.append(0)\n input_mask.append(0)\n segment_ids.append(0)\n lm_label_ids.append(-1)\n\n assert len(input_ids) == max_seq_length\n assert len(input_mask) == max_seq_length\n assert len(segment_ids) == max_seq_length\n assert len(lm_label_ids) == max_seq_length\n\n if example.guid < 5:\n logger.info(\"*** Example ***\")\n logger.info(\"guid: %s\" % (example.guid))\n logger.info(\"tokens: %s\" % \" \".join(\n [str(x) for x in tokens]))\n logger.info(\"input_ids: %s\" % \" \".join([str(x) for x in input_ids]))\n logger.info(\"input_mask: %s\" % \" \".join([str(x) for x in input_mask]))\n logger.info(\n \"segment_ids: %s\" % \" \".join([str(x) for x in segment_ids]))\n logger.info(\"LM label: %s \" % (lm_label_ids))\n logger.info(\"Is next sentence label: %s \" % (example.is_next))\n\n features = InputFeatures(input_ids=input_ids,\n input_mask=input_mask,\n segment_ids=segment_ids,\n lm_label_ids=lm_label_ids,\n is_next=example.is_next)\n return features"} {"instruction": "Create a Python 3 function to\nget one sample from corpus consisting of two sentences.", "response": "def random_sent(self, index):\n \"\"\"\n Get one sample from corpus consisting of two sentences. With prob. 50% these are two subsequent sentences\n from one doc. 
With 50% the second sentence will be a random one from another doc.\n :param index: int, index of sample.\n :return: (str, str, int), sentence 1, sentence 2, isNextSentence Label\n \"\"\"\n t1, t2 = self.get_corpus_line(index)\n if random.random() > 0.5:\n label = 0\n else:\n t2 = self.get_random_line()\n label = 1\n\n assert len(t1) > 0\n assert len(t2) > 0\n return t1, t2, label"} {"instruction": "Can you implement a function in Python 3 that\ngets one sample from corpus consisting of a pair of two subsequent sentences from the same doc.", "response": "def get_corpus_line(self, item):\n \"\"\"\n Get one sample from corpus consisting of a pair of two subsequent lines from the same doc.\n :param item: int, index of sample.\n :return: (str, str), two subsequent sentences from corpus\n \"\"\"\n t1 = \"\"\n t2 = \"\"\n assert item < self.corpus_lines\n if self.on_memory:\n sample = self.sample_to_doc[item]\n t1 = self.all_docs[sample[\"doc_id\"]][sample[\"line\"]]\n t2 = self.all_docs[sample[\"doc_id\"]][sample[\"line\"]+1]\n # used later to avoid random nextSentence from same doc\n self.current_doc = sample[\"doc_id\"]\n return t1, t2\n else:\n if self.line_buffer is None:\n # read first non-empty line of file\n while t1 == \"\" :\n t1 = next(self.file).strip()\n t2 = next(self.file).strip()\n else:\n # use t2 from previous iteration as new t1\n t1 = self.line_buffer\n t2 = next(self.file).strip()\n # skip empty rows that are used for separating documents and keep track of current doc id\n while t2 == \"\" or t1 == \"\":\n t1 = next(self.file).strip()\n t2 = next(self.file).strip()\n self.current_doc = self.current_doc+1\n self.line_buffer = t2\n\n assert t1 != \"\"\n assert t2 != \"\"\n return t1, t2"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef get_random_line(self):\n # Similar to original tf repo: This outer loop should rarely go for more than one iteration for large\n # corpora. 
However, just to be careful, we try to make sure that\n # the random document is not the same as the document we're processing.\n for _ in range(10):\n if self.on_memory:\n rand_doc_idx = random.randint(0, len(self.all_docs)-1)\n rand_doc = self.all_docs[rand_doc_idx]\n line = rand_doc[random.randrange(len(rand_doc))]\n else:\n rand_index = random.randint(1, self.corpus_lines if self.corpus_lines < 1000 else 1000)\n #pick random line\n for _ in range(rand_index):\n line = self.get_next_line()\n #check if our picked random line is really from another doc like we want it to be\n if self.current_random_doc != self.current_doc:\n break\n return line", "response": "Get a random line from another document for nextSentence task."} {"instruction": "Create a Python 3 function for\ngetting next line of random_file and starts over when reaching end of file", "response": "def get_next_line(self):\n \"\"\" Gets next line of random_file and starts over when reaching end of file\"\"\"\n try:\n line = next(self.random_file).strip()\n #keep track of which document we are currently looking at to later avoid having the same doc as t1\n if line == \"\":\n self.current_random_doc = self.current_random_doc + 1\n line = next(self.random_file).strip()\n except StopIteration:\n self.random_file.close()\n self.random_file = open(self.corpus_path, \"r\", encoding=self.encoding)\n line = next(self.random_file).strip()\n return line"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, vocab_list):\n cand_indices = []\n for (i, token) in enumerate(tokens):\n if token == \"[CLS]\" or token == \"[SEP]\":\n continue\n cand_indices.append(i)\n\n num_to_mask = min(max_predictions_per_seq,\n max(1, int(round(len(tokens) * masked_lm_prob))))\n shuffle(cand_indices)\n mask_indices = sorted(sample(cand_indices, num_to_mask))\n masked_token_labels = []\n for index in mask_indices:\n # 80% of the 
time, replace with [MASK]\n if random() < 0.8:\n masked_token = \"[MASK]\"\n else:\n # 10% of the time, keep original\n if random() < 0.5:\n masked_token = tokens[index]\n # 10% of the time, replace with random word\n else:\n masked_token = choice(vocab_list)\n masked_token_labels.append(tokens[index])\n # Once we've saved the true label for that token, we can overwrite it with the masked version\n tokens[index] = masked_token\n\n return tokens, mask_indices, masked_token_labels", "response": "Creates the predictions for the masked LM objective."} {"instruction": "Explain what the following Python 3 code does\ndef sample_logits(embedding, bias, labels, inputs, sampler):\n true_log_probs, samp_log_probs, neg_samples = sampler.sample(labels)\n n_sample = neg_samples.size(0)\n b1, b2 = labels.size(0), labels.size(1)\n all_ids = torch.cat([labels.view(-1), neg_samples])\n all_w = embedding(all_ids)\n true_w = all_w[: -n_sample].view(b1, b2, -1)\n sample_w = all_w[- n_sample:].view(n_sample, -1)\n\n all_b = bias[all_ids]\n true_b = all_b[: -n_sample].view(b1, b2)\n sample_b = all_b[- n_sample:]\n\n hit = (labels[:, :, None] == neg_samples).detach()\n\n true_logits = torch.einsum('ijk,ijk->ij',\n [true_w, inputs]) + true_b - true_log_probs\n sample_logits = torch.einsum('lk,ijk->ijl',\n [sample_w, inputs]) + sample_b - samp_log_probs\n sample_logits.masked_fill_(hit, -1e30)\n logits = torch.cat([true_logits[:, :, None], sample_logits], -1)\n\n return logits", "response": "Sample logits from the embedding layer."} {"instruction": "Make a summary of the following Python 3 code\ndef forward(self, hidden, target=None, keep_order=False):\n '''\n Params:\n hidden :: [len*bsz x d_proj]\n target :: [len*bsz]\n Return:\n if target is None:\n out :: [len*bsz] Negative log likelihood\n else:\n out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary\n We could replace this implementation by the native PyTorch one\n if their's had an option to set bias on all 
clusters in the native one.\n here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138\n '''\n\n if target is not None:\n target = target.view(-1)\n if hidden.size(0) != target.size(0):\n raise RuntimeError('Input and target should have the same size '\n 'in the batch dimension.')\n\n if self.n_clusters == 0:\n logit = self._compute_logit(hidden, self.out_layers[0].weight,\n self.out_layers[0].bias, self.out_projs[0])\n if target is not None:\n output = -F.log_softmax(logit, dim=-1) \\\n .gather(1, target.unsqueeze(1)).squeeze(1)\n else:\n output = F.log_softmax(logit, dim=-1)\n else:\n # construct weights and biases\n weights, biases = [], []\n for i in range(len(self.cutoffs)):\n if self.div_val == 1:\n l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]\n weight_i = self.out_layers[0].weight[l_idx:r_idx]\n bias_i = self.out_layers[0].bias[l_idx:r_idx]\n else:\n weight_i = self.out_layers[i].weight\n bias_i = self.out_layers[i].bias\n\n if i == 0:\n weight_i = torch.cat(\n [weight_i, self.cluster_weight], dim=0)\n bias_i = torch.cat(\n [bias_i, self.cluster_bias], dim=0)\n\n weights.append(weight_i)\n biases.append(bias_i)\n\n head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]\n\n head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)\n head_logprob = F.log_softmax(head_logit, dim=1)\n\n if target is None:\n out = hidden.new_empty((head_logit.size(0), self.n_token))\n else:\n out = torch.zeros_like(target, dtype=hidden.dtype, device=hidden.device)\n\n offset = 0\n cutoff_values = [0] + self.cutoffs\n for i in range(len(cutoff_values) - 1):\n l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1]\n\n if target is not None:\n mask_i = (target >= l_idx) & (target < r_idx)\n indices_i = mask_i.nonzero().squeeze()\n\n if indices_i.numel() == 0:\n continue\n\n target_i = target.index_select(0, indices_i) - l_idx\n head_logprob_i = 
head_logprob.index_select(0, indices_i)\n hidden_i = hidden.index_select(0, indices_i)\n else:\n hidden_i = hidden\n\n if i == 0:\n if target is not None:\n logprob_i = head_logprob_i.gather(1, target_i[:, None]).squeeze(1)\n else:\n out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]]\n else:\n weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]\n\n tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i)\n tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)\n cluster_prob_idx = self.cutoffs[0] + i - 1 # No probability for the head cluster\n if target is not None:\n logprob_i = head_logprob_i[:, cluster_prob_idx] \\\n + tail_logprob_i.gather(1, target_i[:, None]).squeeze(1)\n else:\n logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i\n out[:, l_idx:r_idx] = logprob_i\n\n if target is not None:\n if (hasattr(self, 'keep_order') and self.keep_order) or keep_order:\n out.index_copy_(0, indices_i, -logprob_i)\n else:\n out[offset:offset+logprob_i.size(0)].copy_(-logprob_i)\n offset += logprob_i.size(0)\n\n return out", "response": "Forward computation of the log likelihood of the input and target."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef sample(self, labels):\n\n # neg_samples = torch.empty(0).long()\n n_sample = self.n_sample\n n_tries = 2 * n_sample\n\n with torch.no_grad():\n neg_samples = torch.multinomial(self.dist, n_tries, replacement=True).unique()\n device = labels.device\n neg_samples = neg_samples.to(device)\n true_log_probs = self.log_q[labels].to(device)\n samp_log_probs = self.log_q[neg_samples].to(device)\n return true_log_probs, samp_log_probs, neg_samples", "response": "Sample from the log - probability distribution of the cluster."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef build_tf_to_pytorch_map(model, config):\n tf_to_pt_map = {}\n\n if hasattr(model, 'transformer'):\n # We are loading in a TransfoXLLMHeadModel 
=> we will load also the Adaptive Softmax\n tf_to_pt_map.update({\n \"transformer/adaptive_softmax/cutoff_0/cluster_W\": model.crit.cluster_weight,\n \"transformer/adaptive_softmax/cutoff_0/cluster_b\": model.crit.cluster_bias})\n for i, (out_l, proj_l, tie_proj) in enumerate(zip(\n model.crit.out_layers,\n model.crit.out_projs,\n config.tie_projs)):\n layer_str = \"transformer/adaptive_softmax/cutoff_%d/\" % i\n if config.tie_weight:\n tf_to_pt_map.update({\n layer_str + 'b': out_l.bias})\n else:\n raise NotImplementedError\n # I don't think this is implemented in the TF code\n tf_to_pt_map.update({\n layer_str + 'lookup_table': out_l.weight,\n layer_str + 'b': out_l.bias})\n if not tie_proj:\n tf_to_pt_map.update({\n layer_str + 'proj': proj_l\n })\n # Now load the rest of the transformer\n model = model.transformer\n\n # Embeddings\n for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)):\n layer_str = \"transformer/adaptive_embed/cutoff_%d/\" % i\n tf_to_pt_map.update({\n layer_str + 'lookup_table': embed_l.weight,\n layer_str + 'proj_W': proj_l\n })\n\n # Transformer blocks\n for i, b in enumerate(model.layers):\n layer_str = \"transformer/layer_%d/\" % i\n tf_to_pt_map.update({\n layer_str + \"rel_attn/LayerNorm/gamma\": b.dec_attn.layer_norm.weight,\n layer_str + \"rel_attn/LayerNorm/beta\": b.dec_attn.layer_norm.bias,\n layer_str + \"rel_attn/o/kernel\": b.dec_attn.o_net.weight,\n layer_str + \"rel_attn/qkv/kernel\": b.dec_attn.qkv_net.weight,\n layer_str + \"rel_attn/r/kernel\": b.dec_attn.r_net.weight,\n layer_str + \"ff/LayerNorm/gamma\": b.pos_ff.layer_norm.weight,\n layer_str + \"ff/LayerNorm/beta\": b.pos_ff.layer_norm.bias,\n layer_str + \"ff/layer_1/kernel\": b.pos_ff.CoreNet[0].weight,\n layer_str + \"ff/layer_1/bias\": b.pos_ff.CoreNet[0].bias,\n layer_str + \"ff/layer_2/kernel\": b.pos_ff.CoreNet[3].weight,\n layer_str + \"ff/layer_2/bias\": b.pos_ff.CoreNet[3].bias,\n })\n\n # Relative positioning 
biases\n if config.untie_r:\n r_r_list = []\n r_w_list = []\n for b in model.layers:\n r_r_list.append(b.dec_attn.r_r_bias)\n r_w_list.append(b.dec_attn.r_w_bias)\n else:\n r_r_list = [model.r_r_bias]\n r_w_list = [model.r_w_bias]\n tf_to_pt_map.update({\n 'transformer/r_r_bias': r_r_list,\n 'transformer/r_w_bias': r_w_list})\n return tf_to_pt_map", "response": "Build a map of modules from TF to PyTorch."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef load_tf_weights_in_transfo_xl(model, config, tf_path):\n try:\n import numpy as np\n import tensorflow as tf\n except ImportError:\n print(\"Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see \"\n \"https://www.tensorflow.org/install/ for installation instructions.\")\n raise\n # Build TF to PyTorch weights loading map\n tf_to_pt_map = build_tf_to_pytorch_map(model, config)\n\n # Load weights from TF model\n init_vars = tf.train.list_variables(tf_path)\n tf_weights = {}\n for name, shape in init_vars:\n print(\"Loading TF weight {} with shape {}\".format(name, shape))\n array = tf.train.load_variable(tf_path, name)\n tf_weights[name] = array\n\n for name, pointer in tf_to_pt_map.items():\n assert name in tf_weights\n array = tf_weights[name]\n # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v\n # which are not required for using pretrained model\n if 'kernel' in name or 'proj' in name:\n array = np.transpose(array)\n if ('r_r_bias' in name or 'r_w_bias' in name) and len(pointer) > 1:\n # Here we will split the TF weigths\n assert len(pointer) == array.shape[0]\n for i, p_i in enumerate(pointer):\n arr_i = array[i, ...]\n try:\n assert p_i.shape == arr_i.shape\n except AssertionError as e:\n e.args += (p_i.shape, arr_i.shape)\n raise\n print(\"Initialize PyTorch weight {} for layer {}\".format(name, i))\n p_i.data = torch.from_numpy(arr_i)\n else:\n try:\n assert pointer.shape == array.shape\n except 
AssertionError as e:\n e.args += (pointer.shape, array.shape)\n raise\n print(\"Initialize PyTorch weight {}\".format(name))\n pointer.data = torch.from_numpy(array)\n tf_weights.pop(name, None)\n tf_weights.pop(name + '/Adam', None)\n tf_weights.pop(name + '/Adam_1', None)\n\n print(\"Weights not copied to PyTorch model: {}\".format(', '.join(tf_weights.keys())))\n return model", "response": "Loads weights from a TensorFlow model and creates a PyTorch model with the weights."} {"instruction": "Make a summary of the following Python 3 code\ndef forward(self, input_ids, mems=None):\n # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library\n # so we transpose here from shape [bsz, len] to shape [len, bsz]\n input_ids = input_ids.transpose(0, 1).contiguous()\n\n if mems is None:\n mems = self.init_mems(input_ids)\n last_hidden, new_mems = self._forward(input_ids, mems=mems)\n\n # We transpose back here to shape [bsz, len, hidden_dim]\n last_hidden = last_hidden.transpose(0, 1).contiguous()\n return (last_hidden, new_mems)", "response": "Forward method for the internal forwar model."} {"instruction": "How would you explain what the following Python 3 function does\ndef tie_weights(self):\n # sampled softmax\n if self.sample_softmax > 0:\n if self.config.tie_weight:\n self.out_layer.weight = self.transformer.word_emb.weight\n # adaptive softmax (including standard softmax)\n else:\n if self.config.tie_weight:\n for i in range(len(self.crit.out_layers)):\n self.crit.out_layers[i].weight = self.transformer.word_emb.emb_layers[i].weight\n if self.config.tie_projs:\n for i, tie_proj in enumerate(self.config.tie_projs):\n if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed:\n self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0]\n elif tie_proj and self.config.div_val != 1:\n self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i]", "response": "Tie weights of output 
and input."} {"instruction": "Can you implement a function in Python 3 that\nforwards computation of the model.", "response": "def forward(self, input_ids, target=None, mems=None):\n \"\"\" Params:\n input_ids :: [bsz, len]\n target :: [bsz, len]\n Returns:\n tuple(softmax_output, new_mems) where:\n new_mems: list (num layers) of hidden states at the entry of each layer\n shape :: [mem_len, bsz, self.config.d_model] :: Warning: shapes are transposed here w. regards to input_ids\n softmax_output: output of the (adaptive) softmax:\n if target is None:\n Negative log likelihood of shape :: [bsz, len] \n else:\n log probabilities of tokens, shape :: [bsz, len, n_tokens]\n \"\"\"\n bsz = input_ids.size(0)\n tgt_len = input_ids.size(1)\n\n last_hidden, new_mems = self.transformer(input_ids, mems)\n\n pred_hid = last_hidden[:, -tgt_len:]\n if self.sample_softmax > 0 and self.training:\n assert self.config.tie_weight\n logit = sample_logits(self.transformer.word_emb, self.out_layer.bias, target, pred_hid, self.sampler)\n softmax_output = -F.log_softmax(logit, -1)[:, :, 0]\n else:\n softmax_output = self.crit(pred_hid.view(-1, pred_hid.size(-1)), target)\n if target is None:\n softmax_output = softmax_output.view(bsz, tgt_len, -1)\n else:\n softmax_output = softmax_output.view(bsz, tgt_len)\n\n # We transpose back\n return (softmax_output, new_mems)"} {"instruction": "Write a Python 3 function that can\nreturn a DateOffset object from string or tuple representation of a frequency object.", "response": "def to_offset(freq):\n \"\"\"\n Return DateOffset object from string or tuple representation\n or datetime.timedelta object\n\n Parameters\n ----------\n freq : str, tuple, datetime.timedelta, DateOffset or None\n\n Returns\n -------\n DateOffset\n None if freq is None.\n\n Raises\n ------\n ValueError\n If freq is an invalid frequency\n\n See Also\n --------\n DateOffset\n\n Examples\n --------\n >>> to_offset('5min')\n <5 * Minutes>\n\n >>> to_offset('1D1H')\n <25 * 
Hours>\n\n >>> to_offset(('W', 2))\n <2 * Weeks: weekday=6>\n\n >>> to_offset((2, 'B'))\n <2 * BusinessDays>\n\n >>> to_offset(datetime.timedelta(days=1))\n \n\n >>> to_offset(Hour())\n \n \"\"\"\n if freq is None:\n return None\n\n if isinstance(freq, DateOffset):\n return freq\n\n if isinstance(freq, tuple):\n name = freq[0]\n stride = freq[1]\n if isinstance(stride, str):\n name, stride = stride, name\n name, _ = libfreqs._base_and_stride(name)\n delta = get_offset(name) * stride\n\n elif isinstance(freq, timedelta):\n delta = None\n freq = Timedelta(freq)\n try:\n for name in freq.components._fields:\n offset = _name_to_offset_map[name]\n stride = getattr(freq.components, name)\n if stride != 0:\n offset = stride * offset\n if delta is None:\n delta = offset\n else:\n delta = delta + offset\n except Exception:\n raise ValueError(libfreqs.INVALID_FREQ_ERR_MSG.format(freq))\n\n else:\n delta = None\n stride_sign = None\n try:\n splitted = re.split(libfreqs.opattern, freq)\n if splitted[-1] != '' and not splitted[-1].isspace():\n # the last element must be blank\n raise ValueError('last element must be blank')\n for sep, stride, name in zip(splitted[0::4], splitted[1::4],\n splitted[2::4]):\n if sep != '' and not sep.isspace():\n raise ValueError('separator must be spaces')\n prefix = libfreqs._lite_rule_alias.get(name) or name\n if stride_sign is None:\n stride_sign = -1 if stride.startswith('-') else 1\n if not stride:\n stride = 1\n if prefix in Resolution._reso_str_bump_map.keys():\n stride, name = Resolution.get_stride_from_decimal(\n float(stride), prefix\n )\n stride = int(stride)\n offset = get_offset(name)\n offset = offset * int(np.fabs(stride) * stride_sign)\n if delta is None:\n delta = offset\n else:\n delta = delta + offset\n except Exception:\n raise ValueError(libfreqs.INVALID_FREQ_ERR_MSG.format(freq))\n\n if delta is None:\n raise ValueError(libfreqs.INVALID_FREQ_ERR_MSG.format(freq))\n\n return delta"} {"instruction": "How would you explain what 
the following Python 3 function does\ndef get_offset(name):\n if name not in libfreqs._dont_uppercase:\n name = name.upper()\n name = libfreqs._lite_rule_alias.get(name, name)\n name = libfreqs._lite_rule_alias.get(name.lower(), name)\n else:\n name = libfreqs._lite_rule_alias.get(name, name)\n\n if name not in _offset_map:\n try:\n split = name.split('-')\n klass = prefix_mapping[split[0]]\n # handles case where there's no suffix (and will TypeError if too\n # many '-')\n offset = klass._from_name(*split[1:])\n except (ValueError, TypeError, KeyError):\n # bad prefix or suffix\n raise ValueError(libfreqs.INVALID_FREQ_ERR_MSG.format(name))\n # cache\n _offset_map[name] = offset\n\n return _offset_map[name]", "response": "Returns DateOffset object associated with rule name."} {"instruction": "Explain what the following Python 3 code does\ndef infer_freq(index, warn=True):\n import pandas as pd\n\n if isinstance(index, ABCSeries):\n values = index._values\n if not (is_datetime64_dtype(values) or\n is_timedelta64_dtype(values) or\n values.dtype == object):\n raise TypeError(\"cannot infer freq from a non-convertible dtype \"\n \"on a Series of {dtype}\".format(dtype=index.dtype))\n index = values\n\n if is_period_arraylike(index):\n raise TypeError(\"PeriodIndex given. 
Check the `freq` attribute \"\n \"instead of using infer_freq.\")\n elif is_timedelta64_dtype(index):\n # Allow TimedeltaIndex and TimedeltaArray\n inferer = _TimedeltaFrequencyInferer(index, warn=warn)\n return inferer.get_freq()\n\n if isinstance(index, pd.Index) and not isinstance(index, pd.DatetimeIndex):\n if isinstance(index, (pd.Int64Index, pd.Float64Index)):\n raise TypeError(\"cannot infer freq from a non-convertible index \"\n \"type {type}\".format(type=type(index)))\n index = index.values\n\n if not isinstance(index, pd.DatetimeIndex):\n try:\n index = pd.DatetimeIndex(index)\n except AmbiguousTimeError:\n index = pd.DatetimeIndex(index.asi8)\n\n inferer = _FrequencyInferer(index, warn=warn)\n return inferer.get_freq()", "response": "Infer the most likely frequency given the input index."} {"instruction": "Implement a Python 3 function for\nreturning the appropriate frequency string to describe the inferred .", "response": "def get_freq(self):\n \"\"\"\n Find the appropriate frequency string to describe the inferred\n frequency of self.values\n\n Returns\n -------\n str or None\n \"\"\"\n if not self.is_monotonic or not self.index._is_unique:\n return None\n\n delta = self.deltas[0]\n if _is_multiple(delta, _ONE_DAY):\n return self._infer_daily_rule()\n\n # Business hourly, maybe. 17: one day / 65: one weekend\n if self.hour_deltas in ([1, 17], [1, 65], [1, 17, 65]):\n return 'BH'\n # Possibly intraday frequency. Here we use the\n # original .asi8 values as the modified values\n # will not work around DST transitions. 
See #8772\n elif not self.is_unique_asi8:\n return None\n\n delta = self.deltas_asi8[0]\n if _is_multiple(delta, _ONE_HOUR):\n # Hours\n return _maybe_add_count('H', delta / _ONE_HOUR)\n elif _is_multiple(delta, _ONE_MINUTE):\n # Minutes\n return _maybe_add_count('T', delta / _ONE_MINUTE)\n elif _is_multiple(delta, _ONE_SECOND):\n # Seconds\n return _maybe_add_count('S', delta / _ONE_SECOND)\n elif _is_multiple(delta, _ONE_MILLI):\n # Milliseconds\n return _maybe_add_count('L', delta / _ONE_MILLI)\n elif _is_multiple(delta, _ONE_MICRO):\n # Microseconds\n return _maybe_add_count('U', delta / _ONE_MICRO)\n else:\n # Nanoseconds\n return _maybe_add_count('N', delta)"} {"instruction": "Create a Python 3 function for\nloading a pickle file with a provided encoding", "response": "def load(fh, encoding=None, is_verbose=False):\n \"\"\"load a pickle, with a provided encoding\n\n if compat is True:\n fake the old class hierarchy\n if it works, then return the new type objects\n\n Parameters\n ----------\n fh : a filelike object\n encoding : an optional encoding\n is_verbose : show exception output\n \"\"\"\n\n try:\n fh.seek(0)\n if encoding is not None:\n up = Unpickler(fh, encoding=encoding)\n else:\n up = Unpickler(fh)\n up.is_verbose = is_verbose\n\n return up.load()\n except (ValueError, TypeError):\n raise"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _new_Index(cls, d):\n # required for backward compat, because PI can't be instantiated with\n # ordinals through __new__ GH #13277\n if issubclass(cls, ABCPeriodIndex):\n from pandas.core.indexes.period import _new_PeriodIndex\n return _new_PeriodIndex(cls, **d)\n return cls.__new__(cls, **d)", "response": "This is called upon unpickling rather than the default which doesn t have arguments."} {"instruction": "How would you explain what the following Python 3 function does\ndef ensure_index_from_sequences(sequences, names=None):\n from .multi import MultiIndex\n\n if 
len(sequences) == 1:\n if names is not None:\n names = names[0]\n return Index(sequences[0], name=names)\n else:\n return MultiIndex.from_arrays(sequences, names=names)", "response": "Construct an index from a list of sequences."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef ensure_index(index_like, copy=False):\n if isinstance(index_like, Index):\n if copy:\n index_like = index_like.copy()\n return index_like\n if hasattr(index_like, 'name'):\n return Index(index_like, name=index_like.name, copy=copy)\n\n if is_iterator(index_like):\n index_like = list(index_like)\n\n # must check for exactly list here because of strict type\n # check in clean_index_list\n if isinstance(index_like, list):\n if type(index_like) != list:\n index_like = list(index_like)\n\n converted, all_arrays = lib.clean_index_list(index_like)\n\n if len(converted) > 0 and all_arrays:\n from .multi import MultiIndex\n return MultiIndex.from_arrays(converted)\n else:\n index_like = converted\n else:\n # clean_index_list does the equivalent of copying\n # so only need to do this if not list instance\n if copy:\n from copy import copy\n index_like = copy(index_like)\n\n return Index(index_like)", "response": "Ensures that we have an index from some index - like object."} {"instruction": "How would you explain what the following Python 3 function does\ndef _trim_front(strings):\n trimmed = strings\n while len(strings) > 0 and all(x[0] == ' ' for x in trimmed):\n trimmed = [x[1:] for x in trimmed]\n return trimmed", "response": "Trims zeros and decimal points."} {"instruction": "Write a Python 3 function that can\ncreate a new object of the same class with the passed values.", "response": "def _simple_new(cls, values, name=None, dtype=None, **kwargs):\n \"\"\"\n We require that we have a dtype compat for the values. 
If we are passed\n a non-dtype compat, then coerce using the constructor.\n\n Must be careful not to recurse.\n \"\"\"\n if not hasattr(values, 'dtype'):\n if (values is None or not len(values)) and dtype is not None:\n values = np.empty(0, dtype=dtype)\n else:\n values = np.array(values, copy=False)\n if is_object_dtype(values):\n values = cls(values, name=name, dtype=dtype,\n **kwargs)._ndarray_values\n\n if isinstance(values, (ABCSeries, ABCIndexClass)):\n # Index._data must always be an ndarray.\n # This is no-copy for when _values is an ndarray,\n # which should be always at this point.\n values = np.asarray(values._values)\n\n result = object.__new__(cls)\n result._data = values\n # _index_data is a (temporary?) fix to ensure that the direct data\n # manipulation we do in `_libs/reduction.pyx` continues to work.\n # We need access to the actual ndarray, since we're messing with\n # data buffers and strides. We don't re-use `_ndarray_values`, since\n # we actually set this value too.\n result._index_data = values\n result.name = name\n for k, v in kwargs.items():\n setattr(result, k, v)\n return result._reset_identity()"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _shallow_copy_with_infer(self, values, **kwargs):\n attributes = self._get_attributes_dict()\n attributes.update(kwargs)\n attributes['copy'] = False\n if not len(values) and 'dtype' not in kwargs:\n attributes['dtype'] = self.dtype\n if self._infer_as_myclass:\n try:\n return self._constructor(values, **attributes)\n except (TypeError, ValueError):\n pass\n return Index(values, **attributes)", "response": "Create a new Index with the passed values and infer the class with passed values."} {"instruction": "Write a Python 3 script for\nreturning True if this object is the same underlying data as other.", "response": "def is_(self, other):\n \"\"\"\n More flexible, faster check like ``is`` but that works through views.\n\n Note: this is *not* the same as 
``Index.identical()``, which checks\n that metadata is also the same.\n\n Parameters\n ----------\n other : object\n other object to compare against.\n\n Returns\n -------\n True if both have same underlying data, False otherwise : bool\n \"\"\"\n # use something other than None to be clearer\n return self._id is getattr(\n other, '_id', Ellipsis) and self._id is not None"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _assert_take_fillable(self, values, indices, allow_fill=True,\n fill_value=None, na_value=np.nan):\n \"\"\"\n Internal method to handle NA filling of take.\n \"\"\"\n indices = ensure_platform_int(indices)\n\n # only fill if we are passing a non-None fill_value\n if allow_fill and fill_value is not None:\n if (indices < -1).any():\n msg = ('When allow_fill=True and fill_value is not None, '\n 'all indices must be >= -1')\n raise ValueError(msg)\n taken = algos.take(values,\n indices,\n allow_fill=allow_fill,\n fill_value=na_value)\n else:\n taken = values.take(indices)\n return taken", "response": "Internal method to handle NA filling of take."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _format_data(self, name=None):\n\n # do we want to justify (only do so for non-objects)\n is_justify = not (self.inferred_type in ('string', 'unicode') or\n (self.inferred_type == 'categorical' and\n is_object_dtype(self.categories)))\n\n return format_object_summary(self, self._formatter_func,\n is_justify=is_justify, name=name)", "response": "Return the formatted data as a unicode string."} {"instruction": "Write a Python 3 script to\nrender a string representation of the Index.", "response": "def format(self, name=False, formatter=None, **kwargs):\n \"\"\"\n Render a string representation of the Index.\n \"\"\"\n header = []\n if name:\n header.append(pprint_thing(self.name,\n escape_chars=('\\t', '\\r', '\\n')) if\n self.name is not None else '')\n\n if formatter is not None:\n 
return header + list(self.map(formatter))\n\n return self._format_with_header(header, **kwargs)"} {"instruction": "Create a Python 3 function for\nreturning a list of native types for the keys in the array that are not None.", "response": "def to_native_types(self, slicer=None, **kwargs):\n \"\"\"\n Format specified values of `self` and return them.\n\n Parameters\n ----------\n slicer : int, array-like\n An indexer into `self` that specifies which values\n are used in the formatting process.\n kwargs : dict\n Options for specifying how the values should be formatted.\n These options include the following:\n\n 1) na_rep : str\n The value that serves as a placeholder for NULL values\n 2) quoting : bool or None\n Whether or not there are quoted values in `self`\n 3) date_format : str\n The format used to represent date-like values\n \"\"\"\n\n values = self\n if slicer is not None:\n values = values[slicer]\n return values._format_native_types(**kwargs)"} {"instruction": "Explain what the following Python 3 code does\ndef _format_native_types(self, na_rep='', quoting=None, **kwargs):\n mask = isna(self)\n if not self.is_object() and not quoting:\n values = np.asarray(self).astype(str)\n else:\n values = np.array(self, dtype=object, copy=True)\n\n values[mask] = na_rep\n return values", "response": "Format the native types of the index."} {"instruction": "Write a Python 3 function for\nreturning a summarized representation of the index and the head of the entry.", "response": "def _summary(self, name=None):\n \"\"\"\n Return a summarized representation.\n\n Parameters\n ----------\n name : str\n name to use in the summary representation\n\n Returns\n -------\n String with a summarized representation of the index\n \"\"\"\n if len(self) > 0:\n head = self[0]\n if hasattr(head, 'format') and not isinstance(head, str):\n head = head.format()\n tail = self[-1]\n if hasattr(tail, 'format') and not isinstance(tail, str):\n tail = tail.format()\n index_summary = ', %s to %s' 
% (pprint_thing(head),\n pprint_thing(tail))\n else:\n index_summary = ''\n\n if name is None:\n name = type(self).__name__\n return '%s: %s entries%s' % (name, len(self), index_summary)"} {"instruction": "Create a Python 3 function for\nreturning a summarized representation of the current object.", "response": "def summary(self, name=None):\n \"\"\"\n Return a summarized representation.\n\n .. deprecated:: 0.23.0\n \"\"\"\n warnings.warn(\"'summary' is deprecated and will be removed in a \"\n \"future version.\", FutureWarning, stacklevel=2)\n return self._summary(name)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef to_series(self, index=None, name=None):\n\n from pandas import Series\n\n if index is None:\n index = self._shallow_copy()\n if name is None:\n name = self.name\n\n return Series(self.values.copy(), index=index, name=name)", "response": "Create a Series with both index and values equal to the index keys\n ."} {"instruction": "Can you write a function in Python 3 where it\nconverts the index to a DataFrame.", "response": "def to_frame(self, index=True, name=None):\n \"\"\"\n Create a DataFrame with a column containing the Index.\n\n .. versionadded:: 0.24.0\n\n Parameters\n ----------\n index : boolean, default True\n Set the index of the returned DataFrame as the original Index.\n\n name : object, default None\n The passed name should substitute for the index name (if it has\n one).\n\n Returns\n -------\n DataFrame\n DataFrame containing the original Index data.\n\n See Also\n --------\n Index.to_series : Convert an Index to a Series.\n Series.to_frame : Convert Series to DataFrame.\n\n Examples\n --------\n >>> idx = pd.Index(['Ant', 'Bear', 'Cow'], name='animal')\n >>> idx.to_frame()\n animal\n animal\n Ant Ant\n Bear Bear\n Cow Cow\n\n By default, the original Index is reused. 
To enforce a new Index:\n\n >>> idx.to_frame(index=False)\n animal\n 0 Ant\n 1 Bear\n 2 Cow\n\n To override the name of the resulting column, specify `name`:\n\n >>> idx.to_frame(index=False, name='zoo')\n zoo\n 0 Ant\n 1 Bear\n 2 Cow\n \"\"\"\n\n from pandas import DataFrame\n if name is None:\n name = self.name or 0\n result = DataFrame({name: self._values.copy()})\n\n if index:\n result.index = self\n return result"} {"instruction": "How would you code a function in Python 3 to\nvalidate the names parameter.", "response": "def _validate_names(self, name=None, names=None, deep=False):\n \"\"\"\n Handles the quirks of having a singular 'name' parameter for general\n Index and plural 'names' parameter for MultiIndex.\n \"\"\"\n from copy import deepcopy\n if names is not None and name is not None:\n raise TypeError(\"Can only provide one of `names` and `name`\")\n elif names is None and name is None:\n return deepcopy(self.names) if deep else self.names\n elif names is not None:\n if not is_list_like(names):\n raise TypeError(\"Must pass list-like as `names`.\")\n return names\n else:\n if not is_list_like(name):\n return [name]\n return name"} {"instruction": "Can you implement a function in Python 3 that\nsets new names on index.", "response": "def _set_names(self, values, level=None):\n \"\"\"\n Set new names on index. Each name has to be a hashable type.\n\n Parameters\n ----------\n values : str or sequence\n name(s) to set\n level : int, level name, or sequence of int/level names (default None)\n If the index is a MultiIndex (hierarchical), level(s) to set (None\n for all levels). 
Otherwise level must be None\n\n Raises\n ------\n TypeError if each name is not hashable.\n \"\"\"\n if not is_list_like(values):\n raise ValueError('Names must be a list-like')\n if len(values) != 1:\n raise ValueError('Length of new names must be 1, got %d' %\n len(values))\n\n # GH 20527\n # All items in 'name' need to be hashable:\n for name in values:\n if not is_hashable(name):\n raise TypeError('{}.name must be a hashable type'\n .format(self.__class__.__name__))\n self.name = values[0]"} {"instruction": "Write a Python 3 script to\nset the names of the Entry in the index or MultiIndex.", "response": "def set_names(self, names, level=None, inplace=False):\n \"\"\"\n Set Index or MultiIndex name.\n\n Able to set new names partially and by level.\n\n Parameters\n ----------\n names : label or list of label\n Name(s) to set.\n level : int, label or list of int or label, optional\n If the index is a MultiIndex, level(s) to set (None for all\n levels). Otherwise level must be None.\n inplace : bool, default False\n Modifies the object directly, instead of creating a new Index or\n MultiIndex.\n\n Returns\n -------\n Index\n The same type as the caller or None if inplace is True.\n\n See Also\n --------\n Index.rename : Able to set new names without level.\n\n Examples\n --------\n >>> idx = pd.Index([1, 2, 3, 4])\n >>> idx\n Int64Index([1, 2, 3, 4], dtype='int64')\n >>> idx.set_names('quarter')\n Int64Index([1, 2, 3, 4], dtype='int64', name='quarter')\n\n >>> idx = pd.MultiIndex.from_product([['python', 'cobra'],\n ... 
[2018, 2019]])\n >>> idx\n MultiIndex(levels=[['cobra', 'python'], [2018, 2019]],\n codes=[[1, 1, 0, 0], [0, 1, 0, 1]])\n >>> idx.set_names(['kind', 'year'], inplace=True)\n >>> idx\n MultiIndex(levels=[['cobra', 'python'], [2018, 2019]],\n codes=[[1, 1, 0, 0], [0, 1, 0, 1]],\n names=['kind', 'year'])\n >>> idx.set_names('species', level=0)\n MultiIndex(levels=[['cobra', 'python'], [2018, 2019]],\n codes=[[1, 1, 0, 0], [0, 1, 0, 1]],\n names=['species', 'year'])\n \"\"\"\n\n if level is not None and not isinstance(self, ABCMultiIndex):\n raise ValueError('Level must be None for non-MultiIndex')\n\n if level is not None and not is_list_like(level) and is_list_like(\n names):\n msg = \"Names must be a string when a single level is provided.\"\n raise TypeError(msg)\n\n if not is_list_like(names) and level is None and self.nlevels > 1:\n raise TypeError(\"Must pass list-like as `names`.\")\n\n if not is_list_like(names):\n names = [names]\n if level is not None and not is_list_like(level):\n level = [level]\n\n if inplace:\n idx = self\n else:\n idx = self._shallow_copy()\n idx._set_names(names, level=level)\n if not inplace:\n return idx"} {"instruction": "Write a Python 3 script to\nalter Index or MultiIndex name. Able to set new names without level. Defaults to returning new index. Length of names must match number of levels in MultiIndex. Parameters ---------- name : label or list of labels Name(s) to set. inplace : boolean, default False Modifies the object directly, instead of creating a new Index or MultiIndex. Returns ------- Index The same type as the caller or None if inplace is True. See Also -------- Index.set_names : Able to set new names partially and by level. Examples -------- >>> idx = pd.Index(['A', 'C', 'A', 'B'], name='score') >>> idx.rename('grade') Index(['A', 'C', 'A', 'B'], dtype='object', name='grade') >>> idx = pd.MultiIndex.from_product([['python', 'cobra'], ... [2018, 2019]], ... 
names=['kind', 'year']) >>> idx MultiIndex(levels=[['cobra', 'python'], [2018, 2019]], codes=[[1, 1, 0, 0], [0, 1, 0, 1]], names=['kind', 'year']) >>> idx.rename(['species', 'year']) MultiIndex(levels=[['cobra', 'python'], [2018, 2019]], codes=[[1, 1, 0, 0], [0, 1, 0, 1]], names=['species', 'year']) >>> idx.rename('species') Traceback (most recent call last): TypeError: Must pass list-like as `names`.", "response": "def rename(self, name, inplace=False):\n \"\"\"\n Alter Index or MultiIndex name.\n\n Able to set new names without level. Defaults to returning new index.\n Length of names must match number of levels in MultiIndex.\n\n Parameters\n ----------\n name : label or list of labels\n Name(s) to set.\n inplace : boolean, default False\n Modifies the object directly, instead of creating a new Index or\n MultiIndex.\n\n Returns\n -------\n Index\n The same type as the caller or None if inplace is True.\n\n See Also\n --------\n Index.set_names : Able to set new names partially and by level.\n\n Examples\n --------\n >>> idx = pd.Index(['A', 'C', 'A', 'B'], name='score')\n >>> idx.rename('grade')\n Index(['A', 'C', 'A', 'B'], dtype='object', name='grade')\n\n >>> idx = pd.MultiIndex.from_product([['python', 'cobra'],\n ... [2018, 2019]],\n ... names=['kind', 'year'])\n >>> idx\n MultiIndex(levels=[['cobra', 'python'], [2018, 2019]],\n codes=[[1, 1, 0, 0], [0, 1, 0, 1]],\n names=['kind', 'year'])\n >>> idx.rename(['species', 'year'])\n MultiIndex(levels=[['cobra', 'python'], [2018, 2019]],\n codes=[[1, 1, 0, 0], [0, 1, 0, 1]],\n names=['species', 'year'])\n >>> idx.rename('species')\n Traceback (most recent call last):\n TypeError: Must pass list-like as `names`.\n \"\"\"\n return self.set_names([name], inplace=inplace)"} {"instruction": "Implement a Python 3 function for\nvalidating index level. 
For single-level Index getting level number is a no-op, but some verification must be done like in MultiIndex.", "response": "def _validate_index_level(self, level):\n \"\"\"\n Validate index level.\n\n For single-level Index getting level number is a no-op, but some\n verification must be done like in MultiIndex.\n\n \"\"\"\n if isinstance(level, int):\n if level < 0 and level != -1:\n raise IndexError(\"Too many levels: Index has only 1 level,\"\n \" %d is not a valid level number\" % (level, ))\n elif level > 0:\n raise IndexError(\"Too many levels:\"\n \" Index has only 1 level, not %d\" %\n (level + 1))\n elif level != self.name:\n raise KeyError('Level %s must be same as name (%s)' %\n (level, self.name))"} {"instruction": "Write a Python 3 script for\nsorting the index by level.", "response": "def sortlevel(self, level=None, ascending=True, sort_remaining=None):\n \"\"\"\n For internal compatibility with with the Index API.\n\n Sort the Index. This is for compat with MultiIndex\n\n Parameters\n ----------\n ascending : boolean, default True\n False to sort in descending order\n\n level, sort_remaining are compat parameters\n\n Returns\n -------\n Index\n \"\"\"\n return self.sort_values(return_indexer=True, ascending=ascending)"} {"instruction": "Write a Python 3 script for\nreturning index with requested level removed.", "response": "def droplevel(self, level=0):\n \"\"\"\n Return index with requested level(s) removed.\n\n If resulting index has only 1 level left, the result will be\n of Index type, not MultiIndex.\n\n .. 
versionadded:: 0.23.1 (support for non-MultiIndex)\n\n Parameters\n ----------\n level : int, str, or list-like, default 0\n If a string is given, must be the name of a level\n If list-like, elements must be names or indexes of levels.\n\n Returns\n -------\n Index or MultiIndex\n \"\"\"\n if not isinstance(level, (tuple, list)):\n level = [level]\n\n levnums = sorted(self._get_level_number(lev) for lev in level)[::-1]\n\n if len(level) == 0:\n return self\n if len(level) >= self.nlevels:\n raise ValueError(\"Cannot remove {} levels from an index with {} \"\n \"levels: at least one level must be \"\n \"left.\".format(len(level), self.nlevels))\n # The two checks above guarantee that here self is a MultiIndex\n\n new_levels = list(self.levels)\n new_codes = list(self.codes)\n new_names = list(self.names)\n\n for i in levnums:\n new_levels.pop(i)\n new_codes.pop(i)\n new_names.pop(i)\n\n if len(new_levels) == 1:\n\n # set nan if needed\n mask = new_codes[0] == -1\n result = new_levels[0].take(new_codes[0])\n if mask.any():\n result = result.putmask(mask, np.nan)\n\n result.name = new_names[0]\n return result\n else:\n from .multi import MultiIndex\n return MultiIndex(levels=new_levels, codes=new_codes,\n names=new_names, verify_integrity=False)"} {"instruction": "Write a Python 3 function for\nreturning if each value is NaN.", "response": "def _isnan(self):\n \"\"\"\n Return if each value is NaN.\n \"\"\"\n if self._can_hold_na:\n return isna(self)\n else:\n # shouldn't reach to this condition by checking hasnans beforehand\n values = np.empty(len(self), dtype=np.bool_)\n values.fill(False)\n return values"} {"instruction": "Make a summary of the following Python 3 code\ndef get_duplicates(self):\n warnings.warn(\"'get_duplicates' is deprecated and will be removed in \"\n \"a future release. 
You can use \"\n \"idx[idx.duplicated()].unique() instead\",\n FutureWarning, stacklevel=2)\n\n return self[self.duplicated()].unique()", "response": "Return a list of duplicate index elements."} {"instruction": "Write a Python 3 function that can\nreturn an index containing unique values.", "response": "def _get_unique_index(self, dropna=False):\n \"\"\"\n Returns an index containing unique values.\n\n Parameters\n ----------\n dropna : bool\n If True, NaN values are dropped.\n\n Returns\n -------\n uniques : index\n \"\"\"\n if self.is_unique and not dropna:\n return self\n\n values = self.values\n\n if not self.is_unique:\n values = self.unique()\n\n if dropna:\n try:\n if self.hasnans:\n values = values[~isna(values)]\n except NotImplementedError:\n pass\n\n return self._shallow_copy(values)"} {"instruction": "Implement a Python 3 function for\nreturning a shallow copy of self unless the name changes in which the set operation will be self.", "response": "def _get_reconciled_name_object(self, other):\n \"\"\"\n If the result of a set operation will be self,\n return self, unless the name changes, in which\n case make a shallow copy of self.\n \"\"\"\n name = get_op_result_name(self, other)\n if self.name != name:\n return self._shallow_copy(name=name)\n return self"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef union(self, other, sort=None):\n self._validate_sort_keyword(sort)\n self._assert_can_do_setop(other)\n other = ensure_index(other)\n\n if len(other) == 0 or self.equals(other):\n return self._get_reconciled_name_object(other)\n\n if len(self) == 0:\n return other._get_reconciled_name_object(self)\n\n # TODO: is_dtype_union_equal is a hack around\n # 1. buggy set ops with duplicates (GH #13432)\n # 2. 
CategoricalIndex lacking setops (GH #10186)\n # Once those are fixed, this workaround can be removed\n if not is_dtype_union_equal(self.dtype, other.dtype):\n this = self.astype('O')\n other = other.astype('O')\n return this.union(other, sort=sort)\n\n # TODO(EA): setops-refactor, clean all this up\n if is_period_dtype(self) or is_datetime64tz_dtype(self):\n lvals = self._ndarray_values\n else:\n lvals = self._values\n if is_period_dtype(other) or is_datetime64tz_dtype(other):\n rvals = other._ndarray_values\n else:\n rvals = other._values\n\n if sort is None and self.is_monotonic and other.is_monotonic:\n try:\n result = self._outer_indexer(lvals, rvals)[0]\n except TypeError:\n # incomparable objects\n result = list(lvals)\n\n # worth making this faster? a very unusual case\n value_set = set(lvals)\n result.extend([x for x in rvals if x not in value_set])\n else:\n indexer = self.get_indexer(other)\n indexer, = (indexer == -1).nonzero()\n\n if len(indexer) > 0:\n other_diff = algos.take_nd(rvals, indexer,\n allow_fill=False)\n result = _concat._concat_compat((lvals, other_diff))\n\n else:\n result = lvals\n\n if sort is None:\n try:\n result = sorting.safe_sort(result)\n except TypeError as e:\n warnings.warn(\"{}, sort order is undefined for \"\n \"incomparable objects\".format(e),\n RuntimeWarning, stacklevel=3)\n\n # for subclasses\n return self._wrap_setop_result(other, result)", "response": "Return the union of two Index objects."} {"instruction": "Implement a function in Python 3 to\nreturn the intersection of two Index objects.", "response": "def intersection(self, other, sort=False):\n \"\"\"\n Form the intersection of two Index objects.\n\n This returns a new Index with elements common to the index and `other`.\n\n Parameters\n ----------\n other : Index or array-like\n sort : False or None, default False\n Whether to sort the resulting index.\n\n * False : do not sort the result.\n * None : sort the result, except when `self` and `other` are equal\n or 
when the values cannot be compared.\n\n .. versionadded:: 0.24.0\n\n .. versionchanged:: 0.24.1\n\n Changed the default from ``True`` to ``False``, to match\n the behaviour of 0.23.4 and earlier.\n\n Returns\n -------\n intersection : Index\n\n Examples\n --------\n\n >>> idx1 = pd.Index([1, 2, 3, 4])\n >>> idx2 = pd.Index([3, 4, 5, 6])\n >>> idx1.intersection(idx2)\n Int64Index([3, 4], dtype='int64')\n \"\"\"\n self._validate_sort_keyword(sort)\n self._assert_can_do_setop(other)\n other = ensure_index(other)\n\n if self.equals(other):\n return self._get_reconciled_name_object(other)\n\n if not is_dtype_equal(self.dtype, other.dtype):\n this = self.astype('O')\n other = other.astype('O')\n return this.intersection(other, sort=sort)\n\n # TODO(EA): setops-refactor, clean all this up\n if is_period_dtype(self):\n lvals = self._ndarray_values\n else:\n lvals = self._values\n if is_period_dtype(other):\n rvals = other._ndarray_values\n else:\n rvals = other._values\n\n if self.is_monotonic and other.is_monotonic:\n try:\n result = self._inner_indexer(lvals, rvals)[0]\n return self._wrap_setop_result(other, result)\n except TypeError:\n pass\n\n try:\n indexer = Index(rvals).get_indexer(lvals)\n indexer = indexer.take((indexer != -1).nonzero()[0])\n except Exception:\n # duplicates\n indexer = algos.unique1d(\n Index(rvals).get_indexer_non_unique(lvals)[0])\n indexer = indexer[indexer != -1]\n\n taken = other.take(indexer)\n\n if sort is None:\n taken = sorting.safe_sort(taken.values)\n if self.name != other.name:\n name = None\n else:\n name = self.name\n return self._shallow_copy(taken, name=name)\n\n if self.name != other.name:\n taken.name = None\n\n return taken"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef difference(self, other, sort=None):\n self._validate_sort_keyword(sort)\n self._assert_can_do_setop(other)\n\n if self.equals(other):\n # pass an empty np.ndarray with the appropriate dtype\n return 
self._shallow_copy(self._data[:0])\n\n other, result_name = self._convert_can_do_setop(other)\n\n this = self._get_unique_index()\n\n indexer = this.get_indexer(other)\n indexer = indexer.take((indexer != -1).nonzero()[0])\n\n label_diff = np.setdiff1d(np.arange(this.size), indexer,\n assume_unique=True)\n the_diff = this.values.take(label_diff)\n if sort is None:\n try:\n the_diff = sorting.safe_sort(the_diff)\n except TypeError:\n pass\n\n return this._shallow_copy(the_diff, name=result_name, freq=None)", "response": "Return a new Index with elements from the index that are not in the other."} {"instruction": "Create a Python 3 function to\ncompute the symmetric difference of two Index objects.", "response": "def symmetric_difference(self, other, result_name=None, sort=None):\n \"\"\"\n Compute the symmetric difference of two Index objects.\n\n Parameters\n ----------\n other : Index or array-like\n result_name : str\n sort : False or None, default None\n Whether to sort the resulting index. By default, the\n values are attempted to be sorted, but any TypeError from\n incomparable elements is caught by pandas.\n\n * None : Attempt to sort the result, but catch any TypeErrors\n from comparing incomparable elements.\n * False : Do not sort the result.\n\n .. versionadded:: 0.24.0\n\n .. versionchanged:: 0.24.1\n\n Changed the default value from ``True`` to ``None``\n (without change in behaviour).\n\n Returns\n -------\n symmetric_difference : Index\n\n Notes\n -----\n ``symmetric_difference`` contains elements that appear in either\n ``idx1`` or ``idx2`` but not both. 
Equivalent to the Index created by\n ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates\n dropped.\n\n Examples\n --------\n >>> idx1 = pd.Index([1, 2, 3, 4])\n >>> idx2 = pd.Index([2, 3, 4, 5])\n >>> idx1.symmetric_difference(idx2)\n Int64Index([1, 5], dtype='int64')\n\n You can also use the ``^`` operator:\n\n >>> idx1 ^ idx2\n Int64Index([1, 5], dtype='int64')\n \"\"\"\n self._validate_sort_keyword(sort)\n self._assert_can_do_setop(other)\n other, result_name_update = self._convert_can_do_setop(other)\n if result_name is None:\n result_name = result_name_update\n\n this = self._get_unique_index()\n other = other._get_unique_index()\n indexer = this.get_indexer(other)\n\n # {this} minus {other}\n common_indexer = indexer.take((indexer != -1).nonzero()[0])\n left_indexer = np.setdiff1d(np.arange(this.size), common_indexer,\n assume_unique=True)\n left_diff = this.values.take(left_indexer)\n\n # {other} minus {this}\n right_indexer = (indexer == -1).nonzero()[0]\n right_diff = other.values.take(right_indexer)\n\n the_diff = _concat._concat_compat([left_diff, right_diff])\n if sort is None:\n try:\n the_diff = sorting.safe_sort(the_diff)\n except TypeError:\n pass\n\n attribs = self._get_attributes_dict()\n attribs['name'] = result_name\n if 'freq' in attribs:\n attribs['freq'] = None\n return self._shallow_copy_with_infer(the_diff, **attribs)"} {"instruction": "Explain what the following Python 3 code does\ndef _get_fill_indexer_searchsorted(self, target, method, limit=None):\n if limit is not None:\n raise ValueError('limit argument for %r method only well-defined '\n 'if index and target are monotonic' % method)\n\n side = 'left' if method == 'pad' else 'right'\n\n # find exact matches first (this simplifies the algorithm)\n indexer = self.get_indexer(target)\n nonexact = (indexer == -1)\n indexer[nonexact] = self._searchsorted_monotonic(target[nonexact],\n side)\n if side == 'left':\n # searchsorted returns \"indices into a sorted array such 
that,\n # if the corresponding elements in v were inserted before the\n # indices, the order of a would be preserved\".\n # Thus, we need to subtract 1 to find values to the left.\n indexer[nonexact] -= 1\n # This also mapped not found values (values of 0 from\n # np.searchsorted) to -1, which conveniently is also our\n # sentinel for missing values\n else:\n # Mark indices to the right of the largest value as not found\n indexer[indexer == len(self)] = -1\n return indexer", "response": "This is a fallback method that works for monotonic decreasing\n indexes and non - monotonic targets."} {"instruction": "Implement a Python 3 function for\ngetting the indexer for the nearest index labels.", "response": "def _get_nearest_indexer(self, target, limit, tolerance):\n \"\"\"\n Get the indexer for the nearest index labels; requires an index with\n values that can be subtracted from each other (e.g., not strings or\n tuples).\n \"\"\"\n left_indexer = self.get_indexer(target, 'pad', limit=limit)\n right_indexer = self.get_indexer(target, 'backfill', limit=limit)\n\n target = np.asarray(target)\n left_distances = abs(self.values[left_indexer] - target)\n right_distances = abs(self.values[right_indexer] - target)\n\n op = operator.lt if self.is_monotonic_increasing else operator.le\n indexer = np.where(op(left_distances, right_distances) |\n (right_indexer == -1), left_indexer, right_indexer)\n if tolerance is not None:\n indexer = self._filter_indexer_tolerance(target, indexer,\n tolerance)\n return indexer"} {"instruction": "Write a Python 3 script for\nconverting a list - like index or array to a numpy. 
ndarray.", "response": "def _convert_listlike_indexer(self, keyarr, kind=None):\n \"\"\"\n Parameters\n ----------\n keyarr : list-like\n Indexer to convert.\n\n Returns\n -------\n indexer : numpy.ndarray or None\n Return an ndarray or None if cannot convert.\n keyarr : numpy.ndarray\n Return tuple-safe keys.\n \"\"\"\n if isinstance(keyarr, Index):\n keyarr = self._convert_index_indexer(keyarr)\n else:\n keyarr = self._convert_arr_indexer(keyarr)\n\n indexer = self._convert_list_indexer(keyarr, kind=kind)\n return indexer, keyarr"} {"instruction": "Make a summary of the following Python 3 code\ndef _invalid_indexer(self, form, key):\n raise TypeError(\"cannot do {form} indexing on {klass} with these \"\n \"indexers [{key}] of {kind}\".format(\n form=form, klass=type(self), key=key,\n kind=type(key)))", "response": "Raises a TypeError if the key is not a valid index."} {"instruction": "Implement a function in Python 3 to\ncreate index with target's values (move/add/delete values as necessary). Parameters ---------- target : an iterable Returns ------- new_index : pd.Index Resulting index. indexer : np.ndarray or None Indices of output values in original index.", "response": "def reindex(self, target, method=None, level=None, limit=None,\n tolerance=None):\n \"\"\"\n Create index with target's values (move/add/delete values\n as necessary).\n\n Parameters\n ----------\n target : an iterable\n\n Returns\n -------\n new_index : pd.Index\n Resulting index.\n indexer : np.ndarray or None\n Indices of output values in original index.\n \"\"\"\n # GH6552: preserve names when reindexing to non-named target\n # (i.e. 
neither Index nor Series).\n preserve_names = not hasattr(target, 'name')\n\n # GH7774: preserve dtype/tz if target is empty and not an Index.\n target = _ensure_has_len(target) # target may be an iterator\n\n if not isinstance(target, Index) and len(target) == 0:\n attrs = self._get_attributes_dict()\n attrs.pop('freq', None) # don't preserve freq\n values = self._data[:0] # appropriately-dtyped empty array\n target = self._simple_new(values, dtype=self.dtype, **attrs)\n else:\n target = ensure_index(target)\n\n if level is not None:\n if method is not None:\n raise TypeError('Fill method not supported if level passed')\n _, indexer, _ = self._join_level(target, level, how='right',\n return_indexers=True)\n else:\n if self.equals(target):\n indexer = None\n else:\n\n if self.is_unique:\n indexer = self.get_indexer(target, method=method,\n limit=limit,\n tolerance=tolerance)\n else:\n if method is not None or limit is not None:\n raise ValueError(\"cannot reindex a non-unique index \"\n \"with a method or limit\")\n indexer, missing = self.get_indexer_non_unique(target)\n\n if preserve_names and target.nlevels == 1 and target.name != self.name:\n target = target.copy()\n target.name = self.name\n\n return target, indexer"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _reindex_non_unique(self, target):\n\n target = ensure_index(target)\n indexer, missing = self.get_indexer_non_unique(target)\n check = indexer != -1\n new_labels = self.take(indexer[check])\n new_indexer = None\n\n if len(missing):\n length = np.arange(len(indexer))\n\n missing = ensure_platform_int(missing)\n missing_labels = target.take(missing)\n missing_indexer = ensure_int64(length[~check])\n cur_labels = self.take(indexer[check]).values\n cur_indexer = ensure_int64(length[check])\n\n new_labels = np.empty(tuple([len(indexer)]), dtype=object)\n new_labels[cur_indexer] = cur_labels\n new_labels[missing_indexer] = missing_labels\n\n # a unique indexer\n if 
target.is_unique:\n\n # see GH5553, make sure we use the right indexer\n new_indexer = np.arange(len(indexer))\n new_indexer[cur_indexer] = np.arange(len(cur_labels))\n new_indexer[missing_indexer] = -1\n\n # we have a non_unique selector, need to use the original\n # indexer here\n else:\n\n # need to retake to have the same size as the indexer\n indexer[~check] = -1\n\n # reset the new indexer to account for the new size\n new_indexer = np.arange(len(self.take(indexer)))\n new_indexer[~check] = -1\n\n new_index = self._shallow_copy_with_infer(new_labels, freq=None)\n return new_index, indexer, new_indexer", "response": "Create a new index with target s values and a possibly non - unique target."} {"instruction": "Create a Python 3 function to\ntry to convert an array of data into an integer index.", "response": "def _try_convert_to_int_index(cls, data, copy, name, dtype):\n \"\"\"\n Attempt to convert an array of data into an integer index.\n\n Parameters\n ----------\n data : The data to convert.\n copy : Whether to copy the data or not.\n name : The name of the index returned.\n\n Returns\n -------\n int_index : data converted to either an Int64Index or a\n UInt64Index\n\n Raises\n ------\n ValueError if the conversion was not successful.\n \"\"\"\n\n from .numeric import Int64Index, UInt64Index\n if not is_unsigned_integer_dtype(dtype):\n # skip int64 conversion attempt if uint-like dtype is passed, as\n # this could return Int64Index when UInt64Index is what's desrired\n try:\n res = data.astype('i8', copy=False)\n if (res == data).all():\n return Int64Index(res, copy=copy, name=name)\n except (OverflowError, TypeError, ValueError):\n pass\n\n # Conversion to int64 failed (possibly due to overflow) or was skipped,\n # so let's try now with uint64.\n try:\n res = data.astype('u8', copy=False)\n if (res == data).all():\n return UInt64Index(res, copy=copy, name=name)\n except (OverflowError, TypeError, ValueError):\n pass\n\n raise ValueError"} {"instruction": 
"Create a Python 3 function for\ncoercing data to ndarray.", "response": "def _coerce_to_ndarray(cls, data):\n \"\"\"\n Coerces data to ndarray.\n\n Converts other iterables to list first and then to array.\n Does not touch ndarrays.\n\n Raises\n ------\n TypeError\n When the data passed in is a scalar.\n \"\"\"\n\n if not isinstance(data, (np.ndarray, Index)):\n if data is None or is_scalar(data):\n cls._scalar_data_error(data)\n\n # other iterable of some kind\n if not isinstance(data, (ABCSeries, list, tuple)):\n data = list(data)\n data = np.asarray(data)\n return data"} {"instruction": "Can you implement a function in Python 3 that\ncoerces a scalar item to an index.", "response": "def _coerce_scalar_to_index(self, item):\n \"\"\"\n We need to coerce a scalar to a compat for our index type.\n\n Parameters\n ----------\n item : scalar item to coerce\n \"\"\"\n dtype = self.dtype\n\n if self._is_numeric_dtype and isna(item):\n # We can't coerce to the numeric dtype of \"self\" (unless\n # it's float) if there are NaN values in our output.\n dtype = None\n\n return Index([item], dtype=dtype, **self._get_attributes_dict())"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _assert_can_do_op(self, value):\n if not is_scalar(value):\n msg = \"'value' must be a scalar, passed: {0}\"\n raise TypeError(msg.format(type(value).__name__))", "response": "Check value is valid for scalar op."} {"instruction": "Implement a function in Python 3 to\ncheck if the name is a Python identifier and if so return True.", "response": "def _can_hold_identifiers_and_holds_name(self, name):\n \"\"\"\n Faster check for ``name in self`` when we know `name` is a Python\n identifier (e.g. in NDFrame.__getattr__, which hits this to support\n . key lookup). 
For indexes that can't hold identifiers (everything\n but object & categorical) we just return False.\n\n https://github.com/pandas-dev/pandas/issues/19764\n \"\"\"\n if self.is_object() or self.is_categorical():\n return name in self\n return False"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef append(self, other):\n\n to_concat = [self]\n\n if isinstance(other, (list, tuple)):\n to_concat = to_concat + list(other)\n else:\n to_concat.append(other)\n\n for obj in to_concat:\n if not isinstance(obj, Index):\n raise TypeError('all inputs must be Index')\n\n names = {obj.name for obj in to_concat}\n name = None if len(names) > 1 else self.name\n\n return self._concat(to_concat, name)", "response": "Append a collection of Index options together."} {"instruction": "How would you code a function in Python 3 to\nput a value into the Index of the values set with the mask.", "response": "def putmask(self, mask, value):\n \"\"\"\n Return a new Index of the values set with the mask.\n\n See Also\n --------\n numpy.ndarray.putmask\n \"\"\"\n values = self.values.copy()\n try:\n np.putmask(values, mask, self._convert_for_op(value))\n return self._shallow_copy(values)\n except (ValueError, TypeError) as err:\n if is_object_dtype(self):\n raise err\n\n # coerces to object\n return self.astype(object).putmask(mask, value)"} {"instruction": "Explain what the following Python 3 code does\ndef equals(self, other):\n if self.is_(other):\n return True\n\n if not isinstance(other, Index):\n return False\n\n if is_object_dtype(self) and not is_object_dtype(other):\n # if other is not object, use other's logic for coercion\n return other.equals(self)\n\n try:\n return array_equivalent(com.values_from_object(self),\n com.values_from_object(other))\n except Exception:\n return False", "response": "Determine if two Index objects contain the same elements."} {"instruction": "How would you explain what the following Python 3 function does\ndef 
asof(self, label):\n try:\n loc = self.get_loc(label, method='pad')\n except KeyError:\n return self._na_value\n else:\n if isinstance(loc, slice):\n loc = loc.indices(len(self))[-1]\n return self[loc]", "response": "Return the label from the index or the previous one if not present."} {"instruction": "How would you implement a function in Python 3 that\nreturns the locations of the original entry in the index for the asof function.", "response": "def asof_locs(self, where, mask):\n \"\"\"\n Find the locations (indices) of the labels from the index for\n every entry in the `where` argument.\n\n As in the `asof` function, if the label (a particular entry in\n `where`) is not in the index, the latest index label upto the\n passed label is chosen and its index returned.\n\n If all of the labels in the index are later than a label in `where`,\n -1 is returned.\n\n `mask` is used to ignore NA values in the index during calculation.\n\n Parameters\n ----------\n where : Index\n An Index consisting of an array of timestamps.\n mask : array-like\n Array of booleans denoting where values in the original\n data are not NA.\n\n Returns\n -------\n numpy.ndarray\n An array of locations (indices) of the labels from the Index\n which correspond to the return values of the `asof` function\n for every element in `where`.\n \"\"\"\n locs = self.values[mask].searchsorted(where.values, side='right')\n locs = np.where(locs > 0, locs - 1, 0)\n\n result = np.arange(len(self))[mask].take(locs)\n\n first = mask.argmax()\n result[(locs == 0) & (where.values < self.values[first])] = -1\n\n return result"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef sort_values(self, return_indexer=False, ascending=True):\n _as = self.argsort()\n if not ascending:\n _as = _as[::-1]\n\n sorted_index = self.take(_as)\n\n if return_indexer:\n return sorted_index, _as\n else:\n return sorted_index", "response": "Return a sorted copy of the index and optionally return the 
indices that would sort the index itself."} {"instruction": "Write a Python 3 script to\nreturn the integer indices that would sort the index.", "response": "def argsort(self, *args, **kwargs):\n \"\"\"\n Return the integer indices that would sort the index.\n\n Parameters\n ----------\n *args\n Passed to `numpy.ndarray.argsort`.\n **kwargs\n Passed to `numpy.ndarray.argsort`.\n\n Returns\n -------\n numpy.ndarray\n Integer indices that would sort the index if used as\n an indexer.\n\n See Also\n --------\n numpy.argsort : Similar method for NumPy arrays.\n Index.sort_values : Return sorted copy of Index.\n\n Examples\n --------\n >>> idx = pd.Index(['b', 'a', 'd', 'c'])\n >>> idx\n Index(['b', 'a', 'd', 'c'], dtype='object')\n\n >>> order = idx.argsort()\n >>> order\n array([1, 0, 3, 2])\n\n >>> idx[order]\n Index(['a', 'b', 'c', 'd'], dtype='object')\n \"\"\"\n result = self.asi8\n if result is None:\n result = np.array(self)\n return result.argsort(*args, **kwargs)"} {"instruction": "Can you write a function in Python 3 where it\ngets the value of a key from a 1 - dimensional ndarray.", "response": "def get_value(self, series, key):\n \"\"\"\n Fast lookup of value from 1-dimensional ndarray. Only use this if you\n know what you're doing.\n \"\"\"\n\n # if we have something that is Index-like, then\n # use this, e.g. 
DatetimeIndex\n # Things like `Series._get_value` (via .at) pass the EA directly here.\n s = getattr(series, '_values', series)\n if isinstance(s, (ExtensionArray, Index)) and is_scalar(key):\n # GH 20882, 21257\n # Unify Index and ExtensionArray treatment\n # First try to convert the key to a location\n # If that fails, raise a KeyError if an integer\n # index, otherwise, see if key is an integer, and\n # try that\n try:\n iloc = self.get_loc(key)\n return s[iloc]\n except KeyError:\n if (len(self) > 0 and\n (self.holds_integer() or self.is_boolean())):\n raise\n elif is_integer(key):\n return s[key]\n\n s = com.values_from_object(series)\n k = com.values_from_object(key)\n\n k = self._convert_scalar_indexer(k, kind='getitem')\n try:\n return self._engine.get_value(s, k,\n tz=getattr(series.dtype, 'tz', None))\n except KeyError as e1:\n if len(self) > 0 and (self.holds_integer() or self.is_boolean()):\n raise\n\n try:\n return libindex.get_value_box(s, key)\n except IndexError:\n raise\n except TypeError:\n # generator/iterator-like\n if is_iterator(key):\n raise InvalidIndexError(key)\n else:\n raise e1\n except Exception: # pragma: no cover\n raise e1\n except TypeError:\n # python 3\n if is_scalar(key): # pragma: no cover\n raise IndexError(key)\n raise InvalidIndexError(key)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef set_value(self, arr, key, value):\n self._engine.set_value(com.values_from_object(arr),\n com.values_from_object(key), value)", "response": "Set the value of a key in a 1 - dimensional ndarray."} {"instruction": "Implement a function in Python 3 to\nreturn an indexer for the given target.", "response": "def get_indexer_for(self, target, **kwargs):\n \"\"\"\n Guaranteed return of an indexer even when non-unique.\n\n This dispatches to get_indexer or get_indexer_nonunique\n as appropriate.\n \"\"\"\n if self.is_unique:\n return self.get_indexer(target, **kwargs)\n indexer, _ = 
self.get_indexer_non_unique(target, **kwargs)\n return indexer"} {"instruction": "Can you write a function in Python 3 where it\ngroups the index labels by a given array of values.", "response": "def groupby(self, values):\n \"\"\"\n Group the index labels by a given array of values.\n\n Parameters\n ----------\n values : array\n Values used to determine the groups.\n\n Returns\n -------\n groups : dict\n {group name -> group labels}\n \"\"\"\n\n # TODO: if we are a MultiIndex, we can do better\n # that converting to tuples\n if isinstance(values, ABCMultiIndex):\n values = values.values\n values = ensure_categorical(values)\n result = values._reverse_indexer()\n\n # map to the label\n result = {k: self.take(v) for k, v in result.items()}\n\n return result"} {"instruction": "Can you implement a function in Python 3 that\nmaps the values of the index to the corresponding values in the input dictionary or Series or function.", "response": "def map(self, mapper, na_action=None):\n \"\"\"\n Map values using input correspondence (a dict, Series, or function).\n\n Parameters\n ----------\n mapper : function, dict, or Series\n Mapping correspondence.\n na_action : {None, 'ignore'}\n If 'ignore', propagate NA values, without passing them to the\n mapping correspondence.\n\n Returns\n -------\n applied : Union[Index, MultiIndex], inferred\n The output of the mapping function applied to the index.\n If the function returns a tuple with more than one element\n a MultiIndex will be returned.\n \"\"\"\n\n from .multi import MultiIndex\n new_values = super()._map_values(mapper, na_action=na_action)\n\n attributes = self._get_attributes_dict()\n\n # we can return a MultiIndex\n if new_values.size and isinstance(new_values[0], tuple):\n if isinstance(self, MultiIndex):\n names = self.names\n elif attributes.get('name'):\n names = [attributes.get('name')] * len(new_values[0])\n else:\n names = None\n return MultiIndex.from_tuples(new_values,\n names=names)\n\n attributes['copy'] = 
False\n if not new_values.size:\n # empty\n attributes['dtype'] = self.dtype\n\n return Index(new_values, **attributes)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef isin(self, values, level=None):\n if level is not None:\n self._validate_index_level(level)\n return algos.isin(self, values)", "response": "Return a boolean array where the index values are in values."} {"instruction": "Implement a function in Python 3 to\ntry to cast the key to an int if equivalent.", "response": "def _maybe_cast_indexer(self, key):\n \"\"\"\n If we have a float key and are not a floating index, then try to cast\n to an int if equivalent.\n \"\"\"\n\n if is_float(key) and not self.is_floating():\n try:\n ckey = int(key)\n if ckey == key:\n key = ckey\n except (OverflowError, ValueError, TypeError):\n pass\n return key"} {"instruction": "Can you create a Python 3 function that\nvalidates that we have appropriate indexer for the key.", "response": "def _validate_indexer(self, form, key, kind):\n \"\"\"\n If we are positional indexer, validate that we have appropriate\n typed bounds must be an integer.\n \"\"\"\n assert kind in ['ix', 'loc', 'getitem', 'iloc']\n\n if key is None:\n pass\n elif is_integer(key):\n pass\n elif kind in ['iloc', 'getitem']:\n self._invalid_indexer(form, key)\n return key"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef get_slice_bound(self, label, side, kind):\n assert kind in ['ix', 'loc', 'getitem', None]\n\n if side not in ('left', 'right'):\n raise ValueError(\"Invalid value for side kwarg,\"\n \" must be either 'left' or 'right': %s\" %\n (side, ))\n\n original_label = label\n\n # For datetime indices label may be a string that has to be converted\n # to datetime boundary according to its resolution.\n label = self._maybe_cast_slice_bound(label, side, kind)\n\n # we need to look up the label\n try:\n slc = self._get_loc_only_exact_matches(label)\n except KeyError as 
err:\n try:\n return self._searchsorted_monotonic(label, side)\n except ValueError:\n # raise the original KeyError\n raise err\n\n if isinstance(slc, np.ndarray):\n # get_loc may return a boolean array or an array of indices, which\n # is OK as long as they are representable by a slice.\n if is_bool_dtype(slc):\n slc = lib.maybe_booleans_to_slice(slc.view('u1'))\n else:\n slc = lib.maybe_indices_to_slice(slc.astype('i8'), len(self))\n if isinstance(slc, np.ndarray):\n raise KeyError(\"Cannot get %s slice bound for non-unique \"\n \"label: %r\" % (side, original_label))\n\n if isinstance(slc, slice):\n if side == 'left':\n return slc.start\n else:\n return slc.stop\n else:\n if side == 'right':\n return slc + 1\n else:\n return slc", "response": "Calculate slice bound that corresponds to given label."} {"instruction": "Write a Python 3 script for\ncomputing slice locations for a single label.", "response": "def slice_locs(self, start=None, end=None, step=None, kind=None):\n \"\"\"\n Compute slice locations for input labels.\n\n Parameters\n ----------\n start : label, default None\n If None, defaults to the beginning\n end : label, default None\n If None, defaults to the end\n step : int, defaults None\n If None, defaults to 1\n kind : {'ix', 'loc', 'getitem'} or None\n\n Returns\n -------\n start, end : int\n\n See Also\n --------\n Index.get_loc : Get location for a single label.\n\n Notes\n -----\n This method only works if the index is monotonic or unique.\n\n Examples\n ---------\n >>> idx = pd.Index(list('abcd'))\n >>> idx.slice_locs(start='b', end='c')\n (1, 3)\n \"\"\"\n inc = (step is None or step >= 0)\n\n if not inc:\n # If it's a reverse slice, temporarily swap bounds.\n start, end = end, start\n\n # GH 16785: If start and end happen to be date strings with UTC offsets\n # attempt to parse and check that the offsets are the same\n if (isinstance(start, (str, datetime))\n and isinstance(end, (str, datetime))):\n try:\n ts_start = Timestamp(start)\n 
ts_end = Timestamp(end)\n except (ValueError, TypeError):\n pass\n else:\n if not tz_compare(ts_start.tzinfo, ts_end.tzinfo):\n raise ValueError(\"Both dates must have the \"\n \"same UTC offset\")\n\n start_slice = None\n if start is not None:\n start_slice = self.get_slice_bound(start, 'left', kind)\n if start_slice is None:\n start_slice = 0\n\n end_slice = None\n if end is not None:\n end_slice = self.get_slice_bound(end, 'right', kind)\n if end_slice is None:\n end_slice = len(self)\n\n if not inc:\n # Bounds at this moment are swapped, swap them back and shift by 1.\n #\n # slice_locs('B', 'A', step=-1): s='B', e='A'\n #\n # s='A' e='B'\n # AFTER SWAP: | |\n # v ------------------> V\n # -----------------------------------\n # | | |A|A|A|A| | | | | |B|B| | | | |\n # -----------------------------------\n # ^ <------------------ ^\n # SHOULD BE: | |\n # end=s-1 start=e-1\n #\n end_slice, start_slice = start_slice - 1, end_slice - 1\n\n # i == -1 triggers ``len(self) + i`` selection that points to the\n # last element, not before-the-first one, subtracting len(self)\n # compensates that.\n if end_slice == -1:\n end_slice -= len(self)\n if start_slice == -1:\n start_slice -= len(self)\n\n return start_slice, end_slice"} {"instruction": "Write a Python 3 function that can\ndelete the entry at the specified location.", "response": "def delete(self, loc):\n \"\"\"\n Make new Index with passed location(-s) deleted.\n\n Returns\n -------\n new_index : Index\n \"\"\"\n return self._shallow_copy(np.delete(self._data, loc))"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef insert(self, loc, item):\n _self = np.asarray(self)\n item = self._coerce_scalar_to_index(item)._ndarray_values\n idx = np.concatenate((_self[:loc], item, _self[loc:]))\n return self._shallow_copy_with_infer(idx)", "response": "Make new Index inserting new item at location."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef 
drop(self, labels, errors='raise'):\n arr_dtype = 'object' if self.dtype == 'object' else None\n labels = com.index_labels_to_array(labels, dtype=arr_dtype)\n indexer = self.get_indexer(labels)\n mask = indexer == -1\n if mask.any():\n if errors != 'ignore':\n raise KeyError(\n '{} not found in axis'.format(labels[mask]))\n indexer = indexer[~mask]\n return self.delete(indexer)", "response": "Make new Index with passed list of labels deleted."} {"instruction": "Write a Python 3 script to\nadd in comparison methods.", "response": "def _add_comparison_methods(cls):\n \"\"\"\n Add in comparison methods.\n \"\"\"\n cls.__eq__ = _make_comparison_op(operator.eq, cls)\n cls.__ne__ = _make_comparison_op(operator.ne, cls)\n cls.__lt__ = _make_comparison_op(operator.lt, cls)\n cls.__gt__ = _make_comparison_op(operator.gt, cls)\n cls.__le__ = _make_comparison_op(operator.le, cls)\n cls.__ge__ = _make_comparison_op(operator.ge, cls)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _add_numeric_methods_add_sub_disabled(cls):\n cls.__add__ = make_invalid_op('__add__')\n cls.__radd__ = make_invalid_op('__radd__')\n cls.__iadd__ = make_invalid_op('__iadd__')\n cls.__sub__ = make_invalid_op('__sub__')\n cls.__rsub__ = make_invalid_op('__rsub__')\n cls.__isub__ = make_invalid_op('__isub__')", "response": "Add in the numeric add or sub methods to disable."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _add_numeric_methods_disabled(cls):\n cls.__pow__ = make_invalid_op('__pow__')\n cls.__rpow__ = make_invalid_op('__rpow__')\n cls.__mul__ = make_invalid_op('__mul__')\n cls.__rmul__ = make_invalid_op('__rmul__')\n cls.__floordiv__ = make_invalid_op('__floordiv__')\n cls.__rfloordiv__ = make_invalid_op('__rfloordiv__')\n cls.__truediv__ = make_invalid_op('__truediv__')\n cls.__rtruediv__ = make_invalid_op('__rtruediv__')\n cls.__mod__ = make_invalid_op('__mod__')\n cls.__divmod__ = 
make_invalid_op('__divmod__')\n cls.__neg__ = make_invalid_op('__neg__')\n cls.__pos__ = make_invalid_op('__pos__')\n cls.__abs__ = make_invalid_op('__abs__')\n cls.__inv__ = make_invalid_op('__inv__')", "response": "Add in numeric methods to disable other than add or sub."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _validate_for_numeric_unaryop(self, op, opstr):\n if not self._is_numeric_dtype:\n raise TypeError(\"cannot evaluate a numeric op \"\n \"{opstr} for type: {typ}\"\n .format(opstr=opstr, typ=type(self).__name__))", "response": "Validate if we can perform a numeric unary operation."} {"instruction": "How would you implement a function in Python 3 that\nvalidates that the other is of the appropriate type for the numeric binop.", "response": "def _validate_for_numeric_binop(self, other, op):\n \"\"\"\n Return valid other; evaluate or raise TypeError if we are not of\n the appropriate type.\n\n Notes\n -----\n This is an internal method called by ops.\n \"\"\"\n opstr = '__{opname}__'.format(opname=op.__name__)\n # if we are an inheritor of numeric,\n # but not actually numeric (e.g. 
DatetimeIndex/PeriodIndex)\n if not self._is_numeric_dtype:\n raise TypeError(\"cannot evaluate a numeric op {opstr} \"\n \"for type: {typ}\"\n .format(opstr=opstr, typ=type(self).__name__))\n\n if isinstance(other, Index):\n if not other._is_numeric_dtype:\n raise TypeError(\"cannot evaluate a numeric op \"\n \"{opstr} with type: {typ}\"\n .format(opstr=opstr, typ=type(other)))\n elif isinstance(other, np.ndarray) and not other.ndim:\n other = other.item()\n\n if isinstance(other, (Index, ABCSeries, np.ndarray)):\n if len(self) != len(other):\n raise ValueError(\"cannot evaluate a numeric op with \"\n \"unequal lengths\")\n other = com.values_from_object(other)\n if other.dtype.kind not in ['f', 'i', 'u']:\n raise TypeError(\"cannot evaluate a numeric op \"\n \"with a non-numeric dtype\")\n elif isinstance(other, (ABCDateOffset, np.timedelta64, timedelta)):\n # higher up to handle\n pass\n elif isinstance(other, (datetime, np.datetime64)):\n # higher up to handle\n pass\n else:\n if not (is_float(other) or is_integer(other)):\n raise TypeError(\"can only perform ops with scalar values\")\n\n return other"} {"instruction": "Make a summary of the following Python 3 code\ndef _add_numeric_methods_binary(cls):\n cls.__add__ = _make_arithmetic_op(operator.add, cls)\n cls.__radd__ = _make_arithmetic_op(ops.radd, cls)\n cls.__sub__ = _make_arithmetic_op(operator.sub, cls)\n cls.__rsub__ = _make_arithmetic_op(ops.rsub, cls)\n cls.__rpow__ = _make_arithmetic_op(ops.rpow, cls)\n cls.__pow__ = _make_arithmetic_op(operator.pow, cls)\n\n cls.__truediv__ = _make_arithmetic_op(operator.truediv, cls)\n cls.__rtruediv__ = _make_arithmetic_op(ops.rtruediv, cls)\n\n # TODO: rmod? 
rdivmod?\n cls.__mod__ = _make_arithmetic_op(operator.mod, cls)\n cls.__floordiv__ = _make_arithmetic_op(operator.floordiv, cls)\n cls.__rfloordiv__ = _make_arithmetic_op(ops.rfloordiv, cls)\n cls.__divmod__ = _make_arithmetic_op(divmod, cls)\n cls.__mul__ = _make_arithmetic_op(operator.mul, cls)\n cls.__rmul__ = _make_arithmetic_op(ops.rmul, cls)", "response": "Add in numeric methods."} {"instruction": "How would you implement a function in Python 3 that\nadds in numeric unary methods.", "response": "def _add_numeric_methods_unary(cls):\n \"\"\"\n Add in numeric unary methods.\n \"\"\"\n def _make_evaluate_unary(op, opstr):\n\n def _evaluate_numeric_unary(self):\n\n self._validate_for_numeric_unaryop(op, opstr)\n attrs = self._get_attributes_dict()\n attrs = self._maybe_update_attributes(attrs)\n return Index(op(self.values), **attrs)\n\n _evaluate_numeric_unary.__name__ = opstr\n return _evaluate_numeric_unary\n\n cls.__neg__ = _make_evaluate_unary(operator.neg, '__neg__')\n cls.__pos__ = _make_evaluate_unary(operator.pos, '__pos__')\n cls.__abs__ = _make_evaluate_unary(np.abs, '__abs__')\n cls.__inv__ = _make_evaluate_unary(lambda x: -x, '__inv__')"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _add_logical_methods(cls):\n _doc = \"\"\"\n %(desc)s\n\n Parameters\n ----------\n *args\n These parameters will be passed to numpy.%(outname)s.\n **kwargs\n These parameters will be passed to numpy.%(outname)s.\n\n Returns\n -------\n %(outname)s : bool or array_like (if axis is specified)\n A single element array_like may be converted to bool.\"\"\"\n\n _index_shared_docs['index_all'] = dedent(\"\"\"\n\n See Also\n --------\n Index.any : Return whether any element in an Index is True.\n Series.any : Return whether any element in a Series is True.\n Series.all : Return whether all elements in a Series are True.\n\n Notes\n -----\n Not a Number (NaN), positive infinity and negative infinity\n evaluate to True because these 
are not equal to zero.\n\n Examples\n --------\n **all**\n\n True, because nonzero integers are considered True.\n\n >>> pd.Index([1, 2, 3]).all()\n True\n\n False, because ``0`` is considered False.\n\n >>> pd.Index([0, 1, 2]).all()\n False\n\n **any**\n\n True, because ``1`` is considered True.\n\n >>> pd.Index([0, 0, 1]).any()\n True\n\n False, because ``0`` is considered False.\n\n >>> pd.Index([0, 0, 0]).any()\n False\n \"\"\")\n\n _index_shared_docs['index_any'] = dedent(\"\"\"\n\n See Also\n --------\n Index.all : Return whether all elements are True.\n Series.all : Return whether all elements are True.\n\n Notes\n -----\n Not a Number (NaN), positive infinity and negative infinity\n evaluate to True because these are not equal to zero.\n\n Examples\n --------\n >>> index = pd.Index([0, 1, 2])\n >>> index.any()\n True\n\n >>> index = pd.Index([0, 0, 0])\n >>> index.any()\n False\n \"\"\")\n\n def _make_logical_function(name, desc, f):\n @Substitution(outname=name, desc=desc)\n @Appender(_index_shared_docs['index_' + name])\n @Appender(_doc)\n def logical_func(self, *args, **kwargs):\n result = f(self.values)\n if (isinstance(result, (np.ndarray, ABCSeries, Index)) and\n result.ndim == 0):\n # return NumPy type\n return result.dtype.type(result.item())\n else: # pragma: no cover\n return result\n\n logical_func.__name__ = name\n return logical_func\n\n cls.all = _make_logical_function('all', 'Return whether all elements '\n 'are True.',\n np.all)\n cls.any = _make_logical_function('any',\n 'Return whether any element is True.',\n np.any)", "response": "Add in logical methods to the log entry class."} {"instruction": "Create a Python 3 function to\ncreate and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. This may be composed of multiple Grouping objects, indicating multiple groupers Groupers are ultimately index mappings. 
They can originate as: index mappings, keys to columns, functions, or Groupers Groupers enable local references to axis,level,sort, while the passed in axis, level, and sort are 'global'. This routine tries to figure out what the passing in references are and then creates a Grouping for each one, combined into a BaseGrouper. If observed & we have a categorical grouper, only show the observed values If validate, then check for key/level overlaps", "response": "def _get_grouper(obj, key=None, axis=0, level=None, sort=True,\n observed=False, mutated=False, validate=True):\n \"\"\"\n create and return a BaseGrouper, which is an internal\n mapping of how to create the grouper indexers.\n This may be composed of multiple Grouping objects, indicating\n multiple groupers\n\n Groupers are ultimately index mappings. They can originate as:\n index mappings, keys to columns, functions, or Groupers\n\n Groupers enable local references to axis,level,sort, while\n the passed in axis, level, and sort are 'global'.\n\n This routine tries to figure out what the passing in references\n are and then creates a Grouping for each one, combined into\n a BaseGrouper.\n\n If observed & we have a categorical grouper, only show the observed\n values\n\n If validate, then check for key/level overlaps\n\n \"\"\"\n group_axis = obj._get_axis(axis)\n\n # validate that the passed single level is compatible with the passed\n # axis of the object\n if level is not None:\n # TODO: These if-block and else-block are almost same.\n # MultiIndex instance check is removable, but it seems that there are\n # some processes only for non-MultiIndex in else-block,\n # eg. `obj.index.name != level`. We have to consider carefully whether\n # these are applicable for MultiIndex. 
Even if these are applicable,\n # we need to check if it makes no side effect to subsequent processes\n # on the outside of this condition.\n # (GH 17621)\n if isinstance(group_axis, MultiIndex):\n if is_list_like(level) and len(level) == 1:\n level = level[0]\n\n if key is None and is_scalar(level):\n # Get the level values from group_axis\n key = group_axis.get_level_values(level)\n level = None\n\n else:\n # allow level to be a length-one list-like object\n # (e.g., level=[0])\n # GH 13901\n if is_list_like(level):\n nlevels = len(level)\n if nlevels == 1:\n level = level[0]\n elif nlevels == 0:\n raise ValueError('No group keys passed!')\n else:\n raise ValueError('multiple levels only valid with '\n 'MultiIndex')\n\n if isinstance(level, str):\n if obj.index.name != level:\n raise ValueError('level name {} is not the name of the '\n 'index'.format(level))\n elif level > 0 or level < -1:\n raise ValueError(\n 'level > 0 or level < -1 only valid with MultiIndex')\n\n # NOTE: `group_axis` and `group_axis.get_level_values(level)`\n # are same in this section.\n level = None\n key = group_axis\n\n # a passed-in Grouper, directly convert\n if isinstance(key, Grouper):\n binner, grouper, obj = key._get_grouper(obj, validate=False)\n if key.key is None:\n return grouper, [], obj\n else:\n return grouper, {key.key}, obj\n\n # already have a BaseGrouper, just return it\n elif isinstance(key, BaseGrouper):\n return key, [], obj\n\n # In the future, a tuple key will always mean an actual key,\n # not an iterable of keys. In the meantime, we attempt to provide\n # a warning. We can assume that the user wanted a list of keys when\n # the key is not in the index. We just have to be careful with\n # unhashble elements of `key`. 
Any unhashable elements implies that\n # they wanted a list of keys.\n # https://github.com/pandas-dev/pandas/issues/18314\n is_tuple = isinstance(key, tuple)\n all_hashable = is_tuple and is_hashable(key)\n\n if is_tuple:\n if ((all_hashable and key not in obj and set(key).issubset(obj))\n or not all_hashable):\n # column names ('a', 'b') -> ['a', 'b']\n # arrays like (a, b) -> [a, b]\n msg = (\"Interpreting tuple 'by' as a list of keys, rather than \"\n \"a single key. Use 'by=[...]' instead of 'by=(...)'. In \"\n \"the future, a tuple will always mean a single key.\")\n warnings.warn(msg, FutureWarning, stacklevel=5)\n key = list(key)\n\n if not isinstance(key, list):\n keys = [key]\n match_axis_length = False\n else:\n keys = key\n match_axis_length = len(keys) == len(group_axis)\n\n # what are we after, exactly?\n any_callable = any(callable(g) or isinstance(g, dict) for g in keys)\n any_groupers = any(isinstance(g, Grouper) for g in keys)\n any_arraylike = any(isinstance(g, (list, tuple, Series, Index, np.ndarray))\n for g in keys)\n\n # is this an index replacement?\n if (not any_callable and not any_arraylike and not any_groupers and\n match_axis_length and level is None):\n if isinstance(obj, DataFrame):\n all_in_columns_index = all(g in obj.columns or g in\n obj.index.names for g in keys)\n elif isinstance(obj, Series):\n all_in_columns_index = all(g in obj.index.names for g in keys)\n\n if not all_in_columns_index:\n keys = [com.asarray_tuplesafe(keys)]\n\n if isinstance(level, (tuple, list)):\n if key is None:\n keys = [None] * len(level)\n levels = level\n else:\n levels = [level] * len(keys)\n\n groupings = []\n exclusions = []\n\n # if the actual grouper should be obj[key]\n def is_in_axis(key):\n if not _is_label_like(key):\n try:\n obj._data.items.get_loc(key)\n except Exception:\n return False\n\n return True\n\n # if the grouper is obj[name]\n def is_in_obj(gpr):\n try:\n return id(gpr) == id(obj[gpr.name])\n except Exception:\n return False\n\n 
for i, (gpr, level) in enumerate(zip(keys, levels)):\n\n if is_in_obj(gpr): # df.groupby(df['name'])\n in_axis, name = True, gpr.name\n exclusions.append(name)\n\n elif is_in_axis(gpr): # df.groupby('name')\n if gpr in obj:\n if validate:\n obj._check_label_or_level_ambiguity(gpr)\n in_axis, name, gpr = True, gpr, obj[gpr]\n exclusions.append(name)\n elif obj._is_level_reference(gpr):\n in_axis, name, level, gpr = False, None, gpr, None\n else:\n raise KeyError(gpr)\n elif isinstance(gpr, Grouper) and gpr.key is not None:\n # Add key to exclusions\n exclusions.append(gpr.key)\n in_axis, name = False, None\n else:\n in_axis, name = False, None\n\n if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]:\n raise ValueError(\n (\"Length of grouper ({len_gpr}) and axis ({len_axis})\"\n \" must be same length\"\n .format(len_gpr=len(gpr), len_axis=obj.shape[axis])))\n\n # create the Grouping\n # allow us to passing the actual Grouping as the gpr\n ping = (Grouping(group_axis,\n gpr,\n obj=obj,\n name=name,\n level=level,\n sort=sort,\n observed=observed,\n in_axis=in_axis)\n if not isinstance(gpr, Grouping) else gpr)\n\n groupings.append(ping)\n\n if len(groupings) == 0:\n raise ValueError('No group keys passed!')\n\n # create the internals grouper\n grouper = BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated)\n return grouper, exclusions, obj"} {"instruction": "Can you write a function in Python 3 where it\nsets the internal grouper for this particular specification.", "response": "def _set_grouper(self, obj, sort=False):\n \"\"\"\n given an object and the specifications, setup the internal grouper\n for this particular specification\n\n Parameters\n ----------\n obj : the subject object\n sort : bool, default False\n whether the resulting grouper should be sorted\n \"\"\"\n\n if self.key is not None and self.level is not None:\n raise ValueError(\n \"The Grouper cannot specify both a key and a level!\")\n\n # Keep self.grouper value before 
overriding\n if self._grouper is None:\n self._grouper = self.grouper\n\n # the key must be a valid info item\n if self.key is not None:\n key = self.key\n # The 'on' is already defined\n if (getattr(self.grouper, 'name', None) == key and\n isinstance(obj, ABCSeries)):\n ax = self._grouper.take(obj.index)\n else:\n if key not in obj._info_axis:\n raise KeyError(\n \"The grouper name {0} is not found\".format(key))\n ax = Index(obj[key], name=key)\n\n else:\n ax = obj._get_axis(self.axis)\n if self.level is not None:\n level = self.level\n\n # if a level is given it must be a mi level or\n # equivalent to the axis name\n if isinstance(ax, MultiIndex):\n level = ax._get_level_number(level)\n ax = Index(ax._get_level_values(level),\n name=ax.names[level])\n\n else:\n if level not in (0, ax.name):\n raise ValueError(\n \"The level {0} is not valid\".format(level))\n\n # possibly sort\n if (self.sort or sort) and not ax.is_monotonic:\n # use stable sort to support first, last, nth\n indexer = self.indexer = ax.argsort(kind='mergesort')\n ax = ax.take(indexer)\n obj = obj._take(indexer, axis=self.axis, is_copy=False)\n\n self.obj = obj\n self.grouper = ax\n return self.grouper"} {"instruction": "Can you write a function in Python 3 where it\npickles object to file.", "response": "def to_pickle(obj, path, compression='infer',\n protocol=pickle.HIGHEST_PROTOCOL):\n \"\"\"\n Pickle (serialize) object to file.\n\n Parameters\n ----------\n obj : any object\n Any python object.\n path : str\n File path where the pickled object will be stored.\n compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'\n A string representing the compression to use in the output file. By\n default, infers from the file extension in specified path.\n\n .. versionadded:: 0.20.0\n protocol : int\n Int which indicates which protocol should be used by the pickler,\n default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). 
The possible\n values for this parameter depend on the version of Python. For Python\n 2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value.\n For Python >= 3.4, 4 is a valid value. A negative value for the\n protocol parameter is equivalent to setting its value to\n HIGHEST_PROTOCOL.\n\n .. [1] https://docs.python.org/3/library/pickle.html\n .. versionadded:: 0.21.0\n\n See Also\n --------\n read_pickle : Load pickled pandas object (or any object) from file.\n DataFrame.to_hdf : Write DataFrame to an HDF5 file.\n DataFrame.to_sql : Write DataFrame to a SQL database.\n DataFrame.to_parquet : Write a DataFrame to the binary parquet format.\n\n Examples\n --------\n >>> original_df = pd.DataFrame({\"foo\": range(5), \"bar\": range(5, 10)})\n >>> original_df\n foo bar\n 0 0 5\n 1 1 6\n 2 2 7\n 3 3 8\n 4 4 9\n >>> pd.to_pickle(original_df, \"./dummy.pkl\")\n\n >>> unpickled_df = pd.read_pickle(\"./dummy.pkl\")\n >>> unpickled_df\n foo bar\n 0 0 5\n 1 1 6\n 2 2 7\n 3 3 8\n 4 4 9\n\n >>> import os\n >>> os.remove(\"./dummy.pkl\")\n \"\"\"\n path = _stringify_path(path)\n f, fh = _get_handle(path, 'wb',\n compression=compression,\n is_text=False)\n if protocol < 0:\n protocol = pickle.HIGHEST_PROTOCOL\n try:\n f.write(pickle.dumps(obj, protocol=protocol))\n finally:\n f.close()\n for _f in fh:\n _f.close()"} {"instruction": "Explain what the following Python 3 code does\ndef read_pickle(path, compression='infer'):\n path = _stringify_path(path)\n f, fh = _get_handle(path, 'rb', compression=compression, is_text=False)\n\n # 1) try standard libary Pickle\n # 2) try pickle_compat (older pandas version) to handle subclass changes\n # 3) try pickle_compat with latin1 encoding\n\n try:\n with warnings.catch_warnings(record=True):\n # We want to silence any warnings about, e.g. 
moved modules.\n warnings.simplefilter(\"ignore\", Warning)\n return pickle.load(f)\n except Exception: # noqa: E722\n try:\n return pc.load(f, encoding=None)\n except Exception: # noqa: E722\n return pc.load(f, encoding='latin1')\n finally:\n f.close()\n for _f in fh:\n _f.close()", "response": "Load pickled pandas object from file."} {"instruction": "How would you implement a function in Python 3 that\nreturns a masking array of same size / shape as arr Addon with entries equaling any member of values_to_mask set to True Addon", "response": "def mask_missing(arr, values_to_mask):\n \"\"\"\n Return a masking array of same size/shape as arr\n with entries equaling any member of values_to_mask set to True\n \"\"\"\n dtype, values_to_mask = infer_dtype_from_array(values_to_mask)\n\n try:\n values_to_mask = np.array(values_to_mask, dtype=dtype)\n\n except Exception:\n values_to_mask = np.array(values_to_mask, dtype=object)\n\n na_mask = isna(values_to_mask)\n nonna = values_to_mask[~na_mask]\n\n mask = None\n for x in nonna:\n if mask is None:\n\n # numpy elementwise comparison warning\n if is_numeric_v_string_like(arr, x):\n mask = False\n else:\n mask = arr == x\n\n # if x is a string and arr is not, then we get False and we must\n # expand the mask to size arr.shape\n if is_scalar(mask):\n mask = np.zeros(arr.shape, dtype=bool)\n else:\n\n # numpy elementwise comparison warning\n if is_numeric_v_string_like(arr, x):\n mask |= False\n else:\n mask |= arr == x\n\n if na_mask.any():\n if mask is None:\n mask = isna(arr)\n else:\n mask |= isna(arr)\n\n # GH 21977\n if mask is None:\n mask = np.zeros(arr.shape, dtype=bool)\n\n return mask"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef interpolate_1d(xvalues, yvalues, method='linear', limit=None,\n limit_direction='forward', limit_area=None, fill_value=None,\n bounds_error=False, order=None, **kwargs):\n \"\"\"\n Logic for the 1-d interpolation. 
The result should be 1-d, inputs\n xvalues and yvalues will each be 1-d arrays of the same length.\n\n Bounds_error is currently hardcoded to False since non-scipy ones don't\n take it as an argument.\n \"\"\"\n # Treat the original, non-scipy methods first.\n\n invalid = isna(yvalues)\n valid = ~invalid\n\n if not valid.any():\n # have to call np.asarray(xvalues) since xvalues could be an Index\n # which can't be mutated\n result = np.empty_like(np.asarray(xvalues), dtype=np.float64)\n result.fill(np.nan)\n return result\n\n if valid.all():\n return yvalues\n\n if method == 'time':\n if not getattr(xvalues, 'is_all_dates', None):\n # if not issubclass(xvalues.dtype.type, np.datetime64):\n raise ValueError('time-weighted interpolation only works '\n 'on Series or DataFrames with a '\n 'DatetimeIndex')\n method = 'values'\n\n valid_limit_directions = ['forward', 'backward', 'both']\n limit_direction = limit_direction.lower()\n if limit_direction not in valid_limit_directions:\n msg = ('Invalid limit_direction: expecting one of {valid!r}, '\n 'got {invalid!r}.')\n raise ValueError(msg.format(valid=valid_limit_directions,\n invalid=limit_direction))\n\n if limit_area is not None:\n valid_limit_areas = ['inside', 'outside']\n limit_area = limit_area.lower()\n if limit_area not in valid_limit_areas:\n raise ValueError('Invalid limit_area: expecting one of {}, got '\n '{}.'.format(valid_limit_areas, limit_area))\n\n # default limit is unlimited GH #16282\n if limit is None:\n # limit = len(xvalues)\n pass\n elif not is_integer(limit):\n raise ValueError('Limit must be an integer')\n elif limit < 1:\n raise ValueError('Limit must be greater than 0')\n\n from pandas import Series\n ys = Series(yvalues)\n\n # These are sets of index pointers to invalid values... i.e. 
{0, 1, etc...\n all_nans = set(np.flatnonzero(invalid))\n start_nans = set(range(ys.first_valid_index()))\n end_nans = set(range(1 + ys.last_valid_index(), len(valid)))\n mid_nans = all_nans - start_nans - end_nans\n\n # Like the sets above, preserve_nans contains indices of invalid values,\n # but in this case, it is the final set of indices that need to be\n # preserved as NaN after the interpolation.\n\n # For example if limit_direction='forward' then preserve_nans will\n # contain indices of NaNs at the beginning of the series, and NaNs that\n # are more than'limit' away from the prior non-NaN.\n\n # set preserve_nans based on direction using _interp_limit\n if limit_direction == 'forward':\n preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))\n elif limit_direction == 'backward':\n preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))\n else:\n # both directions... just use _interp_limit\n preserve_nans = set(_interp_limit(invalid, limit, limit))\n\n # if limit_area is set, add either mid or outside indices\n # to preserve_nans GH #16284\n if limit_area == 'inside':\n # preserve NaNs on the outside\n preserve_nans |= start_nans | end_nans\n elif limit_area == 'outside':\n # preserve NaNs on the inside\n preserve_nans |= mid_nans\n\n # sort preserve_nans and covert to list\n preserve_nans = sorted(preserve_nans)\n\n xvalues = getattr(xvalues, 'values', xvalues)\n yvalues = getattr(yvalues, 'values', yvalues)\n result = yvalues.copy()\n\n if method in ['linear', 'time', 'index', 'values']:\n if method in ('values', 'index'):\n inds = np.asarray(xvalues)\n # hack for DatetimeIndex, #1646\n if needs_i8_conversion(inds.dtype.type):\n inds = inds.view(np.int64)\n if inds.dtype == np.object_:\n inds = lib.maybe_convert_objects(inds)\n else:\n inds = xvalues\n result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid])\n result[preserve_nans] = np.nan\n return result\n\n sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 
'cubic',\n 'barycentric', 'krogh', 'spline', 'polynomial',\n 'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima']\n\n if method in sp_methods:\n inds = np.asarray(xvalues)\n # hack for DatetimeIndex, #1646\n if issubclass(inds.dtype.type, np.datetime64):\n inds = inds.view(np.int64)\n result[invalid] = _interpolate_scipy_wrapper(inds[valid],\n yvalues[valid],\n inds[invalid],\n method=method,\n fill_value=fill_value,\n bounds_error=bounds_error,\n order=order, **kwargs)\n result[preserve_nans] = np.nan\n return result", "response": "Interpolate two 1 - d arrays of xvalues and yvalues."} {"instruction": "Can you implement a function in Python 3 that\nwraps for scipy. interpolate. interp1d.", "response": "def _interpolate_scipy_wrapper(x, y, new_x, method, fill_value=None,\n bounds_error=False, order=None, **kwargs):\n \"\"\"\n Passed off to scipy.interpolate.interp1d. method is scipy's kind.\n Returns an array interpolated at new_x. Add any new methods to\n the list in _clean_interp_method.\n \"\"\"\n try:\n from scipy import interpolate\n # TODO: Why is DatetimeIndex being imported here?\n from pandas import DatetimeIndex # noqa\n except ImportError:\n raise ImportError('{method} interpolation requires SciPy'\n .format(method=method))\n\n new_x = np.asarray(new_x)\n\n # ignores some kwargs that could be passed along.\n alt_methods = {\n 'barycentric': interpolate.barycentric_interpolate,\n 'krogh': interpolate.krogh_interpolate,\n 'from_derivatives': _from_derivatives,\n 'piecewise_polynomial': _from_derivatives,\n }\n\n if getattr(x, 'is_all_dates', False):\n # GH 5975, scipy.interp1d can't hande datetime64s\n x, new_x = x._values.astype('i8'), new_x.astype('i8')\n\n if method == 'pchip':\n try:\n alt_methods['pchip'] = interpolate.pchip_interpolate\n except AttributeError:\n raise ImportError(\"Your version of Scipy does not support \"\n \"PCHIP interpolation.\")\n elif method == 'akima':\n try:\n from scipy.interpolate import Akima1DInterpolator # 
noqa\n alt_methods['akima'] = _akima_interpolate\n except ImportError:\n raise ImportError(\"Your version of Scipy does not support \"\n \"Akima interpolation.\")\n\n interp1d_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic',\n 'polynomial']\n if method in interp1d_methods:\n if method == 'polynomial':\n method = order\n terp = interpolate.interp1d(x, y, kind=method, fill_value=fill_value,\n bounds_error=bounds_error)\n new_y = terp(new_x)\n elif method == 'spline':\n # GH #10633, #24014\n if isna(order) or (order <= 0):\n raise ValueError(\"order needs to be specified and greater than 0; \"\n \"got order: {}\".format(order))\n terp = interpolate.UnivariateSpline(x, y, k=order, **kwargs)\n new_y = terp(new_x)\n else:\n # GH 7295: need to be able to write for some reason\n # in some circumstances: check all three\n if not x.flags.writeable:\n x = x.copy()\n if not y.flags.writeable:\n y = y.copy()\n if not new_x.flags.writeable:\n new_x = new_x.copy()\n method = alt_methods[method]\n new_y = method(x, y, new_x, **kwargs)\n return new_y"} {"instruction": "Can you implement a function in Python 3 that\nconstructs a piecewise polynomial from the derivative of xi yi x.", "response": "def _from_derivatives(xi, yi, x, order=None, der=0, extrapolate=False):\n \"\"\"\n Convenience function for interpolate.BPoly.from_derivatives.\n\n Construct a piecewise polynomial in the Bernstein basis, compatible\n with the specified values and derivatives at breakpoints.\n\n Parameters\n ----------\n xi : array_like\n sorted 1D array of x-coordinates\n yi : array_like or list of array-likes\n yi[i][j] is the j-th derivative known at xi[i]\n order: None or int or array_like of ints. Default: None.\n Specifies the degree of local polynomials. If not None, some\n derivatives are ignored.\n der : int or list\n How many derivatives to extract; None for all potentially nonzero\n derivatives (that is a number equal to the number of points), or a\n list of derivatives to extract. 
This numberincludes the function\n value as 0th derivative.\n extrapolate : bool, optional\n Whether to extrapolate to ouf-of-bounds points based on first and last\n intervals, or to return NaNs. Default: True.\n\n See Also\n --------\n scipy.interpolate.BPoly.from_derivatives\n\n Returns\n -------\n y : scalar or array_like\n The result, of length R or length M or M by R.\n \"\"\"\n from scipy import interpolate\n\n # return the method for compat with scipy version & backwards compat\n method = interpolate.BPoly.from_derivatives\n m = method(xi, yi.reshape(-1, 1),\n orders=order, extrapolate=extrapolate)\n\n return m(x)"} {"instruction": "Write a Python 3 script for\nperforming an actual interpolation of values on a 2 - d array.", "response": "def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None,\n dtype=None):\n \"\"\"\n Perform an actual interpolation of values, values will be make 2-d if\n needed fills inplace, returns the result.\n \"\"\"\n\n transf = (lambda x: x) if axis == 0 else (lambda x: x.T)\n\n # reshape a 1 dim if needed\n ndim = values.ndim\n if values.ndim == 1:\n if axis != 0: # pragma: no cover\n raise AssertionError(\"cannot interpolate on a ndim == 1 with \"\n \"axis != 0\")\n values = values.reshape(tuple((1,) + values.shape))\n\n if fill_value is None:\n mask = None\n else: # todo create faster fill func without masking\n mask = mask_missing(transf(values), fill_value)\n\n method = clean_fill_method(method)\n if method == 'pad':\n values = transf(pad_2d(\n transf(values), limit=limit, mask=mask, dtype=dtype))\n else:\n values = transf(backfill_2d(\n transf(values), limit=limit, mask=mask, dtype=dtype))\n\n # reshape back\n if ndim == 1:\n values = values[0]\n\n return values"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _cast_values_for_fillna(values, dtype):\n # TODO: for int-dtypes we make a copy, but for everything else this\n # alters the values in-place. 
Is this intentional?\n\n if (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype) or\n is_timedelta64_dtype(dtype)):\n values = values.view(np.int64)\n\n elif is_integer_dtype(values):\n # NB: this check needs to come after the datetime64 check above\n values = ensure_float64(values)\n\n return values", "response": "Cast values to a dtype that algos. pad and algos. backfill can handle."} {"instruction": "Can you create a Python 3 function that\nfills zeros in the result array with the fill value.", "response": "def fill_zeros(result, x, y, name, fill):\n \"\"\"\n If this is a reversed op, then flip x,y\n\n If we have an integer value (or array in y)\n and we have 0's, fill them with the fill,\n return the result.\n\n Mask the nan's from x.\n \"\"\"\n if fill is None or is_float_dtype(result):\n return result\n\n if name.startswith(('r', '__r')):\n x, y = y, x\n\n is_variable_type = (hasattr(y, 'dtype') or hasattr(y, 'type'))\n is_scalar_type = is_scalar(y)\n\n if not is_variable_type and not is_scalar_type:\n return result\n\n if is_scalar_type:\n y = np.array(y)\n\n if is_integer_dtype(y):\n\n if (y == 0).any():\n\n # GH 7325, mask and nans must be broadcastable (also: PR 9308)\n # Raveling and then reshaping makes np.putmask faster\n mask = ((y == 0) & ~np.isnan(result)).ravel()\n\n shape = result.shape\n result = result.astype('float64', copy=False).ravel()\n\n np.putmask(result, mask, fill)\n\n # if we have a fill of inf, then sign it correctly\n # (GH 6178 and PR 9308)\n if np.isinf(fill):\n signs = y if name.startswith(('r', '__r')) else x\n signs = np.sign(signs.astype('float', copy=False))\n negative_inf_mask = (signs.ravel() < 0) & mask\n np.putmask(result, negative_inf_mask, -fill)\n\n if \"floordiv\" in name: # (PR 9308)\n nan_mask = ((y == 0) & (x == 0)).ravel()\n np.putmask(result, nan_mask, np.nan)\n\n result = result.reshape(shape)\n\n return result"} {"instruction": "Can you create a Python 3 function that\nmasks the result of division by 
zero.", "response": "def mask_zero_div_zero(x, y, result, copy=False):\n \"\"\"\n Set results of 0 / 0 or 0 // 0 to np.nan, regardless of the dtypes\n of the numerator or the denominator.\n\n Parameters\n ----------\n x : ndarray\n y : ndarray\n result : ndarray\n copy : bool (default False)\n Whether to always create a new array or try to fill in the existing\n array if possible.\n\n Returns\n -------\n filled_result : ndarray\n\n Examples\n --------\n >>> x = np.array([1, 0, -1], dtype=np.int64)\n >>> y = 0 # int 0; numpy behavior is different with float\n >>> result = x / y\n >>> result # raw numpy result does not fill division by zero\n array([0, 0, 0])\n >>> mask_zero_div_zero(x, y, result)\n array([ inf, nan, -inf])\n \"\"\"\n if is_scalar(y):\n y = np.array(y)\n\n zmask = y == 0\n if zmask.any():\n shape = result.shape\n\n nan_mask = (zmask & (x == 0)).ravel()\n neginf_mask = (zmask & (x < 0)).ravel()\n posinf_mask = (zmask & (x > 0)).ravel()\n\n if nan_mask.any() or neginf_mask.any() or posinf_mask.any():\n # Fill negative/0 with -inf, positive/0 with +inf, 0/0 with NaN\n result = result.astype('float64', copy=copy).ravel()\n\n np.putmask(result, nan_mask, np.nan)\n np.putmask(result, posinf_mask, np.inf)\n np.putmask(result, neginf_mask, -np.inf)\n\n result = result.reshape(shape)\n\n return result"} {"instruction": "Can you implement a function in Python 3 that\ndispatches missing values to ndarray.", "response": "def dispatch_missing(op, left, right, result):\n \"\"\"\n Fill nulls caused by division by zero, casting to a diffferent dtype\n if necessary.\n\n Parameters\n ----------\n op : function (operator.add, operator.div, ...)\n left : object (Index for non-reversed ops)\n right : object (Index fof reversed ops)\n result : ndarray\n\n Returns\n -------\n result : ndarray\n \"\"\"\n opstr = '__{opname}__'.format(opname=op.__name__).replace('____', '__')\n if op in [operator.truediv, operator.floordiv,\n getattr(operator, 'div', None)]:\n result = 
mask_zero_div_zero(left, right, result)\n elif op is operator.mod:\n result = fill_zeros(result, left, right, opstr, np.nan)\n elif op is divmod:\n res0 = mask_zero_div_zero(left, right, result[0])\n res1 = fill_zeros(result[1], left, right, opstr, np.nan)\n result = (res0, res1)\n return result"} {"instruction": "Explain what the following Python 3 code does\ndef _interp_limit(invalid, fw_limit, bw_limit):\n # handle forward first; the backward direction is the same except\n # 1. operate on the reversed array\n # 2. subtract the returned indices from N - 1\n N = len(invalid)\n f_idx = set()\n b_idx = set()\n\n def inner(invalid, limit):\n limit = min(limit, N)\n windowed = _rolling_window(invalid, limit + 1).all(1)\n idx = (set(np.where(windowed)[0] + limit) |\n set(np.where((~invalid[:limit + 1]).cumsum() == 0)[0]))\n return idx\n\n if fw_limit is not None:\n\n if fw_limit == 0:\n f_idx = set(np.where(invalid)[0])\n else:\n f_idx = inner(invalid, fw_limit)\n\n if bw_limit is not None:\n\n if bw_limit == 0:\n # then we don't even need to care about backwards\n # just use forwards\n return f_idx\n else:\n b_idx = list(inner(invalid[::-1], bw_limit))\n b_idx = set(N - 1 - np.asarray(b_idx))\n if fw_limit == 0:\n return b_idx\n\n return f_idx & b_idx", "response": "Returns a generator that yields the set of indexers that won t be filled in if they exceed the limits."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _rolling_window(a, window):\n # https://stackoverflow.com/a/6811241\n shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)\n strides = a.strides + (a.strides[-1],)\n return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)", "response": "Compute a rolling window of size window."} {"instruction": "Given the following Python 3 function, write the documentation\ndef get_console_size():\n from pandas import get_option\n\n display_width = get_option('display.width')\n # deprecated.\n display_height = 
get_option('display.max_rows')\n\n # Consider\n # interactive shell terminal, can detect term size\n # interactive non-shell terminal (ipnb/ipqtconsole), cannot detect term\n # size non-interactive script, should disregard term size\n\n # in addition\n # width,height have default values, but setting to 'None' signals\n # should use Auto-Detection, But only in interactive shell-terminal.\n # Simple. yeah.\n\n if in_interactive_session():\n if in_ipython_frontend():\n # sane defaults for interactive non-shell terminal\n # match default for width,height in config_init\n from pandas._config.config import get_default_val\n terminal_width = get_default_val('display.width')\n terminal_height = get_default_val('display.max_rows')\n else:\n # pure terminal\n terminal_width, terminal_height = get_terminal_size()\n else:\n terminal_width, terminal_height = None, None\n\n # Note if the User sets width/Height to None (auto-detection)\n # and we're in a script (non-inter), this will return (None,None)\n # caller needs to deal.\n return (display_width or terminal_width, display_height or terminal_height)", "response": "Return console size as tuple = width height."} {"instruction": "Make a summary of the following Python 3 code\ndef in_interactive_session():\n from pandas import get_option\n\n def check_main():\n try:\n import __main__ as main\n except ModuleNotFoundError:\n return get_option('mode.sim_interactive')\n return (not hasattr(main, '__file__') or\n get_option('mode.sim_interactive'))\n\n try:\n return __IPYTHON__ or check_main() # noqa\n except NameError:\n return check_main()", "response": "check if we re running in an interactive shell"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef recode_for_groupby(c, sort, observed):\n\n # we only care about observed values\n if observed:\n unique_codes = unique1d(c.codes)\n\n take_codes = unique_codes[unique_codes != -1]\n if c.ordered:\n take_codes = np.sort(take_codes)\n\n # we 
recode according to the uniques\n categories = c.categories.take(take_codes)\n codes = _recode_for_categories(c.codes,\n c.categories,\n categories)\n\n # return a new categorical that maps our new codes\n # and categories\n dtype = CategoricalDtype(categories, ordered=c.ordered)\n return Categorical(codes, dtype=dtype, fastpath=True), c\n\n # Already sorted according to c.categories; all is fine\n if sort:\n return c, None\n\n # sort=False should order groups in as-encountered order (GH-8868)\n cat = c.unique()\n\n # But for groupby to work, all categories should be present,\n # including those missing from the data (GH-13179), which .unique()\n # above dropped\n cat = cat.add_categories(\n c.categories[~c.categories.isin(cat.categories)])\n\n return c.reorder_categories(cat.categories), None", "response": "Recode the categories of a categorical object."} {"instruction": "Can you create a Python 3 function that\nreverses the codes_to_groupby to account for sort / observed. Parameters ---------- c : Categorical sort : boolean The value of the sort parameter groupby was called with. 
ci : CategoricalIndex The codes / categories to recode Returns ------- CategoricalIndex", "response": "def recode_from_groupby(c, sort, ci):\n \"\"\"\n Reverse the codes_to_groupby to account for sort / observed.\n\n Parameters\n ----------\n c : Categorical\n sort : boolean\n The value of the sort parameter groupby was called with.\n ci : CategoricalIndex\n The codes / categories to recode\n\n Returns\n -------\n CategoricalIndex\n \"\"\"\n\n # we re-order to the original category orderings\n if sort:\n return ci.set_categories(c.categories)\n\n # we are not sorting, so add unobserved to the end\n return ci.add_categories(\n c.categories[~c.categories.isin(ci.categories)])"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef to_parquet(df, path, engine='auto', compression='snappy', index=None,\n partition_cols=None, **kwargs):\n \"\"\"\n Write a DataFrame to the parquet format.\n\n Parameters\n ----------\n path : str\n File path or Root Directory path. Will be used as Root Directory path\n while writing a partitioned dataset.\n\n .. versionchanged:: 0.24.0\n\n engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'\n Parquet library to use. If 'auto', then the option\n ``io.parquet.engine`` is used. The default ``io.parquet.engine``\n behavior is to try 'pyarrow', falling back to 'fastparquet' if\n 'pyarrow' is unavailable.\n compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'\n Name of the compression to use. Use ``None`` for no compression.\n index : bool, default None\n If ``True``, include the dataframe's index(es) in the file output. If\n ``False``, they will not be written to the file. If ``None``, the\n engine's default behavior will be used.\n\n .. versionadded 0.24.0\n\n partition_cols : list, optional, default None\n Column names by which to partition the dataset\n Columns are partitioned in the order they are given\n\n .. 
versionadded:: 0.24.0\n\n kwargs\n Additional keyword arguments passed to the engine\n \"\"\"\n impl = get_engine(engine)\n return impl.write(df, path, compression=compression, index=index,\n partition_cols=partition_cols, **kwargs)", "response": "Write a DataFrame to the Parquet format."} {"instruction": "Given the following Python 3 function, write the documentation\ndef read_parquet(path, engine='auto', columns=None, **kwargs):\n\n impl = get_engine(engine)\n return impl.read(path, columns=columns, **kwargs)", "response": "Load a DataFrame from a Parquet file."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef generate_bins_generic(values, binner, closed):\n lenidx = len(values)\n lenbin = len(binner)\n\n if lenidx <= 0 or lenbin <= 0:\n raise ValueError(\"Invalid length for values or for binner\")\n\n # check binner fits data\n if values[0] < binner[0]:\n raise ValueError(\"Values falls before first bin\")\n\n if values[lenidx - 1] > binner[lenbin - 1]:\n raise ValueError(\"Values falls after last bin\")\n\n bins = np.empty(lenbin - 1, dtype=np.int64)\n\n j = 0 # index into values\n bc = 0 # bin count\n\n # linear scan, presume nothing about values/binner except that it fits ok\n for i in range(0, lenbin - 1):\n r_bin = binner[i + 1]\n\n # count values in current bin, advance to next bin\n while j < lenidx and (values[j] < r_bin or\n (closed == 'right' and values[j] == r_bin)):\n j += 1\n\n bins[bc] = j\n bc += 1\n\n return bins", "response": "Generate bin edge offsets and bin labels for one array using another array of values."} {"instruction": "Explain what the following Python 3 code does\ndef get_iterator(self, data, axis=0):\n splitter = self._get_splitter(data, axis=axis)\n keys = self._get_group_keys()\n for key, (i, group) in zip(keys, splitter):\n yield key, group", "response": "Groupby iterator returning tuples of names subsetted objects"} {"instruction": "How would you implement a function in Python 3 
that\nreturns a DataFrame of the groups of the items in the groupings.", "response": "def groups(self):\n \"\"\" dict {group name -> group labels} \"\"\"\n if len(self.groupings) == 1:\n return self.groupings[0].groups\n else:\n to_groupby = lzip(*(ping.grouper for ping in self.groupings))\n to_groupby = Index(to_groupby)\n return self.axis.groupby(to_groupby)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef groups(self):\n\n # this is mainly for compat\n # GH 3881\n result = {key: value for key, value in zip(self.binlabels, self.bins)\n if key is not NaT}\n return result", "response": "dict of group name - > group labels"} {"instruction": "Make a summary of the following Python 3 code\ndef get_iterator(self, data, axis=0):\n if isinstance(data, NDFrame):\n slicer = lambda start, edge: data._slice(\n slice(start, edge), axis=axis)\n length = len(data.axes[axis])\n else:\n slicer = lambda start, edge: data[slice(start, edge)]\n length = len(data)\n\n start = 0\n for edge, label in zip(self.bins, self.binlabels):\n if label is not NaT:\n yield label, slicer(start, edge)\n start = edge\n\n if start < length:\n yield self.binlabels[-1], slicer(start, None)", "response": "Returns an iterator over the set of items in the specified axis."} {"instruction": "Create a Python 3 function to\nreturn a DataFrame with all the non - empty fields in the data in the new table.", "response": "def json_normalize(data, record_path=None, meta=None,\n meta_prefix=None,\n record_prefix=None,\n errors='raise',\n sep='.'):\n \"\"\"\n Normalize semi-structured JSON data into a flat table.\n\n Parameters\n ----------\n data : dict or list of dicts\n Unserialized JSON objects\n record_path : string or list of strings, default None\n Path in each object to list of records. 
If not passed, data will be\n assumed to be an array of records\n meta : list of paths (string or list of strings), default None\n Fields to use as metadata for each record in resulting table\n meta_prefix : string, default None\n record_prefix : string, default None\n If True, prefix records with dotted (?) path, e.g. foo.bar.field if\n path to records is ['foo', 'bar']\n errors : {'raise', 'ignore'}, default 'raise'\n\n * 'ignore' : will ignore KeyError if keys listed in meta are not\n always present\n * 'raise' : will raise KeyError if keys listed in meta are not\n always present\n\n .. versionadded:: 0.20.0\n\n sep : string, default '.'\n Nested records will generate names separated by sep,\n e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar\n\n .. versionadded:: 0.20.0\n\n Returns\n -------\n frame : DataFrame\n\n Examples\n --------\n\n >>> from pandas.io.json import json_normalize\n >>> data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},\n ... {'name': {'given': 'Mose', 'family': 'Regner'}},\n ... {'id': 2, 'name': 'Faye Raker'}]\n >>> json_normalize(data)\n id name name.family name.first name.given name.last\n 0 1.0 NaN NaN Coleen NaN Volk\n 1 NaN NaN Regner NaN Mose NaN\n 2 2.0 Faye Raker NaN NaN NaN NaN\n\n >>> data = [{'state': 'Florida',\n ... 'shortname': 'FL',\n ... 'info': {\n ... 'governor': 'Rick Scott'\n ... },\n ... 'counties': [{'name': 'Dade', 'population': 12345},\n ... {'name': 'Broward', 'population': 40000},\n ... {'name': 'Palm Beach', 'population': 60000}]},\n ... {'state': 'Ohio',\n ... 'shortname': 'OH',\n ... 'info': {\n ... 'governor': 'John Kasich'\n ... },\n ... 'counties': [{'name': 'Summit', 'population': 1234},\n ... {'name': 'Cuyahoga', 'population': 1337}]}]\n >>> result = json_normalize(data, 'counties', ['state', 'shortname',\n ... 
['info', 'governor']])\n >>> result\n name population info.governor state shortname\n 0 Dade 12345 Rick Scott Florida FL\n 1 Broward 40000 Rick Scott Florida FL\n 2 Palm Beach 60000 Rick Scott Florida FL\n 3 Summit 1234 John Kasich Ohio OH\n 4 Cuyahoga 1337 John Kasich Ohio OH\n\n >>> data = {'A': [1, 2]}\n >>> json_normalize(data, 'A', record_prefix='Prefix.')\n Prefix.0\n 0 1\n 1 2\n \"\"\"\n def _pull_field(js, spec):\n result = js\n if isinstance(spec, list):\n for field in spec:\n result = result[field]\n else:\n result = result[spec]\n\n return result\n\n if isinstance(data, list) and not data:\n return DataFrame()\n\n # A bit of a hackjob\n if isinstance(data, dict):\n data = [data]\n\n if record_path is None:\n if any([isinstance(x, dict) for x in y.values()] for y in data):\n # naive normalization, this is idempotent for flat records\n # and potentially will inflate the data considerably for\n # deeply nested structures:\n # {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@}\n #\n # TODO: handle record value which are lists, at least error\n # reasonably\n data = nested_to_record(data, sep=sep)\n return DataFrame(data)\n elif not isinstance(record_path, list):\n record_path = [record_path]\n\n if meta is None:\n meta = []\n elif not isinstance(meta, list):\n meta = [meta]\n\n meta = [m if isinstance(m, list) else [m] for m in meta]\n\n # Disastrously inefficient for now\n records = []\n lengths = []\n\n meta_vals = defaultdict(list)\n if not isinstance(sep, str):\n sep = str(sep)\n meta_keys = [sep.join(val) for val in meta]\n\n def _recursive_extract(data, path, seen_meta, level=0):\n if isinstance(data, dict):\n data = [data]\n if len(path) > 1:\n for obj in data:\n for val, key in zip(meta, meta_keys):\n if level + 1 == len(val):\n seen_meta[key] = _pull_field(obj, val[-1])\n\n _recursive_extract(obj[path[0]], path[1:],\n seen_meta, level=level + 1)\n else:\n for obj in data:\n recs = _pull_field(obj, path[0])\n\n # For repeating the metadata 
later\n lengths.append(len(recs))\n\n for val, key in zip(meta, meta_keys):\n if level + 1 > len(val):\n meta_val = seen_meta[key]\n else:\n try:\n meta_val = _pull_field(obj, val[level:])\n except KeyError as e:\n if errors == 'ignore':\n meta_val = np.nan\n else:\n raise KeyError(\"Try running with \"\n \"errors='ignore' as key \"\n \"{err} is not always present\"\n .format(err=e))\n meta_vals[key].append(meta_val)\n\n records.extend(recs)\n\n _recursive_extract(data, record_path, {}, level=0)\n\n result = DataFrame(records)\n\n if record_prefix is not None:\n result = result.rename(\n columns=lambda x: \"{p}{c}\".format(p=record_prefix, c=x))\n\n # Data types, a problem\n for k, v in meta_vals.items():\n if meta_prefix is not None:\n k = meta_prefix + k\n\n if k in result:\n raise ValueError('Conflicting metadata name {name}, '\n 'need distinguishing prefix '.format(name=k))\n\n # forcing dtype to object to avoid the metadata being casted to string\n result[k] = np.array(v, dtype=object).repeat(lengths)\n\n return result"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _get_indices(self, names):\n\n def get_converter(s):\n # possibly convert to the actual key types\n # in the indices, could be a Timestamp or a np.datetime64\n if isinstance(s, (Timestamp, datetime.datetime)):\n return lambda key: Timestamp(key)\n elif isinstance(s, np.datetime64):\n return lambda key: Timestamp(key).asm8\n else:\n return lambda key: key\n\n if len(names) == 0:\n return []\n\n if len(self.indices) > 0:\n index_sample = next(iter(self.indices))\n else:\n index_sample = None # Dummy sample\n\n name_sample = names[0]\n if isinstance(index_sample, tuple):\n if not isinstance(name_sample, tuple):\n msg = (\"must supply a tuple to get_group with multiple\"\n \" grouping keys\")\n raise ValueError(msg)\n if not len(name_sample) == len(index_sample):\n try:\n # If the original grouper was a tuple\n return [self.indices[name] for name in 
names]\n except KeyError:\n # turns out it wasn't a tuple\n msg = (\"must supply a same-length tuple to get_group\"\n \" with multiple grouping keys\")\n raise ValueError(msg)\n\n converters = [get_converter(s) for s in index_sample]\n names = (tuple(f(n) for f, n in zip(converters, name))\n for name in names)\n\n else:\n converter = get_converter(index_sample)\n names = (converter(name) for name in names)\n\n return [self.indices.get(name, []) for name in names]", "response": "Internal method to get multiple indices for a given set of names."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _set_group_selection(self):\n grp = self.grouper\n if not (self.as_index and\n getattr(grp, 'groupings', None) is not None and\n self.obj.ndim > 1 and\n self._group_selection is None):\n return\n\n ax = self.obj._info_axis\n groupers = [g.name for g in grp.groupings\n if g.level is None and g.in_axis]\n\n if len(groupers):\n # GH12839 clear selected obj cache when group selection changes\n self._group_selection = ax.difference(Index(groupers),\n sort=False).tolist()\n self._reset_cache('_selected_obj')", "response": "Create group based selection."} {"instruction": "Implement a Python 3 function for\nconstructing a DataFrame from group with provided name.", "response": "def get_group(self, name, obj=None):\n \"\"\"\n Construct NDFrame from group with provided name.\n\n Parameters\n ----------\n name : object\n the name of the group to get as a DataFrame\n obj : NDFrame, default None\n the NDFrame to take the DataFrame out of. 
If\n it is None, the object groupby was called on will\n be used\n\n Returns\n -------\n group : same type as obj\n \"\"\"\n if obj is None:\n obj = self._selected_obj\n\n inds = self._get_index(name)\n if not len(inds):\n raise KeyError(name)\n\n return obj._take(inds, axis=self.axis)"} {"instruction": "Can you write a function in Python 3 where it\nreturns the cumulative count of the entries in the grouper.", "response": "def _cumcount_array(self, ascending=True):\n \"\"\"\n Parameters\n ----------\n ascending : bool, default True\n If False, number in reverse, from length of group - 1 to 0.\n\n Notes\n -----\n this is currently implementing sort=False\n (though the default is sort=True) for groupby in general\n \"\"\"\n ids, _, ngroups = self.grouper.group_info\n sorter = get_group_index_sorter(ids, ngroups)\n ids, count = ids[sorter], len(ids)\n\n if count == 0:\n return np.empty(0, dtype=np.int64)\n\n run = np.r_[True, ids[:-1] != ids[1:]]\n rep = np.diff(np.r_[np.nonzero(run)[0], count])\n out = (~run).cumsum()\n\n if ascending:\n out -= np.repeat(out[run], rep)\n else:\n out = np.repeat(out[np.r_[run[1:], True]], rep) - out\n\n rev = np.empty(count, dtype=np.intp)\n rev[sorter] = np.arange(count, dtype=np.intp)\n return out[rev].astype(np.int64, copy=False)"} {"instruction": "How would you explain what the following Python 3 function does\ndef _try_cast(self, result, obj, numeric_only=False):\n if obj.ndim > 1:\n dtype = obj._values.dtype\n else:\n dtype = obj.dtype\n\n if not is_scalar(result):\n if is_datetime64tz_dtype(dtype):\n # GH 23683\n # Prior results _may_ have been generated in UTC.\n # Ensure we localize to UTC first before converting\n # to the target timezone\n try:\n result = obj._values._from_sequence(\n result, dtype='datetime64[ns, UTC]'\n )\n result = result.astype(dtype)\n except TypeError:\n # _try_cast was called at a point where the result\n # was already tz-aware\n pass\n elif is_extension_array_dtype(dtype):\n # The function can 
return something of any type, so check\n # if the type is compatible with the calling EA.\n try:\n result = obj._values._from_sequence(result, dtype=dtype)\n except Exception:\n # https://github.com/pandas-dev/pandas/issues/22850\n # pandas has no control over what 3rd-party ExtensionArrays\n # do in _values_from_sequence. We still want ops to work\n # though, so we catch any regular Exception.\n pass\n elif numeric_only and is_numeric_dtype(dtype) or not numeric_only:\n result = maybe_downcast_to_dtype(result, dtype)\n\n return result", "response": "Try to cast the result to our obj original type."} {"instruction": "Explain what the following Python 3 code does\ndef _transform_should_cast(self, func_nm):\n return (self.size().fillna(0) > 0).any() and (\n func_nm not in base.cython_cast_blacklist)", "response": "Returns True if the transform should cast the result of aggregation\n ."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _bool_agg(self, val_test, skipna):\n\n def objs_to_bool(vals: np.ndarray) -> Tuple[np.ndarray, Type]:\n if is_object_dtype(vals):\n vals = np.array([bool(x) for x in vals])\n else:\n vals = vals.astype(np.bool)\n\n return vals.view(np.uint8), np.bool\n\n def result_to_bool(result: np.ndarray, inference: Type) -> np.ndarray:\n return result.astype(inference, copy=False)\n\n return self._get_cythonized_result('group_any_all', self.grouper,\n aggregate=True,\n cython_dtype=np.uint8,\n needs_values=True,\n needs_mask=True,\n pre_processing=objs_to_bool,\n post_processing=result_to_bool,\n val_test=val_test, skipna=skipna)", "response": "Shared func to call any / all Cython GroupBy implementations."} {"instruction": "Create a Python 3 function to\ncompute the mean of the elements of the current DataFrame.", "response": "def mean(self, *args, **kwargs):\n \"\"\"\n Compute mean of groups, excluding missing values.\n\n Returns\n -------\n pandas.Series or pandas.DataFrame\n %(see_also)s\n Examples\n 
--------\n >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],\n ... 'B': [np.nan, 2, 3, 4, 5],\n ... 'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C'])\n\n Groupby one column and return the mean of the remaining columns in\n each group.\n\n >>> df.groupby('A').mean()\n >>>\n B C\n A\n 1 3.0 1.333333\n 2 4.0 1.500000\n\n Groupby two columns and return the mean of the remaining column.\n\n >>> df.groupby(['A', 'B']).mean()\n >>>\n C\n A B\n 1 2.0 2\n 4.0 1\n 2 3.0 1\n 5.0 2\n\n Groupby one column and return the mean of only particular column in\n the group.\n\n >>> df.groupby('A')['B'].mean()\n >>>\n A\n 1 3.0\n 2 4.0\n Name: B, dtype: float64\n \"\"\"\n nv.validate_groupby_func('mean', args, kwargs, ['numeric_only'])\n try:\n return self._cython_agg_general('mean', **kwargs)\n except GroupByError:\n raise\n except Exception: # pragma: no cover\n with _group_selection_context(self):\n f = lambda x: x.mean(axis=self.axis, **kwargs)\n return self._python_agg_general(f)"} {"instruction": "Explain what the following Python 3 code does\ndef median(self, **kwargs):\n try:\n return self._cython_agg_general('median', **kwargs)\n except GroupByError:\n raise\n except Exception: # pragma: no cover\n\n def f(x):\n if isinstance(x, np.ndarray):\n x = Series(x)\n return x.median(axis=self.axis, **kwargs)\n with _group_selection_context(self):\n return self._python_agg_general(f)", "response": "Compute the median of the elements in the current DataFrame."} {"instruction": "How would you code a function in Python 3 to\ncompute the standard deviation of the entries in the logarithm of the logarithm of the values in the logarithm of the values in the logarithm of the values in the logarithm of the values in the logarithm of the logarithm of the values in the logarithm of the logarithm of the logarithm.", "response": "def std(self, ddof=1, *args, **kwargs):\n \"\"\"\n Compute standard deviation of groups, excluding missing values.\n\n For multiple groupings, the result index will be a 
MultiIndex.\n\n Parameters\n ----------\n ddof : integer, default 1\n degrees of freedom\n \"\"\"\n\n # TODO: implement at Cython level?\n nv.validate_groupby_func('std', args, kwargs)\n return np.sqrt(self.var(ddof=ddof, **kwargs))"} {"instruction": "Make a summary of the following Python 3 code\ndef var(self, ddof=1, *args, **kwargs):\n nv.validate_groupby_func('var', args, kwargs)\n if ddof == 1:\n try:\n return self._cython_agg_general('var', **kwargs)\n except Exception:\n f = lambda x: x.var(ddof=ddof, **kwargs)\n with _group_selection_context(self):\n return self._python_agg_general(f)\n else:\n f = lambda x: x.var(ddof=ddof, **kwargs)\n with _group_selection_context(self):\n return self._python_agg_general(f)", "response": "Compute variance of groups excluding missing values."} {"instruction": "Write a Python 3 script for\ncomputing the standard error of the mean of groups excluding missing values.", "response": "def sem(self, ddof=1):\n \"\"\"\n Compute standard error of the mean of groups, excluding missing values.\n\n For multiple groupings, the result index will be a MultiIndex.\n\n Parameters\n ----------\n ddof : integer, default 1\n degrees of freedom\n \"\"\"\n\n return self.std(ddof=ddof) / np.sqrt(self.count())"} {"instruction": "Write a Python 3 function that can\nadd numeric operations to the GroupBy generically.", "response": "def _add_numeric_operations(cls):\n \"\"\"\n Add numeric operations to the GroupBy generically.\n \"\"\"\n\n def groupby_function(name, alias, npfunc,\n numeric_only=True, _convert=False,\n min_count=-1):\n\n _local_template = \"Compute %(f)s of group values\"\n\n @Substitution(name='groupby', f=name)\n @Appender(_common_see_also)\n @Appender(_local_template)\n def f(self, **kwargs):\n if 'numeric_only' not in kwargs:\n kwargs['numeric_only'] = numeric_only\n if 'min_count' not in kwargs:\n kwargs['min_count'] = min_count\n\n self._set_group_selection()\n try:\n return self._cython_agg_general(\n alias, alt=npfunc, 
**kwargs)\n except AssertionError as e:\n raise SpecificationError(str(e))\n except Exception:\n result = self.aggregate(\n lambda x: npfunc(x, axis=self.axis))\n if _convert:\n result = result._convert(datetime=True)\n return result\n\n set_function_name(f, name, cls)\n\n return f\n\n def first_compat(x, axis=0):\n\n def first(x):\n x = x.to_numpy()\n\n x = x[notna(x)]\n if len(x) == 0:\n return np.nan\n return x[0]\n\n if isinstance(x, DataFrame):\n return x.apply(first, axis=axis)\n else:\n return first(x)\n\n def last_compat(x, axis=0):\n\n def last(x):\n x = x.to_numpy()\n x = x[notna(x)]\n if len(x) == 0:\n return np.nan\n return x[-1]\n\n if isinstance(x, DataFrame):\n return x.apply(last, axis=axis)\n else:\n return last(x)\n\n cls.sum = groupby_function('sum', 'add', np.sum, min_count=0)\n cls.prod = groupby_function('prod', 'prod', np.prod, min_count=0)\n cls.min = groupby_function('min', 'min', np.min, numeric_only=False)\n cls.max = groupby_function('max', 'max', np.max, numeric_only=False)\n cls.first = groupby_function('first', 'first', first_compat,\n numeric_only=False)\n cls.last = groupby_function('last', 'last', last_compat,\n numeric_only=False)"} {"instruction": "Make a summary of the following Python 3 code\ndef resample(self, rule, *args, **kwargs):\n from pandas.core.resample import get_resampler_for_grouping\n return get_resampler_for_grouping(self, rule, *args, **kwargs)", "response": "Resample the time series of a specific time - grouper."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef rolling(self, *args, **kwargs):\n from pandas.core.window import RollingGroupby\n return RollingGroupby(self, *args, **kwargs)", "response": "Return a rolling grouper providing rolling functionality per group."} {"instruction": "How would you explain what the following Python 3 function does\ndef expanding(self, *args, **kwargs):\n from pandas.core.window import ExpandingGroupby\n return ExpandingGroupby(self, 
*args, **kwargs)", "response": "Return an expanding grouper providing expanding\n functionality per group."} {"instruction": "Make a summary of the following Python 3 code\ndef _fill(self, direction, limit=None):\n # Need int value for Cython\n if limit is None:\n limit = -1\n\n return self._get_cythonized_result('group_fillna_indexer',\n self.grouper, needs_mask=True,\n cython_dtype=np.int64,\n result_is_index=True,\n direction=direction, limit=limit)", "response": "Returns a Series or DataFrame with filled values for the specified order."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef nth(self, n, dropna=None):\n\n if isinstance(n, int):\n nth_values = [n]\n elif isinstance(n, (set, list, tuple)):\n nth_values = list(set(n))\n if dropna is not None:\n raise ValueError(\n \"dropna option with a list of nth values is not supported\")\n else:\n raise TypeError(\"n needs to be an int or a list/set/tuple of ints\")\n\n nth_values = np.array(nth_values, dtype=np.intp)\n self._set_group_selection()\n\n if not dropna:\n mask_left = np.in1d(self._cumcount_array(), nth_values)\n mask_right = np.in1d(self._cumcount_array(ascending=False) + 1,\n -nth_values)\n mask = mask_left | mask_right\n\n out = self._selected_obj[mask]\n if not self.as_index:\n return out\n\n ids, _, _ = self.grouper.group_info\n out.index = self.grouper.result_index[ids[mask]]\n\n return out.sort_index() if self.sort else out\n\n if dropna not in ['any', 'all']:\n if isinstance(self._selected_obj, Series) and dropna is True:\n warnings.warn(\"the dropna={dropna} keyword is deprecated,\"\n \"use dropna='all' instead. 
\"\n \"For a Series groupby, dropna must be \"\n \"either None, 'any' or 'all'.\".format(\n dropna=dropna),\n FutureWarning,\n stacklevel=2)\n dropna = 'all'\n else:\n # Note: when agg-ing picker doesn't raise this,\n # just returns NaN\n raise ValueError(\"For a DataFrame groupby, dropna must be \"\n \"either None, 'any' or 'all', \"\n \"(was passed {dropna}).\".format(\n dropna=dropna))\n\n # old behaviour, but with all and any support for DataFrames.\n # modified in GH 7559 to have better perf\n max_len = n if n >= 0 else - 1 - n\n dropped = self.obj.dropna(how=dropna, axis=self.axis)\n\n # get a new grouper for our dropped obj\n if self.keys is None and self.level is None:\n\n # we don't have the grouper info available\n # (e.g. we have selected out\n # a column that is not in the current object)\n axis = self.grouper.axis\n grouper = axis[axis.isin(dropped.index)]\n\n else:\n\n # create a grouper with the original parameters, but on the dropped\n # object\n from pandas.core.groupby.grouper import _get_grouper\n grouper, _, _ = _get_grouper(dropped, key=self.keys,\n axis=self.axis, level=self.level,\n sort=self.sort,\n mutated=self.mutated)\n\n grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort)\n sizes, result = grb.size(), grb.nth(n)\n mask = (sizes < max_len).values\n\n # set the results which don't meet the criteria\n if len(result) and mask.any():\n result.loc[mask] = np.nan\n\n # reset/reindex to the original groups\n if (len(self.obj) == len(dropped) or\n len(result) == len(self.grouper.result_index)):\n result.index = self.grouper.result_index\n else:\n result = result.reindex(self.grouper.result_index)\n\n return result", "response": "Returns the nth value of the n - th entry in the record set."} {"instruction": "Can you create a Python 3 function that\nreturns the values at the given quantile.", "response": "def quantile(self, q=0.5, interpolation='linear'):\n \"\"\"\n Return group values at the given quantile, a la 
numpy.percentile.\n\n Parameters\n ----------\n q : float or array-like, default 0.5 (50% quantile)\n Value(s) between 0 and 1 providing the quantile(s) to compute.\n interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}\n Method to use when the desired quantile falls between two points.\n\n Returns\n -------\n Series or DataFrame\n Return type determined by caller of GroupBy object.\n\n See Also\n --------\n Series.quantile : Similar method for Series.\n DataFrame.quantile : Similar method for DataFrame.\n numpy.percentile : NumPy method to compute qth percentile.\n\n Examples\n --------\n >>> df = pd.DataFrame([\n ... ['a', 1], ['a', 2], ['a', 3],\n ... ['b', 1], ['b', 3], ['b', 5]\n ... ], columns=['key', 'val'])\n >>> df.groupby('key').quantile()\n val\n key\n a 2.0\n b 3.0\n \"\"\"\n\n def pre_processor(\n vals: np.ndarray\n ) -> Tuple[np.ndarray, Optional[Type]]:\n if is_object_dtype(vals):\n raise TypeError(\"'quantile' cannot be performed against \"\n \"'object' dtypes!\")\n\n inference = None\n if is_integer_dtype(vals):\n inference = np.int64\n elif is_datetime64_dtype(vals):\n inference = 'datetime64[ns]'\n vals = vals.astype(np.float)\n\n return vals, inference\n\n def post_processor(\n vals: np.ndarray,\n inference: Optional[Type]\n ) -> np.ndarray:\n if inference:\n # Check for edge case\n if not (is_integer_dtype(inference) and\n interpolation in {'linear', 'midpoint'}):\n vals = vals.astype(inference)\n\n return vals\n\n return self._get_cythonized_result('group_quantile', self.grouper,\n aggregate=True,\n needs_values=True,\n needs_mask=True,\n cython_dtype=np.float64,\n pre_processing=pre_processor,\n post_processing=post_processor,\n q=q, interpolation=interpolation)"} {"instruction": "Write a Python 3 script to\nreturn the number of rows in each group of the entry in the grouper.", "response": "def ngroup(self, ascending=True):\n \"\"\"\n Number each group from 0 to the number of groups - 1.\n\n This is the enumerative complement 
of cumcount. Note that the\n numbers given to the groups match the order in which the groups\n would be seen when iterating over the groupby object, not the\n order they are first observed.\n\n .. versionadded:: 0.20.2\n\n Parameters\n ----------\n ascending : bool, default True\n If False, number in reverse, from number of group - 1 to 0.\n\n See Also\n --------\n .cumcount : Number the rows in each group.\n\n Examples\n --------\n\n >>> df = pd.DataFrame({\"A\": list(\"aaabba\")})\n >>> df\n A\n 0 a\n 1 a\n 2 a\n 3 b\n 4 b\n 5 a\n >>> df.groupby('A').ngroup()\n 0 0\n 1 0\n 2 0\n 3 1\n 4 1\n 5 0\n dtype: int64\n >>> df.groupby('A').ngroup(ascending=False)\n 0 1\n 1 1\n 2 1\n 3 0\n 4 0\n 5 1\n dtype: int64\n >>> df.groupby([\"A\", [1,1,2,3,2,1]]).ngroup()\n 0 0\n 1 0\n 2 1\n 3 3\n 4 2\n 5 0\n dtype: int64\n \"\"\"\n\n with _group_selection_context(self):\n index = self._selected_obj.index\n result = Series(self.grouper.group_info[0], index)\n if not ascending:\n result = self.ngroups - 1 - result\n return result"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef cumcount(self, ascending=True):\n\n with _group_selection_context(self):\n index = self._selected_obj.index\n cumcounts = self._cumcount_array(ascending=ascending)\n return Series(cumcounts, index)", "response": "Return a Series containing the cumulative count of each item in each group."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef rank(self, method='average', ascending=True, na_option='keep',\n pct=False, axis=0):\n \"\"\"\n Provide the rank of values within each group.\n\n Parameters\n ----------\n method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'\n * average: average rank of group\n * min: lowest rank in group\n * max: highest rank in group\n * first: ranks assigned in order they appear in the array\n * dense: like 'min', but rank always increases by 1 between groups\n ascending : boolean, default True\n 
False for ranks by high (1) to low (N)\n na_option : {'keep', 'top', 'bottom'}, default 'keep'\n * keep: leave NA values where they are\n * top: smallest rank if ascending\n * bottom: smallest rank if descending\n pct : boolean, default False\n Compute percentage rank of data within each group\n axis : int, default 0\n The axis of the object over which to compute the rank.\n\n Returns\n -------\n DataFrame with ranking of values within each group\n \"\"\"\n if na_option not in {'keep', 'top', 'bottom'}:\n msg = \"na_option must be one of 'keep', 'top', or 'bottom'\"\n raise ValueError(msg)\n return self._cython_transform('rank', numeric_only=False,\n ties_method=method, ascending=ascending,\n na_option=na_option, pct=pct, axis=axis)", "response": "Return a DataFrame with the rank of the values within each group."} {"instruction": "How would you explain what the following Python 3 function does\ndef _get_cythonized_result(self, how, grouper, aggregate=False,\n cython_dtype=None, needs_values=False,\n needs_mask=False, needs_ngroups=False,\n result_is_index=False,\n pre_processing=None, post_processing=None,\n **kwargs):\n \"\"\"\n Get result for Cythonized functions.\n\n Parameters\n ----------\n how : str, Cythonized function name to be called\n grouper : Grouper object containing pertinent group info\n aggregate : bool, default False\n Whether the result should be aggregated to match the number of\n groups\n cython_dtype : default None\n Type of the array that will be modified by the Cython call. 
If\n `None`, the type will be inferred from the values of each slice\n needs_values : bool, default False\n Whether the values should be a part of the Cython call\n signature\n needs_mask : bool, default False\n Whether boolean mask needs to be part of the Cython call\n signature\n needs_ngroups : bool, default False\n Whether number of groups is part of the Cython call signature\n result_is_index : bool, default False\n Whether the result of the Cython operation is an index of\n values to be retrieved, instead of the actual values themselves\n pre_processing : function, default None\n Function to be applied to `values` prior to passing to Cython.\n Function should return a tuple where the first element is the\n values to be passed to Cython and the second element is an optional\n type which the values should be converted to after being returned\n by the Cython operation. Raises if `needs_values` is False.\n post_processing : function, default None\n Function to be applied to result of Cython function. Should accept\n an array of values as the first argument and type inferences as its\n second argument, i.e. 
the signature should be\n (ndarray, Type).\n **kwargs : dict\n Extra arguments to be passed back to Cython funcs\n\n Returns\n -------\n `Series` or `DataFrame` with filled values\n \"\"\"\n if result_is_index and aggregate:\n raise ValueError(\"'result_is_index' and 'aggregate' cannot both \"\n \"be True!\")\n if post_processing:\n if not callable(pre_processing):\n raise ValueError(\"'post_processing' must be a callable!\")\n if pre_processing:\n if not callable(pre_processing):\n raise ValueError(\"'pre_processing' must be a callable!\")\n if not needs_values:\n raise ValueError(\"Cannot use 'pre_processing' without \"\n \"specifying 'needs_values'!\")\n\n labels, _, ngroups = grouper.group_info\n output = collections.OrderedDict()\n base_func = getattr(libgroupby, how)\n\n for name, obj in self._iterate_slices():\n if aggregate:\n result_sz = ngroups\n else:\n result_sz = len(obj.values)\n\n if not cython_dtype:\n cython_dtype = obj.values.dtype\n\n result = np.zeros(result_sz, dtype=cython_dtype)\n func = partial(base_func, result, labels)\n inferences = None\n\n if needs_values:\n vals = obj.values\n if pre_processing:\n vals, inferences = pre_processing(vals)\n func = partial(func, vals)\n\n if needs_mask:\n mask = isna(obj.values).view(np.uint8)\n func = partial(func, mask)\n\n if needs_ngroups:\n func = partial(func, ngroups)\n\n func(**kwargs) # Call func to modify indexer values in place\n\n if result_is_index:\n result = algorithms.take_nd(obj.values, result)\n\n if post_processing:\n result = post_processing(result, inferences)\n\n output[name] = result\n\n if aggregate:\n return self._wrap_aggregated_output(output)\n else:\n return self._wrap_transformed_output(output)", "response": "Returns a Cythonized result for the given function name and grouper."} {"instruction": "Create a Python 3 function to\nshift each group by periods observations.", "response": "def shift(self, periods=1, freq=None, axis=0, fill_value=None):\n \"\"\"\n Shift each group by 
periods observations.\n\n Parameters\n ----------\n periods : integer, default 1\n number of periods to shift\n freq : frequency string\n axis : axis to shift, default 0\n fill_value : optional\n\n .. versionadded:: 0.24.0\n \"\"\"\n\n if freq is not None or axis != 0 or not isna(fill_value):\n return self.apply(lambda x: x.shift(periods, freq,\n axis, fill_value))\n\n return self._get_cythonized_result('group_shift_indexer',\n self.grouper, cython_dtype=np.int64,\n needs_ngroups=True,\n result_is_index=True,\n periods=periods)"} {"instruction": "How would you explain what the following Python 3 function does\ndef head(self, n=5):\n self._reset_group_selection()\n mask = self._cumcount_array() < n\n return self._selected_obj[mask]", "response": "Return first n rows of each group."} {"instruction": "Given the following Python 3 function, write the documentation\ndef tail(self, n=5):\n self._reset_group_selection()\n mask = self._cumcount_array(ascending=False) < n\n return self._selected_obj[mask]", "response": "Return last n rows of each group."} {"instruction": "How would you explain what the following Python 3 function does\ndef next_monday(dt):\n if dt.weekday() == 5:\n return dt + timedelta(2)\n elif dt.weekday() == 6:\n return dt + timedelta(1)\n return dt", "response": "Returns next Monday in a given date"} {"instruction": "Explain what the following Python 3 code does\ndef next_monday_or_tuesday(dt):\n dow = dt.weekday()\n if dow == 5 or dow == 6:\n return dt + timedelta(2)\n elif dow == 0:\n return dt + timedelta(1)\n return dt", "response": "Returns the next Monday or Tuesday of a given date."} {"instruction": "Write a Python 3 script to\nreturn the previous Friday.", "response": "def previous_friday(dt):\n \"\"\"\n If holiday falls on Saturday or Sunday, use previous Friday instead.\n \"\"\"\n if dt.weekday() == 5:\n return dt - timedelta(1)\n elif dt.weekday() == 6:\n return dt - timedelta(2)\n return dt"} {"instruction": "Write a Python 3 script 
to\nconvert a date in weekend to Monday", "response": "def weekend_to_monday(dt):\n \"\"\"\n If holiday falls on Sunday or Saturday,\n use day thereafter (Monday) instead.\n Needed for holidays such as Christmas observation in Europe\n \"\"\"\n if dt.weekday() == 6:\n return dt + timedelta(1)\n elif dt.weekday() == 5:\n return dt + timedelta(2)\n return dt"} {"instruction": "Can you implement a function in Python 3 that\nreturns the datetime object nearest to the first holiday.", "response": "def nearest_workday(dt):\n \"\"\"\n If holiday falls on Saturday, use day before (Friday) instead;\n if holiday falls on Sunday, use day thereafter (Monday) instead.\n \"\"\"\n if dt.weekday() == 5:\n return dt - timedelta(1)\n elif dt.weekday() == 6:\n return dt + timedelta(1)\n return dt"} {"instruction": "Write a Python 3 script to\nreturn next weekday used for observances", "response": "def next_workday(dt):\n \"\"\"\n returns next weekday used for observances\n \"\"\"\n dt += timedelta(days=1)\n while dt.weekday() > 4:\n # Mon-Fri are 0-4\n dt += timedelta(days=1)\n return dt"} {"instruction": "How would you explain what the following Python 3 function does\ndef previous_workday(dt):\n dt -= timedelta(days=1)\n while dt.weekday() > 4:\n # Mon-Fri are 0-4\n dt -= timedelta(days=1)\n return dt", "response": "Returns the previous weekday used for observances\n "} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef dates(self, start_date, end_date, return_name=False):\n start_date = Timestamp(start_date)\n end_date = Timestamp(end_date)\n\n filter_start_date = start_date\n filter_end_date = end_date\n\n if self.year is not None:\n dt = Timestamp(datetime(self.year, self.month, self.day))\n if return_name:\n return Series(self.name, index=[dt])\n else:\n return [dt]\n\n dates = self._reference_dates(start_date, end_date)\n holiday_dates = self._apply_rule(dates)\n if self.days_of_week is not None:\n holiday_dates = 
holiday_dates[np.in1d(holiday_dates.dayofweek,\n self.days_of_week)]\n\n if self.start_date is not None:\n filter_start_date = max(self.start_date.tz_localize(\n filter_start_date.tz), filter_start_date)\n if self.end_date is not None:\n filter_end_date = min(self.end_date.tz_localize(\n filter_end_date.tz), filter_end_date)\n holiday_dates = holiday_dates[(holiday_dates >= filter_start_date) &\n (holiday_dates <= filter_end_date)]\n if return_name:\n return Series(self.name, index=holiday_dates)\n return holiday_dates", "response": "Calculate holidays observed between start date and end date."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _reference_dates(self, start_date, end_date):\n if self.start_date is not None:\n start_date = self.start_date.tz_localize(start_date.tz)\n\n if self.end_date is not None:\n end_date = self.end_date.tz_localize(start_date.tz)\n\n year_offset = DateOffset(years=1)\n reference_start_date = Timestamp(\n datetime(start_date.year - 1, self.month, self.day))\n\n reference_end_date = Timestamp(\n datetime(end_date.year + 1, self.month, self.day))\n # Don't process unnecessary holidays\n dates = date_range(start=reference_start_date,\n end=reference_end_date,\n freq=year_offset, tz=start_date.tz)\n\n return dates", "response": "Get the reference dates for the holiday."} {"instruction": "Write a Python 3 function for\napplying the given offset to a DatetimeIndex of dates.", "response": "def _apply_rule(self, dates):\n \"\"\"\n Apply the given offset/observance to a DatetimeIndex of dates.\n\n Parameters\n ----------\n dates : DatetimeIndex\n Dates to apply the given offset/observance rule\n\n Returns\n -------\n Dates with rules applied\n \"\"\"\n if self.observance is not None:\n return dates.map(lambda d: self.observance(d))\n\n if self.offset is not None:\n if not isinstance(self.offset, list):\n offsets = [self.offset]\n else:\n offsets = self.offset\n for offset in offsets:\n\n # if we 
are adding a non-vectorized value\n # ignore the PerformanceWarnings:\n with warnings.catch_warnings():\n warnings.simplefilter(\"ignore\", PerformanceWarning)\n dates += offset\n return dates"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef holidays(self, start=None, end=None, return_name=False):\n if self.rules is None:\n raise Exception('Holiday Calendar {name} does not have any '\n 'rules specified'.format(name=self.name))\n\n if start is None:\n start = AbstractHolidayCalendar.start_date\n\n if end is None:\n end = AbstractHolidayCalendar.end_date\n\n start = Timestamp(start)\n end = Timestamp(end)\n\n holidays = None\n # If we don't have a cache or the dates are outside the prior cache, we\n # get them again\n if (self._cache is None or start < self._cache[0] or\n end > self._cache[1]):\n for rule in self.rules:\n rule_holidays = rule.dates(start, end, return_name=True)\n\n if holidays is None:\n holidays = rule_holidays\n else:\n holidays = holidays.append(rule_holidays)\n\n self._cache = (start, end, holidays.sort_index())\n\n holidays = self._cache[2]\n holidays = holidays[start:end]\n\n if return_name:\n return holidays\n else:\n return holidays.index", "response": "Returns a list of dates and holidays between start and end dates."} {"instruction": "Explain what the following Python 3 code does\ndef merge_class(base, other):\n try:\n other = other.rules\n except AttributeError:\n pass\n\n if not isinstance(other, list):\n other = [other]\n other_holidays = {holiday.name: holiday for holiday in other}\n\n try:\n base = base.rules\n except AttributeError:\n pass\n\n if not isinstance(base, list):\n base = [base]\n base_holidays = {holiday.name: holiday for holiday in base}\n\n other_holidays.update(base_holidays)\n return list(other_holidays.values())", "response": "Merge two calendars together."} {"instruction": "Create a Python 3 function for\nmerging two calendars together.", "response": "def merge(self, other, 
inplace=False):\n \"\"\"\n Merge holiday calendars together. The caller's class\n rules take precedence. The merge will be done\n based on each holiday's name.\n\n Parameters\n ----------\n other : holiday calendar\n inplace : bool (default=False)\n If True set rule_table to holidays, else return array of Holidays\n \"\"\"\n holidays = self.merge_class(self, other)\n if inplace:\n self.rules = holidays\n else:\n return holidays"} {"instruction": "Create a Python 3 function to\nregister an option in the package - wide pandas config object.", "response": "def register_option(key, defval, doc='', validator=None, cb=None):\n \"\"\"Register an option in the package-wide pandas config object\n\n Parameters\n ----------\n key - a fully-qualified key, e.g. \"x.y.option - z\".\n defval - the default value of the option\n doc - a string description of the option\n validator - a function of a single argument, should raise `ValueError` if\n called with a value which is not a legal value for the option.\n cb - a function of a single argument \"key\", which is called\n immediately after an option value is set/reset. 
key is\n the full name of the option.\n\n Returns\n -------\n Nothing.\n\n Raises\n ------\n ValueError if `validator` is specified and `defval` is not a valid value.\n\n \"\"\"\n import tokenize\n import keyword\n key = key.lower()\n\n if key in _registered_options:\n msg = \"Option '{key}' has already been registered\"\n raise OptionError(msg.format(key=key))\n if key in _reserved_keys:\n msg = \"Option '{key}' is a reserved key\"\n raise OptionError(msg.format(key=key))\n\n # the default value should be legal\n if validator:\n validator(defval)\n\n # walk the nested dict, creating dicts as needed along the path\n path = key.split('.')\n\n for k in path:\n if not bool(re.match('^' + tokenize.Name + '$', k)):\n raise ValueError(\"{k} is not a valid identifier\".format(k=k))\n if keyword.iskeyword(k):\n raise ValueError(\"{k} is a python keyword\".format(k=k))\n\n cursor = _global_config\n msg = \"Path prefix to option '{option}' is already an option\"\n for i, p in enumerate(path[:-1]):\n if not isinstance(cursor, dict):\n raise OptionError(msg.format(option='.'.join(path[:i])))\n if p not in cursor:\n cursor[p] = {}\n cursor = cursor[p]\n\n if not isinstance(cursor, dict):\n raise OptionError(msg.format(option='.'.join(path[:-1])))\n\n cursor[path[-1]] = defval # initialize\n\n # save the option metadata\n _registered_options[key] = RegisteredOption(key=key, defval=defval,\n doc=doc, validator=validator,\n cb=cb)"} {"instruction": "Implement a Python 3 function for\nmarking an option as deprecated.", "response": "def deprecate_option(key, msg=None, rkey=None, removal_ver=None):\n \"\"\"\n Mark option `key` as deprecated, if code attempts to access this option,\n a warning will be produced, using `msg` if given, or a default message\n if not.\n if `rkey` is given, any access to the key will be re-routed to `rkey`.\n\n Neither the existence of `key` nor that if `rkey` is checked. 
If they\n do not exist, any subsequence access will fail as usual, after the\n deprecation warning is given.\n\n Parameters\n ----------\n key - the name of the option to be deprecated. must be a fully-qualified\n option name (e.g \"x.y.z.rkey\").\n\n msg - (Optional) a warning message to output when the key is referenced.\n if no message is given a default message will be emitted.\n\n rkey - (Optional) the name of an option to reroute access to.\n If specified, any referenced `key` will be re-routed to `rkey`\n including set/get/reset.\n rkey must be a fully-qualified option name (e.g \"x.y.z.rkey\").\n used by the default message if no `msg` is specified.\n\n removal_ver - (Optional) specifies the version in which this option will\n be removed. used by the default message if no `msg`\n is specified.\n\n Returns\n -------\n Nothing\n\n Raises\n ------\n OptionError - if key has already been deprecated.\n\n \"\"\"\n\n key = key.lower()\n\n if key in _deprecated_options:\n msg = \"Option '{key}' has already been defined as deprecated.\"\n raise OptionError(msg.format(key=key))\n\n _deprecated_options[key] = DeprecatedOption(key, msg, rkey, removal_ver)"} {"instruction": "Can you write a function in Python 3 where it\nreturns a list of keys matching pat", "response": "def _select_options(pat):\n \"\"\"returns a list of keys matching `pat`\n\n if pat==\"all\", returns all registered options\n \"\"\"\n\n # short-circuit for exact key\n if pat in _registered_options:\n return [pat]\n\n # else look through all of them\n keys = sorted(_registered_options.keys())\n if pat == 'all': # reserved key\n return keys\n\n return [k for k in keys if re.search(pat, k, re.I)]"} {"instruction": "Make a summary of the following Python 3 code\ndef _translate_key(key):\n\n d = _get_deprecated_option(key)\n if d:\n return d.rkey or key\n else:\n return key", "response": "Translate a key id deprecated and a replacement key defined in the\n option."} {"instruction": "Can you write a 
function in Python 3 where it\nbuilds a formatted description of a registered option and prints it", "response": "def _build_option_description(k):\n \"\"\" Builds a formatted description of a registered option and prints it \"\"\"\n\n o = _get_registered_option(k)\n d = _get_deprecated_option(k)\n\n s = '{k} '.format(k=k)\n\n if o.doc:\n s += '\\n'.join(o.doc.strip().split('\\n'))\n else:\n s += 'No description available.'\n\n if o:\n s += ('\\n [default: {default}] [currently: {current}]'\n .format(default=o.defval, current=_get_option(k, True)))\n\n if d:\n s += '\\n (Deprecated'\n s += (', use `{rkey}` instead.'\n .format(rkey=d.rkey if d.rkey else ''))\n s += ')'\n\n return s"} {"instruction": "Make a summary of the following Python 3 code\ndef config_prefix(prefix):\n\n # Note: reset_option relies on set_option, and on key directly\n # it does not fit in to this monkey-patching scheme\n\n global register_option, get_option, set_option, reset_option\n\n def wrap(func):\n def inner(key, *args, **kwds):\n pkey = '{prefix}.{key}'.format(prefix=prefix, key=key)\n return func(pkey, *args, **kwds)\n\n return inner\n\n _register_option = register_option\n _get_option = get_option\n _set_option = set_option\n set_option = wrap(set_option)\n get_option = wrap(get_option)\n register_option = wrap(register_option)\n yield None\n set_option = _set_option\n get_option = _get_option\n register_option = _register_option", "response": "Contextmanager for multiple API calls with a common prefix"} {"instruction": "Write a Python 3 script for\nparsing a string of CSS attribute names into a list of property value pairs.", "response": "def parse(self, declarations_str):\n \"\"\"Generates (prop, value) pairs from declarations\n\n In a future version may generate parsed tokens from tinycss/tinycss2\n \"\"\"\n for decl in declarations_str.split(';'):\n if not decl.strip():\n continue\n prop, sep, val = decl.partition(':')\n prop = prop.strip().lower()\n # TODO: don't lowercase case 
sensitive parts of values (strings)\n val = val.strip().lower()\n if sep:\n yield prop, val\n else:\n warnings.warn('Ill-formatted attribute: expected a colon '\n 'in {decl!r}'.format(decl=decl), CSSWarning)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef array(data: Sequence[object],\n dtype: Optional[Union[str, np.dtype, ExtensionDtype]] = None,\n copy: bool = True,\n ) -> ABCExtensionArray:\n \"\"\"\n Create an array.\n\n .. versionadded:: 0.24.0\n\n Parameters\n ----------\n data : Sequence of objects\n The scalars inside `data` should be instances of the\n scalar type for `dtype`. It's expected that `data`\n represents a 1-dimensional array of data.\n\n When `data` is an Index or Series, the underlying array\n will be extracted from `data`.\n\n dtype : str, np.dtype, or ExtensionDtype, optional\n The dtype to use for the array. This may be a NumPy\n dtype or an extension type registered with pandas using\n :meth:`pandas.api.extensions.register_extension_dtype`.\n\n If not specified, there are two possibilities:\n\n 1. When `data` is a :class:`Series`, :class:`Index`, or\n :class:`ExtensionArray`, the `dtype` will be taken\n from the data.\n 2. Otherwise, pandas will attempt to infer the `dtype`\n from the data.\n\n Note that when `data` is a NumPy array, ``data.dtype`` is\n *not* used for inferring the array type. 
This is because\n NumPy cannot represent all the types of data that can be\n held in extension arrays.\n\n Currently, pandas will infer an extension dtype for sequences of\n\n ============================== =====================================\n Scalar Type Array Type\n ============================== =====================================\n :class:`pandas.Interval` :class:`pandas.arrays.IntervalArray`\n :class:`pandas.Period` :class:`pandas.arrays.PeriodArray`\n :class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray`\n :class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray`\n ============================== =====================================\n\n For all other cases, NumPy's usual inference rules will be used.\n\n copy : bool, default True\n Whether to copy the data, even if not necessary. Depending\n on the type of `data`, creating the new array may require\n copying data, even if ``copy=False``.\n\n Returns\n -------\n ExtensionArray\n The newly created array.\n\n Raises\n ------\n ValueError\n When `data` is not 1-dimensional.\n\n See Also\n --------\n numpy.array : Construct a NumPy array.\n Series : Construct a pandas Series.\n Index : Construct a pandas Index.\n arrays.PandasArray : ExtensionArray wrapping a NumPy array.\n Series.array : Extract the array stored within a Series.\n\n Notes\n -----\n Omitting the `dtype` argument means pandas will attempt to infer the\n best array type from the values in the data. As new array types are\n added by pandas and 3rd party libraries, the \"best\" array type may\n change. We recommend specifying `dtype` to ensure that\n\n 1. the correct array type for the data is returned\n 2. the returned array type doesn't change as new extension types\n are added by pandas and third-party libraries\n\n Additionally, if the underlying memory representation of the returned\n array matters, we recommend specifying the `dtype` as a concrete object\n rather than a string alias or allowing it to be inferred. 
For example,\n a future version of pandas or a 3rd-party library may include a\n dedicated ExtensionArray for string data. In this event, the following\n would no longer return a :class:`arrays.PandasArray` backed by a NumPy\n array.\n\n >>> pd.array(['a', 'b'], dtype=str)\n \n ['a', 'b']\n Length: 2, dtype: str32\n\n This would instead return the new ExtensionArray dedicated for string\n data. If you really need the new array to be backed by a NumPy array,\n specify that in the dtype.\n\n >>> pd.array(['a', 'b'], dtype=np.dtype(\"\n ['a', 'b']\n Length: 2, dtype: str32\n\n Or use the dedicated constructor for the array you're expecting, and\n wrap that in a PandasArray\n\n >>> pd.array(np.array(['a', 'b'], dtype='\n ['a', 'b']\n Length: 2, dtype: str32\n\n Finally, Pandas has arrays that mostly overlap with NumPy\n\n * :class:`arrays.DatetimeArray`\n * :class:`arrays.TimedeltaArray`\n\n When data with a ``datetime64[ns]`` or ``timedelta64[ns]`` dtype is\n passed, pandas will always return a ``DatetimeArray`` or ``TimedeltaArray``\n rather than a ``PandasArray``. 
This is for symmetry with the case of\n timezone-aware data, which NumPy does not natively support.\n\n >>> pd.array(['2015', '2016'], dtype='datetime64[ns]')\n \n ['2015-01-01 00:00:00', '2016-01-01 00:00:00']\n Length: 2, dtype: datetime64[ns]\n\n >>> pd.array([\"1H\", \"2H\"], dtype='timedelta64[ns]')\n \n ['01:00:00', '02:00:00']\n Length: 2, dtype: timedelta64[ns]\n\n Examples\n --------\n If a dtype is not specified, `data` is passed through to\n :meth:`numpy.array`, and a :class:`arrays.PandasArray` is returned.\n\n >>> pd.array([1, 2])\n \n [1, 2]\n Length: 2, dtype: int64\n\n Or the NumPy dtype can be specified\n\n >>> pd.array([1, 2], dtype=np.dtype(\"int32\"))\n \n [1, 2]\n Length: 2, dtype: int32\n\n You can use the string alias for `dtype`\n\n >>> pd.array(['a', 'b', 'a'], dtype='category')\n [a, b, a]\n Categories (2, object): [a, b]\n\n Or specify the actual dtype\n\n >>> pd.array(['a', 'b', 'a'],\n ... dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True))\n [a, b, a]\n Categories (3, object): [a < b < c]\n\n Because omitting the `dtype` passes the data through to NumPy,\n a mixture of valid integers and NA will return a floating-point\n NumPy array.\n\n >>> pd.array([1, 2, np.nan])\n \n [1.0, 2.0, nan]\n Length: 3, dtype: float64\n\n To use pandas' nullable :class:`pandas.arrays.IntegerArray`, specify\n the dtype:\n\n >>> pd.array([1, 2, np.nan], dtype='Int64')\n \n [1, 2, NaN]\n Length: 3, dtype: Int64\n\n Pandas will infer an ExtensionArray for some types of data:\n\n >>> pd.array([pd.Period('2000', freq=\"D\"), pd.Period(\"2000\", freq=\"D\")])\n \n ['2000-01-01', '2000-01-01']\n Length: 2, dtype: period[D]\n\n `data` must be 1-dimensional. 
A ValueError is raised when the input\n has the wrong dimensionality.\n\n >>> pd.array(1)\n Traceback (most recent call last):\n ...\n ValueError: Cannot pass scalar '1' to 'pandas.array'.\n \"\"\"\n from pandas.core.arrays import (\n period_array, ExtensionArray, IntervalArray, PandasArray,\n DatetimeArray,\n TimedeltaArray,\n )\n from pandas.core.internals.arrays import extract_array\n\n if lib.is_scalar(data):\n msg = (\n \"Cannot pass scalar '{}' to 'pandas.array'.\"\n )\n raise ValueError(msg.format(data))\n\n data = extract_array(data, extract_numpy=True)\n\n if dtype is None and isinstance(data, ExtensionArray):\n dtype = data.dtype\n\n # this returns None for not-found dtypes.\n if isinstance(dtype, str):\n dtype = registry.find(dtype) or dtype\n\n if is_extension_array_dtype(dtype):\n cls = dtype.construct_array_type()\n return cls._from_sequence(data, dtype=dtype, copy=copy)\n\n if dtype is None:\n inferred_dtype = lib.infer_dtype(data, skipna=False)\n if inferred_dtype == 'period':\n try:\n return period_array(data, copy=copy)\n except tslibs.IncompatibleFrequency:\n # We may have a mixture of frequencies.\n # We choose to return an ndarray, rather than raising.\n pass\n elif inferred_dtype == 'interval':\n try:\n return IntervalArray(data, copy=copy)\n except ValueError:\n # We may have a mixture of `closed` here.\n # We choose to return an ndarray, rather than raising.\n pass\n\n elif inferred_dtype.startswith('datetime'):\n # datetime, datetime64\n try:\n return DatetimeArray._from_sequence(data, copy=copy)\n except ValueError:\n # Mixture of timezones, fall back to PandasArray\n pass\n\n elif inferred_dtype.startswith('timedelta'):\n # timedelta, timedelta64\n return TimedeltaArray._from_sequence(data, copy=copy)\n\n # TODO(BooleanArray): handle this type\n\n # Pandas overrides NumPy for\n # 1. datetime64[ns]\n # 2. 
timedelta64[ns]\n # so that a DatetimeArray is returned.\n if is_datetime64_ns_dtype(dtype):\n return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy)\n elif is_timedelta64_ns_dtype(dtype):\n return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy)\n\n result = PandasArray._from_sequence(data, dtype=dtype, copy=copy)\n return result", "response": "Create an array of objects."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef maybe_convert_platform_interval(values):\n if isinstance(values, (list, tuple)) and len(values) == 0:\n # GH 19016\n # empty lists/tuples get object dtype by default, but this is not\n # prohibited for IntervalArray, so coerce to integer instead\n return np.array([], dtype=np.int64)\n elif is_categorical_dtype(values):\n values = np.asarray(values)\n\n return maybe_convert_platform(values)", "response": "Try to do platform conversion with special casing for IntervalArray."} {"instruction": "Write a Python 3 function that can\ncheck if the object is a file - like object.", "response": "def is_file_like(obj):\n \"\"\"\n Check if the object is a file-like object.\n\n For objects to be considered file-like, they must\n be an iterator AND have either a `read` and/or `write`\n method as an attribute.\n\n Note: file-like objects must be iterable, but\n iterable objects need not be file-like.\n\n .. 
versionadded:: 0.20.0\n\n Parameters\n ----------\n obj : The object to check\n\n Returns\n -------\n is_file_like : bool\n Whether `obj` has file-like properties.\n\n Examples\n --------\n >>> buffer(StringIO(\"data\"))\n >>> is_file_like(buffer)\n True\n >>> is_file_like([1, 2, 3])\n False\n \"\"\"\n\n if not (hasattr(obj, 'read') or hasattr(obj, 'write')):\n return False\n\n if not hasattr(obj, \"__iter__\"):\n return False\n\n return True"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef is_list_like(obj, allow_sets=True):\n\n return (isinstance(obj, abc.Iterable) and\n # we do not count strings/unicode/bytes as list-like\n not isinstance(obj, (str, bytes)) and\n\n # exclude zero-dimensional numpy arrays, effectively scalars\n not (isinstance(obj, np.ndarray) and obj.ndim == 0) and\n\n # exclude sets if allow_sets is False\n not (allow_sets is False and isinstance(obj, abc.Set)))", "response": "Checks if the object is list - like."} {"instruction": "Can you create a Python 3 function that\nchecks if the object is list - like and that all of its elements are also list - like.", "response": "def is_nested_list_like(obj):\n \"\"\"\n Check if the object is list-like, and that all of its elements\n are also list-like.\n\n .. versionadded:: 0.20.0\n\n Parameters\n ----------\n obj : The object to check\n\n Returns\n -------\n is_list_like : bool\n Whether `obj` has list-like properties.\n\n Examples\n --------\n >>> is_nested_list_like([[1, 2, 3]])\n True\n >>> is_nested_list_like([{1, 2, 3}, {1, 2, 3}])\n True\n >>> is_nested_list_like([\"foo\"])\n False\n >>> is_nested_list_like([])\n False\n >>> is_nested_list_like([[1, 2, 3], 1])\n False\n\n Notes\n -----\n This won't reliably detect whether a consumable iterator (e. 
g.\n a generator) is a nested-list-like without consuming the iterator.\n To avoid consuming it, we always return False if the outer container\n doesn't define `__len__`.\n\n See Also\n --------\n is_list_like\n \"\"\"\n return (is_list_like(obj) and hasattr(obj, '__len__') and\n len(obj) > 0 and all(is_list_like(item) for item in obj))"} {"instruction": "Write a Python 3 script to\ncheck if the object is dict - like.", "response": "def is_dict_like(obj):\n \"\"\"\n Check if the object is dict-like.\n\n Parameters\n ----------\n obj : The object to check\n\n Returns\n -------\n is_dict_like : bool\n Whether `obj` has dict-like properties.\n\n Examples\n --------\n >>> is_dict_like({1: 2})\n True\n >>> is_dict_like([1, 2, 3])\n False\n >>> is_dict_like(dict)\n False\n >>> is_dict_like(dict())\n True\n \"\"\"\n dict_like_attrs = (\"__getitem__\", \"keys\", \"__contains__\")\n return (all(hasattr(obj, attr) for attr in dict_like_attrs)\n # [GH 25196] exclude classes\n and not isinstance(obj, type))"} {"instruction": "Can you write a function in Python 3 where it\nchecks if the object is a sequence of objects.", "response": "def is_sequence(obj):\n \"\"\"\n Check if the object is a sequence of objects.\n String types are not included as sequences here.\n\n Parameters\n ----------\n obj : The object to check\n\n Returns\n -------\n is_sequence : bool\n Whether `obj` is a sequence of objects.\n\n Examples\n --------\n >>> l = [1, 2, 3]\n >>>\n >>> is_sequence(l)\n True\n >>> is_sequence(iter(l))\n False\n \"\"\"\n\n try:\n iter(obj) # Can iterate over it.\n len(obj) # Has a length associated with it.\n return not isinstance(obj, (str, bytes))\n except (TypeError, AttributeError):\n return False"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _new_DatetimeIndex(cls, d):\n\n if \"data\" in d and not isinstance(d[\"data\"], DatetimeIndex):\n # Avoid need to verify integrity by calling simple_new directly\n data = 
d.pop(\"data\")\n result = cls._simple_new(data, **d)\n else:\n with warnings.catch_warnings():\n # we ignore warnings from passing verify_integrity=False\n # TODO: If we knew what was going in to **d, we might be able to\n # go through _simple_new instead\n warnings.simplefilter(\"ignore\")\n result = cls.__new__(cls, verify_integrity=False, **d)\n\n return result", "response": "This is called upon unpickling and will not create a new object."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef date_range(start=None, end=None, periods=None, freq=None, tz=None,\n normalize=False, name=None, closed=None, **kwargs):\n \"\"\"\n Return a fixed frequency DatetimeIndex.\n\n Parameters\n ----------\n start : str or datetime-like, optional\n Left bound for generating dates.\n end : str or datetime-like, optional\n Right bound for generating dates.\n periods : integer, optional\n Number of periods to generate.\n freq : str or DateOffset, default 'D'\n Frequency strings can have multiples, e.g. '5H'. See\n :ref:`here ` for a list of\n frequency aliases.\n tz : str or tzinfo, optional\n Time zone name for returning localized DatetimeIndex, for example\n 'Asia/Hong_Kong'. By default, the resulting DatetimeIndex is\n timezone-naive.\n normalize : bool, default False\n Normalize start/end dates to midnight before generating date range.\n name : str, default None\n Name of the resulting DatetimeIndex.\n closed : {None, 'left', 'right'}, optional\n Make the interval closed with respect to the given frequency to\n the 'left', 'right', or both sides (None, the default).\n **kwargs\n For compatibility. 
Has no effect on the result.\n\n Returns\n -------\n rng : DatetimeIndex\n\n See Also\n --------\n DatetimeIndex : An immutable container for datetimes.\n timedelta_range : Return a fixed frequency TimedeltaIndex.\n period_range : Return a fixed frequency PeriodIndex.\n interval_range : Return a fixed frequency IntervalIndex.\n\n Notes\n -----\n Of the four parameters ``start``, ``end``, ``periods``, and ``freq``,\n exactly three must be specified. If ``freq`` is omitted, the resulting\n ``DatetimeIndex`` will have ``periods`` linearly spaced elements between\n ``start`` and ``end`` (closed on both sides).\n\n To learn more about the frequency strings, please see `this link\n `__.\n\n Examples\n --------\n **Specifying the values**\n\n The next four examples generate the same `DatetimeIndex`, but vary\n the combination of `start`, `end` and `periods`.\n\n Specify `start` and `end`, with the default daily frequency.\n\n >>> pd.date_range(start='1/1/2018', end='1/08/2018')\n DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',\n '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],\n dtype='datetime64[ns]', freq='D')\n\n Specify `start` and `periods`, the number of periods (days).\n\n >>> pd.date_range(start='1/1/2018', periods=8)\n DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',\n '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],\n dtype='datetime64[ns]', freq='D')\n\n Specify `end` and `periods`, the number of periods (days).\n\n >>> pd.date_range(end='1/1/2018', periods=8)\n DatetimeIndex(['2017-12-25', '2017-12-26', '2017-12-27', '2017-12-28',\n '2017-12-29', '2017-12-30', '2017-12-31', '2018-01-01'],\n dtype='datetime64[ns]', freq='D')\n\n Specify `start`, `end`, and `periods`; the frequency is generated\n automatically (linearly spaced).\n\n >>> pd.date_range(start='2018-04-24', end='2018-04-27', periods=3)\n DatetimeIndex(['2018-04-24 00:00:00', '2018-04-25 12:00:00',\n '2018-04-27 00:00:00'],\n 
dtype='datetime64[ns]', freq=None)\n\n **Other Parameters**\n\n Changed the `freq` (frequency) to ``'M'`` (month end frequency).\n\n >>> pd.date_range(start='1/1/2018', periods=5, freq='M')\n DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30',\n '2018-05-31'],\n dtype='datetime64[ns]', freq='M')\n\n Multiples are allowed\n\n >>> pd.date_range(start='1/1/2018', periods=5, freq='3M')\n DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31',\n '2019-01-31'],\n dtype='datetime64[ns]', freq='3M')\n\n `freq` can also be specified as an Offset object.\n\n >>> pd.date_range(start='1/1/2018', periods=5, freq=pd.offsets.MonthEnd(3))\n DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31',\n '2019-01-31'],\n dtype='datetime64[ns]', freq='3M')\n\n Specify `tz` to set the timezone.\n\n >>> pd.date_range(start='1/1/2018', periods=5, tz='Asia/Tokyo')\n DatetimeIndex(['2018-01-01 00:00:00+09:00', '2018-01-02 00:00:00+09:00',\n '2018-01-03 00:00:00+09:00', '2018-01-04 00:00:00+09:00',\n '2018-01-05 00:00:00+09:00'],\n dtype='datetime64[ns, Asia/Tokyo]', freq='D')\n\n `closed` controls whether to include `start` and `end` that are on the\n boundary. 
The default includes boundary points on either end.\n\n >>> pd.date_range(start='2017-01-01', end='2017-01-04', closed=None)\n DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04'],\n dtype='datetime64[ns]', freq='D')\n\n Use ``closed='left'`` to exclude `end` if it falls on the boundary.\n\n >>> pd.date_range(start='2017-01-01', end='2017-01-04', closed='left')\n DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03'],\n dtype='datetime64[ns]', freq='D')\n\n Use ``closed='right'`` to exclude `start` if it falls on the boundary.\n\n >>> pd.date_range(start='2017-01-01', end='2017-01-04', closed='right')\n DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'],\n dtype='datetime64[ns]', freq='D')\n \"\"\"\n\n if freq is None and com._any_none(periods, start, end):\n freq = 'D'\n\n dtarr = DatetimeArray._generate_range(\n start=start, end=end, periods=periods,\n freq=freq, tz=tz, normalize=normalize,\n closed=closed, **kwargs)\n return DatetimeIndex._simple_new(\n dtarr, tz=dtarr.tz, freq=dtarr.freq, name=name)", "response": "Generate a fixed frequency DatetimeIndex from start and end dates."} {"instruction": "How would you code a function in Python 3 to\nreturn a fixed frequency DatetimeIndex with business day as the default.", "response": "def bdate_range(start=None, end=None, periods=None, freq='B', tz=None,\n normalize=True, name=None, weekmask=None, holidays=None,\n closed=None, **kwargs):\n \"\"\"\n Return a fixed frequency DatetimeIndex, with business day as the default\n frequency\n\n Parameters\n ----------\n start : string or datetime-like, default None\n Left bound for generating dates.\n end : string or datetime-like, default None\n Right bound for generating dates.\n periods : integer, default None\n Number of periods to generate.\n freq : string or DateOffset, default 'B' (business daily)\n Frequency strings can have multiples, e.g. 
'5H'.\n tz : string or None\n Time zone name for returning localized DatetimeIndex, for example\n Asia/Beijing.\n normalize : bool, default False\n Normalize start/end dates to midnight before generating date range.\n name : string, default None\n Name of the resulting DatetimeIndex.\n weekmask : string or None, default None\n Weekmask of valid business days, passed to ``numpy.busdaycalendar``,\n only used when custom frequency strings are passed. The default\n value None is equivalent to 'Mon Tue Wed Thu Fri'.\n\n .. versionadded:: 0.21.0\n\n holidays : list-like or None, default None\n Dates to exclude from the set of valid business days, passed to\n ``numpy.busdaycalendar``, only used when custom frequency strings\n are passed.\n\n .. versionadded:: 0.21.0\n\n closed : string, default None\n Make the interval closed with respect to the given frequency to\n the 'left', 'right', or both sides (None).\n **kwargs\n For compatibility. Has no effect on the result.\n\n Returns\n -------\n DatetimeIndex\n\n Notes\n -----\n Of the four parameters: ``start``, ``end``, ``periods``, and ``freq``,\n exactly three must be specified. Specifying ``freq`` is a requirement\n for ``bdate_range``. 
Use ``date_range`` if specifying ``freq`` is not\n desired.\n\n To learn more about the frequency strings, please see `this link\n `__.\n\n Examples\n --------\n Note how the two weekend days are skipped in the result.\n\n >>> pd.bdate_range(start='1/1/2018', end='1/08/2018')\n DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',\n '2018-01-05', '2018-01-08'],\n dtype='datetime64[ns]', freq='B')\n \"\"\"\n if freq is None:\n msg = 'freq must be specified for bdate_range; use date_range instead'\n raise TypeError(msg)\n\n if is_string_like(freq) and freq.startswith('C'):\n try:\n weekmask = weekmask or 'Mon Tue Wed Thu Fri'\n freq = prefix_mapping[freq](holidays=holidays, weekmask=weekmask)\n except (KeyError, TypeError):\n msg = 'invalid custom frequency string: {freq}'.format(freq=freq)\n raise ValueError(msg)\n elif holidays or weekmask:\n msg = ('a custom frequency string is required when holidays or '\n 'weekmask are passed, got frequency {freq}').format(freq=freq)\n raise ValueError(msg)\n\n return date_range(start=start, end=end, periods=periods,\n freq=freq, tz=tz, normalize=normalize, name=name,\n closed=closed, **kwargs)"} {"instruction": "Explain what the following Python 3 code does\ndef cdate_range(start=None, end=None, periods=None, freq='C', tz=None,\n normalize=True, name=None, closed=None, **kwargs):\n \"\"\"\n Return a fixed frequency DatetimeIndex, with CustomBusinessDay as the\n default frequency\n\n .. deprecated:: 0.21.0\n\n Parameters\n ----------\n start : string or datetime-like, default None\n Left bound for generating dates\n end : string or datetime-like, default None\n Right bound for generating dates\n periods : integer, default None\n Number of periods to generate\n freq : string or DateOffset, default 'C' (CustomBusinessDay)\n Frequency strings can have multiples, e.g. 
'5H'\n tz : string, default None\n Time zone name for returning localized DatetimeIndex, for example\n Asia/Beijing\n normalize : bool, default False\n Normalize start/end dates to midnight before generating date range\n name : string, default None\n Name of the resulting DatetimeIndex\n weekmask : string, Default 'Mon Tue Wed Thu Fri'\n weekmask of valid business days, passed to ``numpy.busdaycalendar``\n holidays : list\n list/array of dates to exclude from the set of valid business days,\n passed to ``numpy.busdaycalendar``\n closed : string, default None\n Make the interval closed with respect to the given frequency to\n the 'left', 'right', or both sides (None)\n\n Notes\n -----\n Of the three parameters: ``start``, ``end``, and ``periods``, exactly two\n must be specified.\n\n To learn more about the frequency strings, please see `this link\n `__.\n\n Returns\n -------\n rng : DatetimeIndex\n \"\"\"\n warnings.warn(\"cdate_range is deprecated and will be removed in a future \"\n \"version, instead use pd.bdate_range(..., freq='{freq}')\"\n .format(freq=freq), FutureWarning, stacklevel=2)\n\n if freq == 'C':\n holidays = kwargs.pop('holidays', [])\n weekmask = kwargs.pop('weekmask', 'Mon Tue Wed Thu Fri')\n freq = CDay(holidays=holidays, weekmask=weekmask)\n\n return date_range(start=start, end=end, periods=periods, freq=freq,\n tz=tz, normalize=normalize, name=name,\n closed=closed, **kwargs)", "response": "Return a fixed frequency DatetimeIndex with CustomBusinessDay as the base for the given start and end dates."} {"instruction": "How would you implement a function in Python 3 that\nsplits data into blocks obj index and return conformed data.", "response": "def _create_blocks(self):\n \"\"\"\n Split data into blocks & return conformed data.\n \"\"\"\n\n obj, index = self._convert_freq()\n if index is not None:\n index = self._on\n\n # filter out the on from the object\n if self.on is not None:\n if obj.ndim == 2:\n obj = 
obj.reindex(columns=obj.columns.difference([self.on]),\n copy=False)\n blocks = obj._to_dict_of_blocks(copy=False).values()\n\n return blocks, obj, index"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _gotitem(self, key, ndim, subset=None):\n\n # create a new object to prevent aliasing\n if subset is None:\n subset = self.obj\n self = self._shallow_copy(subset)\n self._reset_cache()\n if subset.ndim == 2:\n if is_scalar(key) and key in subset or is_list_like(key):\n self._selection = key\n return self", "response": "Internal method to handle the key selection."} {"instruction": "Implement a function in Python 3 to\nreturn index as ndarrays.", "response": "def _get_index(self, index=None):\n \"\"\"\n Return index as ndarrays.\n\n Returns\n -------\n tuple of (index, index_as_ndarray)\n \"\"\"\n\n if self.is_freq_type:\n if index is None:\n index = self._on\n return index, index.asi8\n return index, index"} {"instruction": "Write a Python 3 script to\nwrap a single result.", "response": "def _wrap_result(self, result, block=None, obj=None):\n \"\"\"\n Wrap a single result.\n \"\"\"\n\n if obj is None:\n obj = self._selected_obj\n index = obj.index\n\n if isinstance(result, np.ndarray):\n\n # coerce if necessary\n if block is not None:\n if is_timedelta64_dtype(block.values.dtype):\n from pandas import to_timedelta\n result = to_timedelta(\n result.ravel(), unit='ns').values.reshape(result.shape)\n\n if result.ndim == 1:\n from pandas import Series\n return Series(result, index, name=obj.name)\n\n return type(obj)(result, index=index, columns=block.columns)\n return result"} {"instruction": "Explain what the following Python 3 code does\ndef _wrap_results(self, results, blocks, obj):\n\n from pandas import Series, concat\n from pandas.core.index import ensure_index\n\n final = []\n for result, block in zip(results, blocks):\n\n result = self._wrap_result(result, block=block, obj=obj)\n if result.ndim == 1:\n return result\n 
final.append(result)\n\n # if we have an 'on' column\n # we want to put it back into the results\n # in the same location\n columns = self._selected_obj.columns\n if self.on is not None and not self._on.equals(obj.index):\n\n name = self._on.name\n final.append(Series(self._on, index=obj.index, name=name))\n\n if self._selection is not None:\n\n selection = ensure_index(self._selection)\n\n # need to reorder to include original location of\n # the on column (if its not already there)\n if name not in selection:\n columns = self.obj.columns\n indexer = columns.get_indexer(selection.tolist() + [name])\n columns = columns.take(sorted(indexer))\n\n if not len(final):\n return obj.astype('float64')\n return concat(final, axis=1).reindex(columns=columns, copy=False)", "response": "Wrap the results into a list of ndarrays."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _center_window(self, result, window):\n if self.axis > result.ndim - 1:\n raise ValueError(\"Requested axis is larger then no. 
of argument \"\n \"dimensions\")\n\n offset = _offset(window, True)\n if offset > 0:\n if isinstance(result, (ABCSeries, ABCDataFrame)):\n result = result.slice_shift(-offset, axis=self.axis)\n else:\n lead_indexer = [slice(None)] * result.ndim\n lead_indexer[self.axis] = slice(offset, None)\n result = np.copy(result[tuple(lead_indexer)])\n return result", "response": "Center the result in the window."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _prep_window(self, **kwargs):\n\n window = self._get_window()\n if isinstance(window, (list, tuple, np.ndarray)):\n return com.asarray_tuplesafe(window).astype(float)\n elif is_integer(window):\n import scipy.signal as sig\n\n # the below may pop from kwargs\n def _validate_win_type(win_type, kwargs):\n arg_map = {'kaiser': ['beta'],\n 'gaussian': ['std'],\n 'general_gaussian': ['power', 'width'],\n 'slepian': ['width']}\n if win_type in arg_map:\n return tuple([win_type] + _pop_args(win_type,\n arg_map[win_type],\n kwargs))\n return win_type\n\n def _pop_args(win_type, arg_names, kwargs):\n msg = '%s window requires %%s' % win_type\n all_args = []\n for n in arg_names:\n if n not in kwargs:\n raise ValueError(msg % n)\n all_args.append(kwargs.pop(n))\n return all_args\n\n win_type = _validate_win_type(self.win_type, kwargs)\n # GH #15662. 
`False` makes symmetric window, rather than periodic.\n return sig.get_window(win_type, window, False).astype(float)", "response": "Prepare the window for the current time period."} {"instruction": "How would you code a function in Python 3 to\napply a moving window on the data.", "response": "def _apply_window(self, mean=True, **kwargs):\n \"\"\"\n Applies a moving window of type ``window_type`` on the data.\n\n Parameters\n ----------\n mean : bool, default True\n If True computes weighted mean, else weighted sum\n\n Returns\n -------\n y : same type as input argument\n\n \"\"\"\n window = self._prep_window(**kwargs)\n center = self.center\n\n blocks, obj, index = self._create_blocks()\n results = []\n for b in blocks:\n try:\n values = self._prep_values(b.values)\n except TypeError:\n results.append(b.values.copy())\n continue\n\n if values.size == 0:\n results.append(values.copy())\n continue\n\n offset = _offset(window, center)\n additional_nans = np.array([np.NaN] * offset)\n\n def f(arg, *args, **kwargs):\n minp = _use_window(self.min_periods, len(window))\n return libwindow.roll_window(np.concatenate((arg,\n additional_nans))\n if center else arg, window, minp,\n avg=mean)\n\n result = np.apply_along_axis(f, self.axis, values)\n\n if center:\n result = self._center_window(result, window)\n results.append(result)\n\n return self._wrap_results(results, blocks, obj)"} {"instruction": "Implement a Python 3 function for\ndispatching to apply; we are stripping all of the _apply kwargs and performing the original function call on the grouped object.", "response": "def _apply(self, func, name, window=None, center=None,\n check_minp=None, **kwargs):\n \"\"\"\n Dispatch to apply; we are stripping all of the _apply kwargs and\n performing the original function call on the grouped object.\n \"\"\"\n\n def f(x, name=name, *args):\n x = self._shallow_copy(x)\n\n if isinstance(name, str):\n return getattr(x, name)(*args, **kwargs)\n\n return x.apply(name, *args, 
**kwargs)\n\n return self._groupby.apply(f)"} {"instruction": "How would you explain what the following Python 3 function does\ndef _apply(self, func, name=None, window=None, center=None,\n check_minp=None, **kwargs):\n \"\"\"\n Rolling statistical measure using supplied function.\n\n Designed to be used with passed-in Cython array-based functions.\n\n Parameters\n ----------\n func : str/callable to apply\n name : str, optional\n name of this function\n window : int/array, default to _get_window()\n center : bool, default to self.center\n check_minp : function, default to _use_window\n\n Returns\n -------\n y : type of input\n \"\"\"\n if center is None:\n center = self.center\n if window is None:\n window = self._get_window()\n\n if check_minp is None:\n check_minp = _use_window\n\n blocks, obj, index = self._create_blocks()\n index, indexi = self._get_index(index=index)\n results = []\n for b in blocks:\n values = self._prep_values(b.values)\n\n if values.size == 0:\n results.append(values.copy())\n continue\n\n # if we have a string function name, wrap it\n if isinstance(func, str):\n cfunc = getattr(libwindow, func, None)\n if cfunc is None:\n raise ValueError(\"we do not support this function \"\n \"in libwindow.{func}\".format(func=func))\n\n def func(arg, window, min_periods=None, closed=None):\n minp = check_minp(min_periods, window)\n # ensure we are only rolling on floats\n arg = ensure_float64(arg)\n return cfunc(arg,\n window, minp, indexi, closed, **kwargs)\n\n # calculation function\n if center:\n offset = _offset(window, center)\n additional_nans = np.array([np.NaN] * offset)\n\n def calc(x):\n return func(np.concatenate((x, additional_nans)),\n window, min_periods=self.min_periods,\n closed=self.closed)\n else:\n\n def calc(x):\n return func(x, window, min_periods=self.min_periods,\n closed=self.closed)\n\n with np.errstate(all='ignore'):\n if values.ndim > 1:\n result = np.apply_along_axis(calc, self.axis, values)\n else:\n result = 
calc(values)\n\n if center:\n result = self._center_window(result, window)\n\n results.append(result)\n\n return self._wrap_results(results, blocks, obj)", "response": "Applies a function to the internal state of the object."} {"instruction": "How would you explain what the following Python 3 function does\ndef _validate_freq(self):\n from pandas.tseries.frequencies import to_offset\n try:\n return to_offset(self.window)\n except (TypeError, ValueError):\n raise ValueError(\"passed window {0} is not \"\n \"compatible with a datetimelike \"\n \"index\".format(self.window))", "response": "Validate & return window frequency."} {"instruction": "Write a Python 3 function for\ngetting the window length over which to perform some operation.", "response": "def _get_window(self, other=None):\n \"\"\"\n Get the window length over which to perform some operation.\n\n Parameters\n ----------\n other : object, default None\n The other object that is involved in the operation.\n Such an object is involved for operations like covariance.\n\n Returns\n -------\n window : int\n The window length.\n \"\"\"\n axis = self.obj._get_axis(self.axis)\n length = len(axis) + (other is not None) * len(axis)\n\n other = self.min_periods or -1\n return max(length, other)"} {"instruction": "Can you implement a function in Python 3 that\napplies a function to the current object and return the result.", "response": "def _apply(self, func, **kwargs):\n \"\"\"\n Rolling statistical measure using supplied function. 
Designed to be\n used with passed-in Cython array-based functions.\n\n Parameters\n ----------\n func : str/callable to apply\n\n Returns\n -------\n y : same type as input argument\n \"\"\"\n blocks, obj, index = self._create_blocks()\n results = []\n for b in blocks:\n try:\n values = self._prep_values(b.values)\n except TypeError:\n results.append(b.values.copy())\n continue\n\n if values.size == 0:\n results.append(values.copy())\n continue\n\n # if we have a string function name, wrap it\n if isinstance(func, str):\n cfunc = getattr(libwindow, func, None)\n if cfunc is None:\n raise ValueError(\"we do not support this function \"\n \"in libwindow.{func}\".format(func=func))\n\n def func(arg):\n return cfunc(arg, self.com, int(self.adjust),\n int(self.ignore_na), int(self.min_periods))\n\n results.append(np.apply_along_axis(func, self.axis, values))\n\n return self._wrap_results(results, blocks, obj)"} {"instruction": "How would you explain what the following Python 3 function does\ndef mean(self, *args, **kwargs):\n nv.validate_window_func('mean', args, kwargs)\n return self._apply('ewma', **kwargs)", "response": "Mean of the set of items in the logarithmic time series."} {"instruction": "Explain what the following Python 3 code does\ndef std(self, bias=False, *args, **kwargs):\n nv.validate_window_func('std', args, kwargs)\n return _zsqrt(self.var(bias=bias, **kwargs))", "response": "Exponential weighted moving stddev."} {"instruction": "How would you explain what the following Python 3 function does\ndef var(self, bias=False, *args, **kwargs):\n nv.validate_window_func('var', args, kwargs)\n\n def f(arg):\n return libwindow.ewmcov(arg, arg, self.com, int(self.adjust),\n int(self.ignore_na), int(self.min_periods),\n int(bias))\n\n return self._apply(f, **kwargs)", "response": "Exponential weighted moving variance."} {"instruction": "Create a Python 3 function for\nmaking sure that time and panels are conformable.", "response": "def _ensure_like_indices(time, 
panels):\n \"\"\"\n Makes sure that time and panels are conformable.\n \"\"\"\n n_time = len(time)\n n_panel = len(panels)\n u_panels = np.unique(panels) # this sorts!\n u_time = np.unique(time)\n if len(u_time) == n_time:\n time = np.tile(u_time, len(u_panels))\n if len(u_panels) == n_panel:\n panels = np.repeat(u_panels, len(u_time))\n return time, panels"} {"instruction": "Implement a function in Python 3 to\nreturn a MultiIndex suitable for a panel - like DataFrame.", "response": "def panel_index(time, panels, names=None):\n \"\"\"\n Returns a multi-index suitable for a panel-like DataFrame.\n\n Parameters\n ----------\n time : array-like\n Time index, does not have to repeat\n panels : array-like\n Panel index, does not have to repeat\n names : list, optional\n List containing the names of the indices\n\n Returns\n -------\n multi_index : MultiIndex\n Time index is the first level, the panels are the second level.\n\n Examples\n --------\n >>> years = range(1960,1963)\n >>> panels = ['A', 'B', 'C']\n >>> panel_idx = panel_index(years, panels)\n >>> panel_idx\n MultiIndex([(1960, 'A'), (1961, 'A'), (1962, 'A'), (1960, 'B'),\n (1961, 'B'), (1962, 'B'), (1960, 'C'), (1961, 'C'),\n (1962, 'C')], dtype=object)\n\n or\n\n >>> years = np.repeat(range(1960,1963), 3)\n >>> panels = np.tile(['A', 'B', 'C'], 3)\n >>> panel_idx = panel_index(years, panels)\n >>> panel_idx\n MultiIndex([(1960, 'A'), (1960, 'B'), (1960, 'C'), (1961, 'A'),\n (1961, 'B'), (1961, 'C'), (1962, 'A'), (1962, 'B'),\n (1962, 'C')], dtype=object)\n \"\"\"\n if names is None:\n names = ['time', 'panel']\n time, panels = _ensure_like_indices(time, panels)\n return MultiIndex.from_arrays([time, panels], sortorder=None, names=names)"} {"instruction": "Make a summary of the following Python 3 code\ndef _init_data(self, data, copy, dtype, **kwargs):\n if data is None:\n data = {}\n if dtype is not None:\n dtype = self._validate_dtype(dtype)\n\n passed_axes = [kwargs.pop(a, None) for a in 
self._AXIS_ORDERS]\n\n if kwargs:\n raise TypeError('_init_data() got an unexpected keyword '\n 'argument \"{0}\"'.format(list(kwargs.keys())[0]))\n\n axes = None\n if isinstance(data, BlockManager):\n if com._any_not_none(*passed_axes):\n axes = [x if x is not None else y\n for x, y in zip(passed_axes, data.axes)]\n mgr = data\n elif isinstance(data, dict):\n mgr = self._init_dict(data, passed_axes, dtype=dtype)\n copy = False\n dtype = None\n elif isinstance(data, (np.ndarray, list)):\n mgr = self._init_matrix(data, passed_axes, dtype=dtype, copy=copy)\n copy = False\n dtype = None\n elif is_scalar(data) and com._all_not_none(*passed_axes):\n values = cast_scalar_to_array([len(x) for x in passed_axes],\n data, dtype=dtype)\n mgr = self._init_matrix(values, passed_axes, dtype=values.dtype,\n copy=False)\n copy = False\n else: # pragma: no cover\n raise ValueError('Panel constructor not properly called!')\n\n NDFrame.__init__(self, mgr, axes=axes, copy=copy, dtype=dtype)", "response": "Generate ND initialization for the specified data."} {"instruction": "Explain what the following Python 3 code does\ndef from_dict(cls, data, intersect=False, orient='items', dtype=None):\n from collections import defaultdict\n\n orient = orient.lower()\n if orient == 'minor':\n new_data = defaultdict(OrderedDict)\n for col, df in data.items():\n for item, s in df.items():\n new_data[item][col] = s\n data = new_data\n elif orient != 'items': # pragma: no cover\n raise ValueError('Orientation must be one of {items, minor}.')\n\n d = cls._homogenize_dict(cls, data, intersect=intersect, dtype=dtype)\n ks = list(d['data'].keys())\n if not isinstance(d['data'], OrderedDict):\n ks = list(sorted(ks))\n d[cls._info_axis_name] = Index(ks)\n return cls(**d)", "response": "Construct a new Panel object from a dict of DataFrame objects."} {"instruction": "Make a summary of the following Python 3 code\ndef _get_plane_axes_index(self, axis):\n axis_name = self._get_axis_name(axis)\n\n if axis_name 
== 'major_axis':\n index = 'minor_axis'\n columns = 'items'\n if axis_name == 'minor_axis':\n index = 'major_axis'\n columns = 'items'\n elif axis_name == 'items':\n index = 'major_axis'\n columns = 'minor_axis'\n\n return index, columns", "response": "Get my plane axes indexes"} {"instruction": "How would you implement a function in Python 3 that\ngets my plane axes indexes", "response": "def _get_plane_axes(self, axis):\n \"\"\"\n Get my plane axes indexes: these are already\n (as compared with higher level planes),\n as we are returning a DataFrame axes.\n \"\"\"\n return [self._get_axis(axi)\n for axi in self._get_plane_axes_index(axis)]"} {"instruction": "Explain what the following Python 3 code does\ndef to_excel(self, path, na_rep='', engine=None, **kwargs):\n from pandas.io.excel import ExcelWriter\n\n if isinstance(path, str):\n writer = ExcelWriter(path, engine=engine)\n else:\n writer = path\n kwargs['na_rep'] = na_rep\n\n for item, df in self.iteritems():\n name = str(item)\n df.to_excel(writer, name, **kwargs)\n writer.save()", "response": "Writes each DataFrame in Panel to an Excel file."} {"instruction": "Explain what the following Python 3 code does\ndef get_value(self, *args, **kwargs):\n warnings.warn(\"get_value is deprecated and will be removed \"\n \"in a future release. Please use \"\n \".at[] or .iat[] accessors instead\", FutureWarning,\n stacklevel=2)\n return self._get_value(*args, **kwargs)", "response": "Quickly retrieve a single value at a specific location."} {"instruction": "Implement a function in Python 3 to\nconform input DataFrame to align with chosen axis pair.", "response": "def conform(self, frame, axis='items'):\n \"\"\"\n Conform input DataFrame to align with chosen axis pair.\n\n Parameters\n ----------\n frame : DataFrame\n axis : {'items', 'major', 'minor'}\n\n Axis the input corresponds to. 
E.g., if axis='major', then\n the frame's columns would be items, and the index would be\n values of the minor axis\n\n Returns\n -------\n DataFrame\n \"\"\"\n axes = self._get_plane_axes(axis)\n return frame.reindex(**self._extract_axes_for_slice(self, axes))"} {"instruction": "Write a Python 3 function that can\nround each value in Panel to a specified number of decimal places.", "response": "def round(self, decimals=0, *args, **kwargs):\n \"\"\"\n Round each value in Panel to a specified number of decimal places.\n\n .. versionadded:: 0.18.0\n\n Parameters\n ----------\n decimals : int\n Number of decimal places to round to (default: 0).\n If decimals is negative, it specifies the number of\n positions to the left of the decimal point.\n\n Returns\n -------\n Panel object\n\n See Also\n --------\n numpy.around\n \"\"\"\n nv.validate_round(args, kwargs)\n\n if is_integer(decimals):\n result = np.apply_along_axis(np.round, 0, self.values)\n return self._wrap_result(result, axis=0)\n raise TypeError(\"decimals must be an integer\")"} {"instruction": "How would you explain what the following Python 3 function does\ndef dropna(self, axis=0, how='any', inplace=False):\n axis = self._get_axis_number(axis)\n\n values = self.values\n mask = notna(values)\n\n for ax in reversed(sorted(set(range(self._AXIS_LEN)) - {axis})):\n mask = mask.sum(ax)\n\n per_slice = np.prod(values.shape[:axis] + values.shape[axis + 1:])\n\n if how == 'all':\n cond = mask > 0\n else:\n cond = mask == per_slice\n\n new_ax = self._get_axis(axis)[cond]\n result = self.reindex_axis(new_ax, axis=axis)\n if inplace:\n self._update_inplace(result)\n else:\n return result", "response": "Drop 2D from Panel along passed axis constant."} {"instruction": "Explain what the following Python 3 code does\ndef xs(self, key, axis=1):\n axis = self._get_axis_number(axis)\n if axis == 0:\n return self[key]\n\n self._consolidate_inplace()\n axis_number = self._get_axis_number(axis)\n new_data = self._data.xs(key, 
axis=axis_number, copy=False)\n result = self._construct_return_type(new_data)\n copy = new_data.is_mixed_type\n result._set_is_copy(self, copy=copy)\n return result", "response": "Return slice of panel along selected axis."} {"instruction": "Write a Python 3 function for\nreturning a new Series with i - th element removed.", "response": "def _ixs(self, i, axis=0):\n \"\"\"\n Parameters\n ----------\n i : int, slice, or sequence of integers\n axis : int\n \"\"\"\n\n ax = self._get_axis(axis)\n key = ax[i]\n\n # xs cannot handle a non-scalar key, so just reindex here\n # if we have a multi-index and a single tuple, then its a reduction\n # (GH 7516)\n if not (isinstance(ax, MultiIndex) and isinstance(key, tuple)):\n if is_list_like(key):\n indexer = {self._get_axis_name(axis): key}\n return self.reindex(**indexer)\n\n # a reduction\n if axis == 0:\n values = self._data.iget(i)\n return self._box_item_values(key, values)\n\n # xs by position\n self._consolidate_inplace()\n new_data = self._data.xs(i, axis=axis, copy=True, takeable=True)\n return self._construct_return_type(new_data)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef to_frame(self, filter_observations=True):\n _, N, K = self.shape\n\n if filter_observations:\n # shaped like the return DataFrame\n mask = notna(self.values).all(axis=0)\n # size = mask.sum()\n selector = mask.ravel()\n else:\n # size = N * K\n selector = slice(None, None)\n\n data = {item: self[item].values.ravel()[selector]\n for item in self.items}\n\n def construct_multi_parts(idx, n_repeat, n_shuffle=1):\n # Replicates and shuffles MultiIndex, returns individual attributes\n codes = [np.repeat(x, n_repeat) for x in idx.codes]\n # Assumes that each label is divisible by n_shuffle\n codes = [x.reshape(n_shuffle, -1).ravel(order='F')\n for x in codes]\n codes = [x[selector] for x in codes]\n levels = idx.levels\n names = idx.names\n return codes, levels, names\n\n def construct_index_parts(idx, 
major=True):\n levels = [idx]\n if major:\n codes = [np.arange(N).repeat(K)[selector]]\n names = idx.name or 'major'\n else:\n codes = np.arange(K).reshape(1, K)[np.zeros(N, dtype=int)]\n codes = [codes.ravel()[selector]]\n names = idx.name or 'minor'\n names = [names]\n return codes, levels, names\n\n if isinstance(self.major_axis, MultiIndex):\n major_codes, major_levels, major_names = construct_multi_parts(\n self.major_axis, n_repeat=K)\n else:\n major_codes, major_levels, major_names = construct_index_parts(\n self.major_axis)\n\n if isinstance(self.minor_axis, MultiIndex):\n minor_codes, minor_levels, minor_names = construct_multi_parts(\n self.minor_axis, n_repeat=N, n_shuffle=K)\n else:\n minor_codes, minor_levels, minor_names = construct_index_parts(\n self.minor_axis, major=False)\n\n levels = major_levels + minor_levels\n codes = major_codes + minor_codes\n names = major_names + minor_names\n\n index = MultiIndex(levels=levels, codes=codes, names=names,\n verify_integrity=False)\n\n return DataFrame(data, index=index, columns=self.items)", "response": "Transform wide format into long stacked format as DataFrame whose columns are the Panel s items levels and names."} {"instruction": "How would you implement a function in Python 3 that\napplies a function to each element of the Panel along a specific axis.", "response": "def apply(self, func, axis='major', **kwargs):\n \"\"\"\n Apply function along axis (or axes) of the Panel.\n\n Parameters\n ----------\n func : function\n Function to apply to each combination of 'other' axes\n e.g. 
if axis = 'items', the combination of major_axis/minor_axis\n will each be passed as a Series; if axis = ('items', 'major'),\n DataFrames of items & major axis will be passed\n axis : {'items', 'minor', 'major'}, or {0, 1, 2}, or a tuple with two\n axes\n **kwargs\n Additional keyword arguments will be passed to the function.\n\n Returns\n -------\n result : Panel, DataFrame, or Series\n\n Examples\n --------\n\n Returns a Panel with the square root of each element\n\n >>> p = pd.Panel(np.random.rand(4, 3, 2)) # doctest: +SKIP\n >>> p.apply(np.sqrt)\n\n Equivalent to p.sum(1), returning a DataFrame\n\n >>> p.apply(lambda x: x.sum(), axis=1) # doctest: +SKIP\n\n Equivalent to previous:\n\n >>> p.apply(lambda x: x.sum(), axis='major') # doctest: +SKIP\n\n Return the shapes of each DataFrame over axis 2 (i.e the shapes of\n items x major), as a Series\n\n >>> p.apply(lambda x: x.shape, axis=(0,1)) # doctest: +SKIP\n \"\"\"\n\n if kwargs and not isinstance(func, np.ufunc):\n f = lambda x: func(x, **kwargs)\n else:\n f = func\n\n # 2d-slabs\n if isinstance(axis, (tuple, list)) and len(axis) == 2:\n return self._apply_2d(f, axis=axis)\n\n axis = self._get_axis_number(axis)\n\n # try ufunc like\n if isinstance(f, np.ufunc):\n try:\n with np.errstate(all='ignore'):\n result = np.apply_along_axis(func, axis, self.values)\n return self._wrap_result(result, axis=axis)\n except (AttributeError):\n pass\n\n # 1d\n return self._apply_1d(f, axis=axis)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _apply_2d(self, func, axis):\n ndim = self.ndim\n axis = [self._get_axis_number(a) for a in axis]\n\n # construct slabs, in 2-d this is a DataFrame result\n indexer_axis = list(range(ndim))\n for a in axis:\n indexer_axis.remove(a)\n indexer_axis = indexer_axis[0]\n\n slicer = [slice(None, None)] * ndim\n ax = self._get_axis(indexer_axis)\n\n results = []\n for i, e in enumerate(ax):\n slicer[indexer_axis] = i\n sliced = self.iloc[tuple(slicer)]\n\n 
obj = func(sliced)\n results.append((e, obj))\n\n return self._construct_return_type(dict(results))", "response": "Apply func to each entry in the specified axis."} {"instruction": "Can you write a function in Python 3 where it\nconstructs the type of the result.", "response": "def _construct_return_type(self, result, axes=None):\n \"\"\"\n Return the type for the ndim of the result.\n \"\"\"\n ndim = getattr(result, 'ndim', None)\n\n # need to assume they are the same\n if ndim is None:\n if isinstance(result, dict):\n ndim = getattr(list(result.values())[0], 'ndim', 0)\n\n # have a dict, so top-level is +1 dim\n if ndim != 0:\n ndim += 1\n\n # scalar\n if ndim == 0:\n return Series(result)\n\n # same as self\n elif self.ndim == ndim:\n # return the construction dictionary for these axes\n if axes is None:\n return self._constructor(result)\n return self._constructor(result, **self._construct_axes_dict())\n\n # sliced\n elif self.ndim == ndim + 1:\n if axes is None:\n return self._constructor_sliced(result)\n return self._constructor_sliced(\n result, **self._extract_axes_for_slice(self, axes))\n\n raise ValueError('invalid _construct_return_type [self->{self}] '\n '[result->{result}]'.format(self=self, result=result))"} {"instruction": "Write a Python 3 script for\nreturning the number of observations over requested axis.", "response": "def count(self, axis='major'):\n \"\"\"\n Return number of observations over requested axis.\n\n Parameters\n ----------\n axis : {'items', 'major', 'minor'} or {0, 1, 2}\n\n Returns\n -------\n count : DataFrame\n \"\"\"\n i = self._get_axis_number(axis)\n\n values = self.values\n mask = np.isfinite(values)\n result = mask.sum(axis=i, dtype='int64')\n\n return self._wrap_result(result, axis)"} {"instruction": "Explain what the following Python 3 code does\ndef shift(self, periods=1, freq=None, axis='major'):\n if freq:\n return self.tshift(periods, freq, axis=axis)\n\n return super().slice_shift(periods, axis=axis)", "response": 
"Shift the DataFrame by desired number of periods with an optional time freq."} {"instruction": "Implement a function in Python 3 to\njoin two Panels together.", "response": "def join(self, other, how='left', lsuffix='', rsuffix=''):\n \"\"\"\n Join items with other Panel either on major and minor axes column.\n\n Parameters\n ----------\n other : Panel or list of Panels\n Index should be similar to one of the columns in this one\n how : {'left', 'right', 'outer', 'inner'}\n How to handle indexes of the two objects. Default: 'left'\n for joining on index, None otherwise\n * left: use calling frame's index\n * right: use input frame's index\n * outer: form union of indexes\n * inner: use intersection of indexes\n lsuffix : string\n Suffix to use from left frame's overlapping columns\n rsuffix : string\n Suffix to use from right frame's overlapping columns\n\n Returns\n -------\n joined : Panel\n \"\"\"\n from pandas.core.reshape.concat import concat\n\n if isinstance(other, Panel):\n join_major, join_minor = self._get_join_index(other, how)\n this = self.reindex(major=join_major, minor=join_minor)\n other = other.reindex(major=join_major, minor=join_minor)\n merged_data = this._data.merge(other._data, lsuffix, rsuffix)\n return self._constructor(merged_data)\n else:\n if lsuffix or rsuffix:\n raise ValueError('Suffixes not supported when passing '\n 'multiple panels')\n\n if how == 'left':\n how = 'outer'\n join_axes = [self.major_axis, self.minor_axis]\n elif how == 'right':\n raise ValueError('Right join not supported with multiple '\n 'panels')\n else:\n join_axes = None\n\n return concat([self] + list(other), axis=0, join=how,\n join_axes=join_axes, verify_integrity=True)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef update(self, other, join='left', overwrite=True, filter_func=None,\n errors='ignore'):\n \"\"\"\n Modify Panel in place using non-NA values from other Panel.\n\n May also use object coercible to Panel. 
Will align on items.\n\n Parameters\n ----------\n other : Panel, or object coercible to Panel\n The object from which the caller will be udpated.\n join : {'left', 'right', 'outer', 'inner'}, default 'left'\n How individual DataFrames are joined.\n overwrite : bool, default True\n If True then overwrite values for common keys in the calling Panel.\n filter_func : callable(1d-array) -> 1d-array, default None\n Can choose to replace values other than NA. Return True for values\n that should be updated.\n errors : {'raise', 'ignore'}, default 'ignore'\n If 'raise', will raise an error if a DataFrame and other both.\n\n .. versionchanged :: 0.24.0\n Changed from `raise_conflict=False|True`\n to `errors='ignore'|'raise'`.\n\n See Also\n --------\n DataFrame.update : Similar method for DataFrames.\n dict.update : Similar method for dictionaries.\n \"\"\"\n\n if not isinstance(other, self._constructor):\n other = self._constructor(other)\n\n axis_name = self._info_axis_name\n axis_values = self._info_axis\n other = other.reindex(**{axis_name: axis_values})\n\n for frame in axis_values:\n self[frame].update(other[frame], join=join, overwrite=overwrite,\n filter_func=filter_func, errors=errors)", "response": "Modify Panel in place using non - NA values from other Panel."} {"instruction": "Explain what the following Python 3 code does\ndef _extract_axes(self, data, axes, **kwargs):\n return [self._extract_axis(self, data, axis=i, **kwargs)\n for i, a in enumerate(axes)]", "response": "Extract the axes from the data."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _extract_axes_for_slice(self, axes):\n return {self._AXIS_SLICEMAP[i]: a for i, a in\n zip(self._AXIS_ORDERS[self._AXIS_LEN - len(axes):], axes)}", "response": "Extract the axes for the slice dictionary."} {"instruction": "Implement a Python 3 function for\nconverting a dict of _constructor_sliced - like objects to a dictionary of aligned results and indices.", "response": "def 
_homogenize_dict(self, frames, intersect=True, dtype=None):\n \"\"\"\n Conform set of _constructor_sliced-like objects to either\n an intersection of indices / columns or a union.\n\n Parameters\n ----------\n frames : dict\n intersect : boolean, default True\n\n Returns\n -------\n dict of aligned results & indices\n \"\"\"\n\n result = dict()\n # caller differs dict/ODict, preserved type\n if isinstance(frames, OrderedDict):\n result = OrderedDict()\n\n adj_frames = OrderedDict()\n for k, v in frames.items():\n if isinstance(v, dict):\n adj_frames[k] = self._constructor_sliced(v)\n else:\n adj_frames[k] = v\n\n axes = self._AXIS_ORDERS[1:]\n axes_dict = {a: ax for a, ax in zip(axes, self._extract_axes(\n self, adj_frames, axes, intersect=intersect))}\n\n reindex_dict = {self._AXIS_SLICEMAP[a]: axes_dict[a] for a in axes}\n reindex_dict['copy'] = False\n for key, frame in adj_frames.items():\n if frame is not None:\n result[key] = frame.reindex(**reindex_dict)\n else:\n result[key] = None\n\n axes_dict['data'] = result\n axes_dict['dtype'] = dtype\n return axes_dict"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef get_group_index(labels, shape, sort, xnull):\n def _int64_cut_off(shape):\n acc = 1\n for i, mul in enumerate(shape):\n acc *= int(mul)\n if not acc < _INT64_MAX:\n return i\n return len(shape)\n\n def maybe_lift(lab, size):\n # promote nan values (assigned -1 label in lab array)\n # so that all output values are non-negative\n return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)\n\n labels = map(ensure_int64, labels)\n if not xnull:\n labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))\n\n labels = list(labels)\n shape = list(shape)\n\n # Iteratively process all the labels in chunks sized so less\n # than _INT64_MAX unique int ids will be required for each chunk\n while True:\n # how many levels can be done without overflow:\n nlev = _int64_cut_off(shape)\n\n # compute flat ids for the first 
`nlev` levels\n stride = np.prod(shape[1:nlev], dtype='i8')\n out = stride * labels[0].astype('i8', subok=False, copy=False)\n\n for i in range(1, nlev):\n if shape[i] == 0:\n stride = 0\n else:\n stride //= shape[i]\n out += labels[i] * stride\n\n if xnull: # exclude nulls\n mask = labels[0] == -1\n for lab in labels[1:nlev]:\n mask |= lab == -1\n out[mask] = -1\n\n if nlev == len(shape): # all levels done!\n break\n\n # compress what has been done so far in order to avoid overflow\n # to retain lexical ranks, obs_ids should be sorted\n comp_ids, obs_ids = compress_group_index(out, sort=sort)\n\n labels = [comp_ids] + labels[nlev:]\n shape = [len(obs_ids)] + shape[nlev:]\n\n return out", "response": "Returns the index of the unique items in the hypothetical list that are in the given label_list."} {"instruction": "Write a Python 3 function that can\nreconstruct the observed group ids from the observed group ids shape and labels", "response": "def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull):\n \"\"\"\n reconstruct labels from observed group ids\n\n Parameters\n ----------\n xnull: boolean,\n if nulls are excluded; i.e. -1 labels are passed through\n \"\"\"\n\n if not xnull:\n lift = np.fromiter(((a == -1).any() for a in labels), dtype='i8')\n shape = np.asarray(shape, dtype='i8') + lift\n\n if not is_int64_overflow_possible(shape):\n # obs ids are deconstructable! 
take the fast route!\n out = decons_group_index(obs_ids, shape)\n return out if xnull or not lift.any() \\\n else [x - y for x, y in zip(out, lift)]\n\n i = unique_label_indices(comp_ids)\n i8copy = lambda a: a.astype('i8', subok=False, copy=True)\n return [i8copy(lab[i]) for lab in labels]"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef nargsort(items, kind='quicksort', ascending=True, na_position='last'):\n\n # specially handle Categorical\n if is_categorical_dtype(items):\n if na_position not in {'first', 'last'}:\n raise ValueError('invalid na_position: {!r}'.format(na_position))\n\n mask = isna(items)\n cnt_null = mask.sum()\n sorted_idx = items.argsort(ascending=ascending, kind=kind)\n if ascending and na_position == 'last':\n # NaN is coded as -1 and is listed in front after sorting\n sorted_idx = np.roll(sorted_idx, -cnt_null)\n elif not ascending and na_position == 'first':\n # NaN is coded as -1 and is listed in the end after sorting\n sorted_idx = np.roll(sorted_idx, cnt_null)\n return sorted_idx\n\n with warnings.catch_warnings():\n # https://github.com/pandas-dev/pandas/issues/25439\n # can be removed once ExtensionArrays are properly handled by nargsort\n warnings.filterwarnings(\n \"ignore\", category=FutureWarning,\n message=\"Converting timezone-aware DatetimeArray to\")\n items = np.asanyarray(items)\n idx = np.arange(len(items))\n mask = isna(items)\n non_nans = items[~mask]\n non_nan_idx = idx[~mask]\n nan_idx = np.nonzero(mask)[0]\n if not ascending:\n non_nans = non_nans[::-1]\n non_nan_idx = non_nan_idx[::-1]\n indexer = non_nan_idx[non_nans.argsort(kind=kind)]\n if not ascending:\n indexer = indexer[::-1]\n # Finally, place the NaNs at the end or the beginning according to\n # na_position\n if na_position == 'last':\n indexer = np.concatenate([indexer, nan_idx])\n elif na_position == 'first':\n indexer = np.concatenate([nan_idx, indexer])\n else:\n raise ValueError('invalid na_position: 
{!r}'.format(na_position))\n return indexer", "response": "Sort the items by the given kind."} {"instruction": "Create a Python 3 function for\nreturning a diction of labels -> indexers", "response": "def get_indexer_dict(label_list, keys):\n \"\"\" return a diction of {labels} -> {indexers} \"\"\"\n shape = list(map(len, keys))\n\n group_index = get_group_index(label_list, shape, sort=True, xnull=True)\n ngroups = ((group_index.size and group_index.max()) + 1) \\\n if is_int64_overflow_possible(shape) \\\n else np.prod(shape, dtype='i8')\n\n sorter = get_group_index_sorter(group_index, ngroups)\n\n sorted_labels = [lab.take(sorter) for lab in label_list]\n group_index = group_index.take(sorter)\n\n return lib.indices_fast(sorter, group_index, keys, sorted_labels)"} {"instruction": "Implement a function in Python 3 to\nsort the given values and reorder corresponding labels.", "response": "def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False):\n \"\"\"\n Sort ``values`` and reorder corresponding ``labels``.\n ``values`` should be unique if ``labels`` is not None.\n Safe for use with mixed types (int, str), orders ints before strs.\n\n .. versionadded:: 0.19.0\n\n Parameters\n ----------\n values : list-like\n Sequence; must be unique if ``labels`` is not None.\n labels : list_like\n Indices to ``values``. All out of bound indices are treated as\n \"not found\" and will be masked with ``na_sentinel``.\n na_sentinel : int, default -1\n Value in ``labels`` to mark \"not found\".\n Ignored when ``labels`` is None.\n assume_unique : bool, default False\n When True, ``values`` are assumed to be unique, which can speed up\n the calculation. 
Ignored when ``labels`` is None.\n\n Returns\n -------\n ordered : ndarray\n Sorted ``values``\n new_labels : ndarray\n Reordered ``labels``; returned when ``labels`` is not None.\n\n Raises\n ------\n TypeError\n * If ``values`` is not list-like or if ``labels`` is neither None\n nor list-like\n * If ``values`` cannot be sorted\n ValueError\n * If ``labels`` is not None and ``values`` contain duplicates.\n \"\"\"\n if not is_list_like(values):\n raise TypeError(\"Only list-like objects are allowed to be passed to\"\n \"safe_sort as values\")\n\n if not isinstance(values, np.ndarray):\n\n # don't convert to string types\n dtype, _ = infer_dtype_from_array(values)\n values = np.asarray(values, dtype=dtype)\n\n def sort_mixed(values):\n # order ints before strings, safe in py3\n str_pos = np.array([isinstance(x, str) for x in values],\n dtype=bool)\n nums = np.sort(values[~str_pos])\n strs = np.sort(values[str_pos])\n return np.concatenate([nums, np.asarray(strs, dtype=object)])\n\n sorter = None\n if lib.infer_dtype(values, skipna=False) == 'mixed-integer':\n # unorderable in py3 if mixed str/int\n ordered = sort_mixed(values)\n else:\n try:\n sorter = values.argsort()\n ordered = values.take(sorter)\n except TypeError:\n # try this anyway\n ordered = sort_mixed(values)\n\n # labels:\n\n if labels is None:\n return ordered\n\n if not is_list_like(labels):\n raise TypeError(\"Only list-like objects or None are allowed to be\"\n \"passed to safe_sort as labels\")\n labels = ensure_platform_int(np.asarray(labels))\n\n from pandas import Index\n if not assume_unique and not Index(values).is_unique:\n raise ValueError(\"values should be unique if labels is not None\")\n\n if sorter is None:\n # mixed types\n (hash_klass, _), values = algorithms._get_data_algo(\n values, algorithms._hashtables)\n t = hash_klass(len(values))\n t.map_locations(values)\n sorter = ensure_platform_int(t.lookup(ordered))\n\n reverse_indexer = np.empty(len(sorter), dtype=np.int_)\n 
reverse_indexer.put(sorter, np.arange(len(sorter)))\n\n mask = (labels < -len(values)) | (labels >= len(values)) | \\\n (labels == na_sentinel)\n\n # (Out of bound indices will be masked with `na_sentinel` next, so we may\n # deal with them here without performance loss using `mode='wrap'`.)\n new_labels = reverse_indexer.take(labels, mode='wrap')\n np.putmask(new_labels, mask, na_sentinel)\n\n return ordered, ensure_platform_int(new_labels)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _check_ne_builtin_clash(expr):\n names = expr.names\n overlap = names & _ne_builtins\n\n if overlap:\n s = ', '.join(map(repr, overlap))\n raise NumExprClobberingError('Variables in expression \"{expr}\" '\n 'overlap with builtins: ({s})'\n .format(expr=expr, s=s))", "response": "Attempt to prevent foot - shooting in a helpful way."} {"instruction": "Explain what the following Python 3 code does\ndef evaluate(self):\n if not self._is_aligned:\n self.result_type, self.aligned_axes = _align(self.expr.terms)\n\n # make sure no names in resolvers and locals/globals clash\n res = self._evaluate()\n return _reconstruct_object(self.result_type, res, self.aligned_axes,\n self.expr.terms.return_type)", "response": "Run the engine on the expression\n "} {"instruction": "Can you tell what is the following Python 3 function doing\ndef get_block_type(values, dtype=None):\n dtype = dtype or values.dtype\n vtype = dtype.type\n\n if is_sparse(dtype):\n # Need this first(ish) so that Sparse[datetime] is sparse\n cls = ExtensionBlock\n elif is_categorical(values):\n cls = CategoricalBlock\n elif issubclass(vtype, np.datetime64):\n assert not is_datetime64tz_dtype(values)\n cls = DatetimeBlock\n elif is_datetime64tz_dtype(values):\n cls = DatetimeTZBlock\n elif is_interval_dtype(dtype) or is_period_dtype(dtype):\n cls = ObjectValuesExtensionBlock\n elif is_extension_array_dtype(values):\n cls = ExtensionBlock\n elif issubclass(vtype, np.floating):\n cls = 
FloatBlock\n elif issubclass(vtype, np.timedelta64):\n assert issubclass(vtype, np.integer)\n cls = TimeDeltaBlock\n elif issubclass(vtype, np.complexfloating):\n cls = ComplexBlock\n elif issubclass(vtype, np.integer):\n cls = IntBlock\n elif dtype == np.bool_:\n cls = BoolBlock\n else:\n cls = ObjectBlock\n return cls", "response": "Find the appropriate Block subclass to use for the given values and dtype."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _extend_blocks(result, blocks=None):\n from pandas.core.internals import BlockManager\n if blocks is None:\n blocks = []\n if isinstance(result, list):\n for r in result:\n if isinstance(r, list):\n blocks.extend(r)\n else:\n blocks.append(r)\n elif isinstance(result, BlockManager):\n blocks.extend(result.blocks)\n else:\n blocks.append(result)\n return blocks", "response": "extend the blocks with the result"} {"instruction": "Explain what the following Python 3 code does\ndef _block_shape(values, ndim=1, shape=None):\n if values.ndim < ndim:\n if shape is None:\n shape = values.shape\n if not is_extension_array_dtype(values):\n # TODO: https://github.com/pandas-dev/pandas/issues/23023\n # block.shape is incorrect for \"2D\" ExtensionArrays\n # We can't, and don't need to, reshape.\n values = values.reshape(tuple((1, ) + shape))\n return values", "response": "ensure the shape of the block of data is at least 1 d"} {"instruction": "How would you explain what the following Python 3 function does\ndef _safe_reshape(arr, new_shape):\n if isinstance(arr, ABCSeries):\n arr = arr._values\n if not isinstance(arr, ABCExtensionArray):\n arr = arr.reshape(new_shape)\n return arr", "response": "Reshapes an array to have the specified shape."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _putmask_smart(v, m, n):\n\n # we cannot use np.asarray() here as we cannot have conversions\n # that numpy does when numeric are mixed with strings\n\n # n 
should be the length of the mask or a scalar here\n if not is_list_like(n):\n n = np.repeat(n, len(m))\n elif isinstance(n, np.ndarray) and n.ndim == 0: # numpy scalar\n n = np.repeat(np.array(n, ndmin=1), len(m))\n\n # see if we are only masking values that if putted\n # will work in the current dtype\n try:\n nn = n[m]\n\n # make sure that we have a nullable type\n # if we have nulls\n if not _isna_compat(v, nn[0]):\n raise ValueError\n\n # we ignore ComplexWarning here\n with warnings.catch_warnings(record=True):\n warnings.simplefilter(\"ignore\", np.ComplexWarning)\n nn_at = nn.astype(v.dtype)\n\n # avoid invalid dtype comparisons\n # between numbers & strings\n\n # only compare integers/floats\n # don't compare integers to datetimelikes\n if (not is_numeric_v_string_like(nn, nn_at) and\n (is_float_dtype(nn.dtype) or\n is_integer_dtype(nn.dtype) and\n is_float_dtype(nn_at.dtype) or\n is_integer_dtype(nn_at.dtype))):\n\n comp = (nn == nn_at)\n if is_list_like(comp) and comp.all():\n nv = v.copy()\n nv[m] = nn_at\n return nv\n except (ValueError, IndexError, TypeError, OverflowError):\n pass\n\n n = np.asarray(n)\n\n def _putmask_preserve(nv, n):\n try:\n nv[m] = n[m]\n except (IndexError, ValueError):\n nv[m] = n\n return nv\n\n # preserves dtype if possible\n if v.dtype.kind == n.dtype.kind:\n return _putmask_preserve(v, n)\n\n # change the dtype if needed\n dtype, _ = maybe_promote(n.dtype)\n\n if is_extension_type(v.dtype) and is_object_dtype(dtype):\n v = v.get_values(dtype)\n else:\n v = v.astype(dtype)\n\n return _putmask_preserve(v, n)", "response": "Put masking the values in the array v with the mask m and n."} {"instruction": "Implement a function in Python 3 to\ncheck that the number of dimensions of the log entry is consistent with the number of entries in the array.", "response": "def _check_ndim(self, values, ndim):\n \"\"\"\n ndim inference and validation.\n\n Infers ndim from 'values' if not provided to __init__.\n Validates that values.ndim and 
ndim are consistent if and only if\n the class variable '_validate_ndim' is True.\n\n Parameters\n ----------\n values : array-like\n ndim : int or None\n\n Returns\n -------\n ndim : int\n\n Raises\n ------\n ValueError : the number of dimensions do not match\n \"\"\"\n if ndim is None:\n ndim = values.ndim\n\n if self._validate_ndim and values.ndim != ndim:\n msg = (\"Wrong number of dimensions. values.ndim != ndim \"\n \"[{} != {}]\")\n raise ValueError(msg.format(values.ndim, ndim))\n\n return ndim"} {"instruction": "Explain what the following Python 3 code does\ndef is_categorical_astype(self, dtype):\n if dtype is Categorical or dtype is CategoricalDtype:\n # this is a pd.Categorical, but is not\n # a valid type for astypeing\n raise TypeError(\"invalid type {0} for astype\".format(dtype))\n\n elif is_categorical_dtype(dtype):\n return True\n\n return False", "response": "validate that we have a astypeable to categorical return a boolean if we are a categorical"} {"instruction": "Make a summary of the following Python 3 code\ndef get_values(self, dtype=None):\n if is_object_dtype(dtype):\n return self.values.astype(object)\n return self.values", "response": "return an internal format of the data"} {"instruction": "Implement a function in Python 3 to\ncreate a new block with type inference propagate any values that are not specified", "response": "def make_block(self, values, placement=None, ndim=None):\n \"\"\"\n Create a new block, with type inference propagate any values that are\n not specified\n \"\"\"\n if placement is None:\n placement = self.mgr_locs\n if ndim is None:\n ndim = self.ndim\n\n return make_block(values, placement=placement, ndim=ndim)"} {"instruction": "Can you create a Python 3 function that\nwraps given values in a block of same type as self.", "response": "def make_block_same_class(self, values, placement=None, ndim=None,\n dtype=None):\n \"\"\" Wrap given values in a block of same type as self. 
\"\"\"\n if dtype is not None:\n # issue 19431 fastparquet is passing this\n warnings.warn(\"dtype argument is deprecated, will be removed \"\n \"in a future release.\", DeprecationWarning)\n if placement is None:\n placement = self.mgr_locs\n return make_block(values, placement=placement, ndim=ndim,\n klass=self.__class__, dtype=dtype)"} {"instruction": "Make a summary of the following Python 3 code\ndef getitem_block(self, slicer, new_mgr_locs=None):\n if new_mgr_locs is None:\n if isinstance(slicer, tuple):\n axis0_slicer = slicer[0]\n else:\n axis0_slicer = slicer\n new_mgr_locs = self.mgr_locs[axis0_slicer]\n\n new_values = self._slice(slicer)\n\n if self._validate_ndim and new_values.ndim != self.ndim:\n raise ValueError(\"Only same dim slicing is allowed\")\n\n return self.make_block_same_class(new_values, new_mgr_locs)", "response": "Perform __getitem__ - like return result as block."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef concat_same_type(self, to_concat, placement=None):\n values = self._concatenator([blk.values for blk in to_concat],\n axis=self.ndim - 1)\n return self.make_block_same_class(\n values, placement=placement or slice(0, len(values), 1))", "response": "Concatenate a list of single blocks of the same type."} {"instruction": "Make a summary of the following Python 3 code\ndef delete(self, loc):\n self.values = np.delete(self.values, loc, 0)\n self.mgr_locs = self.mgr_locs.delete(loc)", "response": "Delete given loc from block in - place."} {"instruction": "How would you explain what the following Python 3 function does\ndef apply(self, func, **kwargs):\n with np.errstate(all='ignore'):\n result = func(self.values, **kwargs)\n if not isinstance(result, Block):\n result = self.make_block(values=_block_shape(result,\n ndim=self.ndim))\n\n return result", "response": "apply the function to my values ; return a block if we are not one"} {"instruction": "Make a summary of the following Python 3 
code\ndef fillna(self, value, limit=None, inplace=False, downcast=None):\n inplace = validate_bool_kwarg(inplace, 'inplace')\n\n if not self._can_hold_na:\n if inplace:\n return self\n else:\n return self.copy()\n\n mask = isna(self.values)\n if limit is not None:\n if not is_integer(limit):\n raise ValueError('Limit must be an integer')\n if limit < 1:\n raise ValueError('Limit must be greater than 0')\n if self.ndim > 2:\n raise NotImplementedError(\"number of dimensions for 'fillna' \"\n \"is currently limited to 2\")\n mask[mask.cumsum(self.ndim - 1) > limit] = False\n\n # fillna, but if we cannot coerce, then try again as an ObjectBlock\n try:\n values, _ = self._try_coerce_args(self.values, value)\n blocks = self.putmask(mask, value, inplace=inplace)\n blocks = [b.make_block(values=self._try_coerce_result(b.values))\n for b in blocks]\n return self._maybe_downcast(blocks, downcast)\n except (TypeError, ValueError):\n\n # we can't process the value, but nothing to do\n if not mask.any():\n return self if inplace else self.copy()\n\n # operate column-by-column\n def f(m, v, i):\n block = self.coerce_to_target_dtype(value)\n\n # slice out our block\n if i is not None:\n block = block.getitem_block(slice(i, i + 1))\n return block.fillna(value,\n limit=limit,\n inplace=inplace,\n downcast=None)\n\n return self.split_and_operate(mask, f, inplace)", "response": "fillna on the object block with the value."} {"instruction": "How would you explain what the following Python 3 function does\ndef split_and_operate(self, mask, f, inplace):\n\n if mask is None:\n mask = np.ones(self.shape, dtype=bool)\n new_values = self.values\n\n def make_a_block(nv, ref_loc):\n if isinstance(nv, Block):\n block = nv\n elif isinstance(nv, list):\n block = nv[0]\n else:\n # Put back the dimension that was taken from it and make\n # a block out of the result.\n try:\n nv = _block_shape(nv, ndim=self.ndim)\n except (AttributeError, NotImplementedError):\n pass\n block = 
self.make_block(values=nv,\n placement=ref_loc)\n return block\n\n # ndim == 1\n if self.ndim == 1:\n if mask.any():\n nv = f(mask, new_values, None)\n else:\n nv = new_values if inplace else new_values.copy()\n block = make_a_block(nv, self.mgr_locs)\n return [block]\n\n # ndim > 1\n new_blocks = []\n for i, ref_loc in enumerate(self.mgr_locs):\n m = mask[i]\n v = new_values[i]\n\n # need a new block\n if m.any():\n nv = f(m, v, i)\n else:\n nv = v if inplace else v.copy()\n\n block = make_a_block(nv, [ref_loc])\n new_blocks.append(block)\n\n return new_blocks", "response": "split the block per - column and apply the callable fonction f on each entry. return a new list of blocks where each entry is a new entry."} {"instruction": "How would you implement a function in Python 3 that\ntries to downcast each item to the dict of dtypes if present", "response": "def downcast(self, dtypes=None):\n \"\"\" try to downcast each item to the dict of dtypes if present \"\"\"\n\n # turn it off completely\n if dtypes is False:\n return self\n\n values = self.values\n\n # single block handling\n if self._is_single_block:\n\n # try to cast all non-floats here\n if dtypes is None:\n dtypes = 'infer'\n\n nv = maybe_downcast_to_dtype(values, dtypes)\n return self.make_block(nv)\n\n # ndim > 1\n if dtypes is None:\n return self\n\n if not (dtypes == 'infer' or isinstance(dtypes, dict)):\n raise ValueError(\"downcast must have a dictionary or 'infer' as \"\n \"its argument\")\n\n # operate column-by-column\n # this is expensive as it splits the blocks items-by-item\n def f(m, v, i):\n\n if dtypes == 'infer':\n dtype = 'infer'\n else:\n raise AssertionError(\"dtypes as dict is not supported yet\")\n\n if dtype is not None:\n v = maybe_downcast_to_dtype(v, dtype)\n return v\n\n return self.split_and_operate(None, f, False)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _astype(self, dtype, copy=False, errors='raise', values=None,\n 
**kwargs):\n \"\"\"Coerce to the new type\n\n Parameters\n ----------\n dtype : str, dtype convertible\n copy : boolean, default False\n copy if indicated\n errors : str, {'raise', 'ignore'}, default 'ignore'\n - ``raise`` : allow exceptions to be raised\n - ``ignore`` : suppress exceptions. On error return original object\n\n Returns\n -------\n Block\n \"\"\"\n errors_legal_values = ('raise', 'ignore')\n\n if errors not in errors_legal_values:\n invalid_arg = (\"Expected value of kwarg 'errors' to be one of {}. \"\n \"Supplied value is '{}'\".format(\n list(errors_legal_values), errors))\n raise ValueError(invalid_arg)\n\n if (inspect.isclass(dtype) and\n issubclass(dtype, (PandasExtensionDtype, ExtensionDtype))):\n msg = (\"Expected an instance of {}, but got the class instead. \"\n \"Try instantiating 'dtype'.\".format(dtype.__name__))\n raise TypeError(msg)\n\n # may need to convert to categorical\n if self.is_categorical_astype(dtype):\n\n # deprecated 17636\n if ('categories' in kwargs or 'ordered' in kwargs):\n if isinstance(dtype, CategoricalDtype):\n raise TypeError(\n \"Cannot specify a CategoricalDtype and also \"\n \"`categories` or `ordered`. 
Use \"\n \"`dtype=CategoricalDtype(categories, ordered)`\"\n \" instead.\")\n warnings.warn(\"specifying 'categories' or 'ordered' in \"\n \".astype() is deprecated; pass a \"\n \"CategoricalDtype instead\",\n FutureWarning, stacklevel=7)\n\n categories = kwargs.get('categories', None)\n ordered = kwargs.get('ordered', None)\n if com._any_not_none(categories, ordered):\n dtype = CategoricalDtype(categories, ordered)\n\n if is_categorical_dtype(self.values):\n # GH 10696/18593: update an existing categorical efficiently\n return self.make_block(self.values.astype(dtype, copy=copy))\n\n return self.make_block(Categorical(self.values, dtype=dtype))\n\n dtype = pandas_dtype(dtype)\n\n # astype processing\n if is_dtype_equal(self.dtype, dtype):\n if copy:\n return self.copy()\n return self\n\n try:\n # force the copy here\n if values is None:\n\n if self.is_extension:\n values = self.values.astype(dtype)\n else:\n if issubclass(dtype.type, str):\n\n # use native type formatting for datetime/tz/timedelta\n if self.is_datelike:\n values = self.to_native_types()\n\n # astype formatting\n else:\n values = self.get_values()\n\n else:\n values = self.get_values(dtype=dtype)\n\n # _astype_nansafe works fine with 1-d only\n values = astype_nansafe(values.ravel(), dtype, copy=True)\n\n # TODO(extension)\n # should we make this attribute?\n try:\n values = values.reshape(self.shape)\n except AttributeError:\n pass\n\n newb = make_block(values, placement=self.mgr_locs,\n ndim=self.ndim)\n except Exception: # noqa: E722\n if errors == 'raise':\n raise\n newb = self.copy() if copy else self\n\n if newb.is_numeric and self.is_numeric:\n if newb.shape != self.shape:\n raise TypeError(\n \"cannot set astype for copy = [{copy}] for dtype \"\n \"({dtype} [{shape}]) to different shape \"\n \"({newb_dtype} [{newb_shape}])\".format(\n copy=copy, dtype=self.dtype.name,\n shape=self.shape, newb_dtype=newb.dtype.name,\n newb_shape=newb.shape))\n return newb", "response": "Coerce to the new 
type of the object."} {"instruction": "Create a Python 3 function to\ncheck if the given element can be hold.", "response": "def _can_hold_element(self, element):\n \"\"\" require the same dtype as ourselves \"\"\"\n dtype = self.values.dtype.type\n tipo = maybe_infer_dtype_type(element)\n if tipo is not None:\n return issubclass(tipo.type, dtype)\n return isinstance(element, dtype)"} {"instruction": "Implement a Python 3 function for\ntrying to cast the result to our original type", "response": "def _try_cast_result(self, result, dtype=None):\n \"\"\" try to cast the result to our original type, we may have\n roundtripped thru object in the mean-time\n \"\"\"\n if dtype is None:\n dtype = self.dtype\n\n if self.is_integer or self.is_bool or self.is_datetime:\n pass\n elif self.is_float and result.dtype == self.dtype:\n\n # protect against a bool/object showing up here\n if isinstance(dtype, str) and dtype == 'infer':\n return result\n if not isinstance(dtype, type):\n dtype = dtype.type\n if issubclass(dtype, (np.bool_, np.object_)):\n if issubclass(dtype, np.bool_):\n if isna(result).all():\n return result.astype(np.bool_)\n else:\n result = result.astype(np.object_)\n result[result == 1] = True\n result[result == 0] = False\n return result\n else:\n return result.astype(np.object_)\n\n return result\n\n # may need to change the dtype here\n return maybe_downcast_to_dtype(result, dtype)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _try_coerce_args(self, values, other):\n\n if np.any(notna(other)) and not self._can_hold_element(other):\n # coercion issues\n # let higher levels handle\n raise TypeError(\"cannot convert {} to an {}\".format(\n type(other).__name__,\n type(self).__name__.lower().replace('Block', '')))\n\n return values, other", "response": "provide coercion to our input arguments"} {"instruction": "Make a summary of the following Python 3 code\ndef to_native_types(self, slicer=None, na_rep='nan', quoting=None,\n 
**kwargs):\n \"\"\" convert to our native types format, slicing if desired \"\"\"\n\n values = self.get_values()\n\n if slicer is not None:\n values = values[:, slicer]\n mask = isna(values)\n\n if not self.is_object and not quoting:\n values = values.astype(str)\n else:\n values = np.array(values, dtype='object')\n\n values[mask] = na_rep\n return values", "response": "convert to our native types format slicing if desired"} {"instruction": "Write a Python 3 function for\nreplacing the to_replace value with value possible to create new objectblocks here", "response": "def replace(self, to_replace, value, inplace=False, filter=None,\n regex=False, convert=True):\n \"\"\"replace the to_replace value with value, possible to create new\n blocks here this is just a call to putmask. regex is not used here.\n It is used in ObjectBlocks. It is here for API compatibility.\n \"\"\"\n\n inplace = validate_bool_kwarg(inplace, 'inplace')\n original_to_replace = to_replace\n\n # try to replace, if we raise an error, convert to ObjectBlock and\n # retry\n try:\n values, to_replace = self._try_coerce_args(self.values,\n to_replace)\n mask = missing.mask_missing(values, to_replace)\n if filter is not None:\n filtered_out = ~self.mgr_locs.isin(filter)\n mask[filtered_out.nonzero()[0]] = False\n\n blocks = self.putmask(mask, value, inplace=inplace)\n if convert:\n blocks = [b.convert(by_item=True, numeric=False,\n copy=not inplace) for b in blocks]\n return blocks\n except (TypeError, ValueError):\n # GH 22083, TypeError or ValueError occurred within error handling\n # causes infinite loop. 
Cast and retry only if not objectblock.\n if is_object_dtype(self):\n raise\n\n # try again with a compatible block\n block = self.astype(object)\n return block.replace(to_replace=original_to_replace,\n value=value,\n inplace=inplace,\n filter=filter,\n regex=regex,\n convert=convert)"} {"instruction": "Can you create a Python 3 function that\nsets the value in a maybe different typed block.", "response": "def setitem(self, indexer, value):\n \"\"\"Set the value inplace, returning a a maybe different typed block.\n\n Parameters\n ----------\n indexer : tuple, list-like, array-like, slice\n The subset of self.values to set\n value : object\n The value being set\n\n Returns\n -------\n Block\n\n Notes\n -----\n `indexer` is a direct slice/positional indexer. `value` must\n be a compatible shape.\n \"\"\"\n # coerce None values, if appropriate\n if value is None:\n if self.is_numeric:\n value = np.nan\n\n # coerce if block dtype can store value\n values = self.values\n try:\n values, value = self._try_coerce_args(values, value)\n # can keep its own dtype\n if hasattr(value, 'dtype') and is_dtype_equal(values.dtype,\n value.dtype):\n dtype = self.dtype\n else:\n dtype = 'infer'\n\n except (TypeError, ValueError):\n # current dtype cannot store value, coerce to common dtype\n find_dtype = False\n\n if hasattr(value, 'dtype'):\n dtype = value.dtype\n find_dtype = True\n\n elif lib.is_scalar(value):\n if isna(value):\n # NaN promotion is handled in latter path\n dtype = False\n else:\n dtype, _ = infer_dtype_from_scalar(value,\n pandas_dtype=True)\n find_dtype = True\n else:\n dtype = 'infer'\n\n if find_dtype:\n dtype = find_common_type([values.dtype, dtype])\n if not is_dtype_equal(self.dtype, dtype):\n b = self.astype(dtype)\n return b.setitem(indexer, value)\n\n # value must be storeable at this moment\n arr_value = np.array(value)\n\n # cast the values to a type that can hold nan (if necessary)\n if not self._can_hold_element(value):\n dtype, _ = 
maybe_promote(arr_value.dtype)\n values = values.astype(dtype)\n\n transf = (lambda x: x.T) if self.ndim == 2 else (lambda x: x)\n values = transf(values)\n\n # length checking\n check_setitem_lengths(indexer, value, values)\n\n def _is_scalar_indexer(indexer):\n # return True if we are all scalar indexers\n\n if arr_value.ndim == 1:\n if not isinstance(indexer, tuple):\n indexer = tuple([indexer])\n return any(isinstance(idx, np.ndarray) and len(idx) == 0\n for idx in indexer)\n return False\n\n def _is_empty_indexer(indexer):\n # return a boolean if we have an empty indexer\n\n if is_list_like(indexer) and not len(indexer):\n return True\n if arr_value.ndim == 1:\n if not isinstance(indexer, tuple):\n indexer = tuple([indexer])\n return any(isinstance(idx, np.ndarray) and len(idx) == 0\n for idx in indexer)\n return False\n\n # empty indexers\n # 8669 (empty)\n if _is_empty_indexer(indexer):\n pass\n\n # setting a single element for each dim and with a rhs that could\n # be say a list\n # GH 6043\n elif _is_scalar_indexer(indexer):\n values[indexer] = value\n\n # if we are an exact match (ex-broadcasting),\n # then use the resultant dtype\n elif (len(arr_value.shape) and\n arr_value.shape[0] == values.shape[0] and\n np.prod(arr_value.shape) == np.prod(values.shape)):\n values[indexer] = value\n try:\n values = values.astype(arr_value.dtype)\n except ValueError:\n pass\n\n # set\n else:\n values[indexer] = value\n\n # coerce and try to infer the dtypes of the result\n values = self._try_coerce_and_cast_result(values, dtype)\n block = self.make_block(transf(values))\n return block"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef coerce_to_target_dtype(self, other):\n\n # if we cannot then coerce to object\n dtype, _ = infer_dtype_from(other, pandas_dtype=True)\n\n if is_dtype_equal(self.dtype, dtype):\n return self\n\n if self.is_bool or is_object_dtype(dtype) or is_bool_dtype(dtype):\n # we don't upcast to bool\n return 
self.astype(object)\n\n elif ((self.is_float or self.is_complex) and\n (is_integer_dtype(dtype) or is_float_dtype(dtype))):\n # don't coerce float/complex to int\n return self\n\n elif (self.is_datetime or\n is_datetime64_dtype(dtype) or\n is_datetime64tz_dtype(dtype)):\n\n # not a datetime\n if not ((is_datetime64_dtype(dtype) or\n is_datetime64tz_dtype(dtype)) and self.is_datetime):\n return self.astype(object)\n\n # don't upcast timezone with different timezone or no timezone\n mytz = getattr(self.dtype, 'tz', None)\n othertz = getattr(dtype, 'tz', None)\n\n if str(mytz) != str(othertz):\n return self.astype(object)\n\n raise AssertionError(\"possible recursion in \"\n \"coerce_to_target_dtype: {} {}\".format(\n self, other))\n\n elif (self.is_timedelta or is_timedelta64_dtype(dtype)):\n\n # not a timedelta\n if not (is_timedelta64_dtype(dtype) and self.is_timedelta):\n return self.astype(object)\n\n raise AssertionError(\"possible recursion in \"\n \"coerce_to_target_dtype: {} {}\".format(\n self, other))\n\n try:\n return self.astype(dtype)\n except (ValueError, TypeError, OverflowError):\n pass\n\n return self.astype(object)", "response": "coerce the current block to a dtype compat for other\n "} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _interpolate_with_fill(self, method='pad', axis=0, inplace=False,\n limit=None, fill_value=None, coerce=False,\n downcast=None):\n \"\"\" fillna but using the interpolate machinery \"\"\"\n\n inplace = validate_bool_kwarg(inplace, 'inplace')\n\n # if we are coercing, then don't force the conversion\n # if the block can't hold the type\n if coerce:\n if not self._can_hold_na:\n if inplace:\n return [self]\n else:\n return [self.copy()]\n\n values = self.values if inplace else self.values.copy()\n values, fill_value = self._try_coerce_args(values, fill_value)\n values = missing.interpolate_2d(values, method=method, axis=axis,\n limit=limit, fill_value=fill_value,\n 
dtype=self.dtype)\n values = self._try_coerce_result(values)\n\n blocks = [self.make_block_same_class(values, ndim=self.ndim)]\n return self._maybe_downcast(blocks, downcast)", "response": "fillna but using the interpolate machinery"} {"instruction": "Implement a Python 3 function for\ninterpolating the data for the current object", "response": "def _interpolate(self, method=None, index=None, values=None,\n fill_value=None, axis=0, limit=None,\n limit_direction='forward', limit_area=None,\n inplace=False, downcast=None, **kwargs):\n \"\"\" interpolate using scipy wrappers \"\"\"\n\n inplace = validate_bool_kwarg(inplace, 'inplace')\n data = self.values if inplace else self.values.copy()\n\n # only deal with floats\n if not self.is_float:\n if not self.is_integer:\n return self\n data = data.astype(np.float64)\n\n if fill_value is None:\n fill_value = self.fill_value\n\n if method in ('krogh', 'piecewise_polynomial', 'pchip'):\n if not index.is_monotonic:\n raise ValueError(\"{0} interpolation requires that the \"\n \"index be monotonic.\".format(method))\n # process 1-d slices in the axis direction\n\n def func(x):\n\n # process a 1-d slice, returning it\n # should the axis argument be handled below in apply_along_axis?\n # i.e. 
not an arg to missing.interpolate_1d\n return missing.interpolate_1d(index, x, method=method, limit=limit,\n limit_direction=limit_direction,\n limit_area=limit_area,\n fill_value=fill_value,\n bounds_error=False, **kwargs)\n\n # interp each column independently\n interp_values = np.apply_along_axis(func, axis, data)\n\n blocks = [self.make_block_same_class(interp_values)]\n return self._maybe_downcast(blocks, downcast)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None):\n\n # algos.take_nd dispatches for DatetimeTZBlock, CategoricalBlock\n # so need to preserve types\n # sparse is treated like an ndarray, but needs .get_values() shaping\n\n values = self.values\n if self.is_sparse:\n values = self.get_values()\n\n if fill_tuple is None:\n fill_value = self.fill_value\n new_values = algos.take_nd(values, indexer, axis=axis,\n allow_fill=False, fill_value=fill_value)\n else:\n fill_value = fill_tuple[0]\n new_values = algos.take_nd(values, indexer, axis=axis,\n allow_fill=True, fill_value=fill_value)\n\n if new_mgr_locs is None:\n if axis == 0:\n slc = libinternals.indexer_as_slice(indexer)\n if slc is not None:\n new_mgr_locs = self.mgr_locs[slc]\n else:\n new_mgr_locs = self.mgr_locs[indexer]\n else:\n new_mgr_locs = self.mgr_locs\n\n if not is_dtype_equal(new_values.dtype, self.dtype):\n return self.make_block(new_values, new_mgr_locs)\n else:\n return self.make_block_same_class(new_values, new_mgr_locs)", "response": "Take values according to indexer and return them as a block."} {"instruction": "Make a summary of the following Python 3 code\ndef diff(self, n, axis=1):\n new_values = algos.diff(self.values, n, axis=axis)\n return [self.make_block(values=new_values)]", "response": "return the diff of the values"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef shift(self, periods, axis=0, fill_value=None):\n\n # convert 
integer to float if necessary. need to do a lot more than\n # that, handle boolean etc also\n new_values, fill_value = maybe_upcast(self.values, fill_value)\n\n # make sure array sent to np.roll is c_contiguous\n f_ordered = new_values.flags.f_contiguous\n if f_ordered:\n new_values = new_values.T\n axis = new_values.ndim - axis - 1\n\n if np.prod(new_values.shape):\n new_values = np.roll(new_values, ensure_platform_int(periods),\n axis=axis)\n\n axis_indexer = [slice(None)] * self.ndim\n if periods > 0:\n axis_indexer[axis] = slice(None, periods)\n else:\n axis_indexer[axis] = slice(periods, None)\n new_values[tuple(axis_indexer)] = fill_value\n\n # restore original order\n if f_ordered:\n new_values = new_values.T\n\n return [self.make_block(new_values)]", "response": "shift the block by periods possibly upcast"} {"instruction": "Can you implement a function in Python 3 that\nevaluates the block where the condition is met on other and return the result of the function.", "response": "def where(self, other, cond, align=True, errors='raise',\n try_cast=False, axis=0, transpose=False):\n \"\"\"\n evaluate the block; return result block(s) from the result\n\n Parameters\n ----------\n other : a ndarray/object\n cond : the condition to respect\n align : boolean, perform alignment on other/cond\n errors : str, {'raise', 'ignore'}, default 'raise'\n - ``raise`` : allow exceptions to be raised\n - ``ignore`` : suppress exceptions. 
On error return original object\n\n axis : int\n transpose : boolean\n Set to True if self is stored with axes reversed\n\n Returns\n -------\n a new block(s), the result of the func\n \"\"\"\n import pandas.core.computation.expressions as expressions\n assert errors in ['raise', 'ignore']\n\n values = self.values\n orig_other = other\n if transpose:\n values = values.T\n\n other = getattr(other, '_values', getattr(other, 'values', other))\n cond = getattr(cond, 'values', cond)\n\n # If the default broadcasting would go in the wrong direction, then\n # explicitly reshape other instead\n if getattr(other, 'ndim', 0) >= 1:\n if values.ndim - 1 == other.ndim and axis == 1:\n other = other.reshape(tuple(other.shape + (1, )))\n elif transpose and values.ndim == self.ndim - 1:\n cond = cond.T\n\n if not hasattr(cond, 'shape'):\n raise ValueError(\"where must have a condition that is ndarray \"\n \"like\")\n\n # our where function\n def func(cond, values, other):\n if cond.ravel().all():\n return values\n\n values, other = self._try_coerce_args(values, other)\n\n try:\n return self._try_coerce_result(expressions.where(\n cond, values, other))\n except Exception as detail:\n if errors == 'raise':\n raise TypeError(\n 'Could not operate [{other!r}] with block values '\n '[{detail!s}]'.format(other=other, detail=detail))\n else:\n # return the values\n result = np.empty(values.shape, dtype='float64')\n result.fill(np.nan)\n return result\n\n # see if we can operate on the entire block, or need item-by-item\n # or if we are a single block (ndim == 1)\n try:\n result = func(cond, values, other)\n except TypeError:\n\n # we cannot coerce, return a compat dtype\n # we are explicitly ignoring errors\n block = self.coerce_to_target_dtype(other)\n blocks = block.where(orig_other, cond, align=align,\n errors=errors,\n try_cast=try_cast, axis=axis,\n transpose=transpose)\n return self._maybe_downcast(blocks, 'infer')\n\n if self._can_hold_na or self.ndim == 1:\n\n if transpose:\n 
result = result.T\n\n # try to cast if requested\n if try_cast:\n result = self._try_cast_result(result)\n\n return self.make_block(result)\n\n # might need to separate out blocks\n axis = cond.ndim - 1\n cond = cond.swapaxes(axis, 0)\n mask = np.array([cond[i].all() for i in range(cond.shape[0])],\n dtype=bool)\n\n result_blocks = []\n for m in [mask, ~mask]:\n if m.any():\n r = self._try_cast_result(result.take(m.nonzero()[0],\n axis=axis))\n result_blocks.append(\n self.make_block(r.T, placement=self.mgr_locs[m]))\n\n return result_blocks"} {"instruction": "How would you explain what the following Python 3 function does\ndef _unstack(self, unstacker_func, new_columns, n_rows, fill_value):\n unstacker = unstacker_func(self.values.T)\n new_items = unstacker.get_new_columns()\n new_placement = new_columns.get_indexer(new_items)\n new_values, mask = unstacker.get_new_values()\n\n mask = mask.any(0)\n new_values = new_values.T[mask]\n new_placement = new_placement[mask]\n\n blocks = [make_block(new_values, placement=new_placement)]\n return blocks, mask", "response": "Unstacks the entries of the EntryPoint into a list of blocks."} {"instruction": "Create a Python 3 function for\ncomputing the quantiles of the entry in the block", "response": "def quantile(self, qs, interpolation='linear', axis=0):\n \"\"\"\n compute the quantiles of the\n\n Parameters\n ----------\n qs: a scalar or list of the quantiles to be computed\n interpolation: type of interpolation, default 'linear'\n axis: axis to compute, default 0\n\n Returns\n -------\n Block\n \"\"\"\n if self.is_datetimetz:\n # TODO: cleanup this special case.\n # We need to operate on i8 values for datetimetz\n # but `Block.get_values()` returns an ndarray of objects\n # right now. 
We need an API for \"values to do numeric-like ops on\"\n values = self.values.asi8\n\n # TODO: NonConsolidatableMixin shape\n # Usual shape inconsistencies for ExtensionBlocks\n if self.ndim > 1:\n values = values[None, :]\n else:\n values = self.get_values()\n values, _ = self._try_coerce_args(values, values)\n\n is_empty = values.shape[axis] == 0\n orig_scalar = not is_list_like(qs)\n if orig_scalar:\n # make list-like, unpack later\n qs = [qs]\n\n if is_empty:\n if self.ndim == 1:\n result = self._na_value\n else:\n # create the array of na_values\n # 2d len(values) * len(qs)\n result = np.repeat(np.array([self.fill_value] * len(qs)),\n len(values)).reshape(len(values),\n len(qs))\n else:\n # asarray needed for Sparse, see GH#24600\n # TODO: Why self.values and not values?\n mask = np.asarray(isna(self.values))\n result = nanpercentile(values, np.array(qs) * 100,\n axis=axis, na_value=self.fill_value,\n mask=mask, ndim=self.ndim,\n interpolation=interpolation)\n\n result = np.array(result, copy=False)\n if self.ndim > 1:\n result = result.T\n\n if orig_scalar and not lib.is_scalar(result):\n # result could be scalar in case with is_empty and self.ndim == 1\n assert result.shape[-1] == 1, result.shape\n result = result[..., 0]\n result = lib.item_from_zerodim(result)\n\n ndim = getattr(result, 'ndim', None) or 0\n result = self._try_coerce_result(result)\n return make_block(result,\n placement=np.arange(len(result)),\n ndim=ndim)"} {"instruction": "Can you create a Python 3 function that\nreplaces value corresponding to the given boolean array with another value. Parameters ---------- to_replace : object or pattern Scalar to replace or regular expression to match. value : object Replacement object. inplace : bool, default False Perform inplace modification. regex : bool, default False If true, perform regular expression substitution. convert : bool, default True If true, try to coerce any object types to better types. 
mask : array-like of bool, optional True indicate corresponding element is ignored. Returns ------- A new block if there is anything to replace or the original block.", "response": "def _replace_coerce(self, to_replace, value, inplace=True, regex=False,\n convert=False, mask=None):\n \"\"\"\n Replace value corresponding to the given boolean array with another\n value.\n\n Parameters\n ----------\n to_replace : object or pattern\n Scalar to replace or regular expression to match.\n value : object\n Replacement object.\n inplace : bool, default False\n Perform inplace modification.\n regex : bool, default False\n If true, perform regular expression substitution.\n convert : bool, default True\n If true, try to coerce any object types to better types.\n mask : array-like of bool, optional\n True indicate corresponding element is ignored.\n\n Returns\n -------\n A new block if there is anything to replace or the original block.\n \"\"\"\n\n if mask.any():\n if not regex:\n self = self.coerce_to_target_dtype(value)\n return self.putmask(mask, value, inplace=inplace)\n else:\n return self._replace_single(to_replace, value, inplace=inplace,\n regex=regex,\n convert=convert,\n mask=mask)\n return self"} {"instruction": "How would you explain what the following Python 3 function does\ndef putmask(self, mask, new, align=True, inplace=False, axis=0,\n transpose=False):\n \"\"\"\n putmask the data to the block; we must be a single block and not\n generate other blocks\n\n return the resulting block\n\n Parameters\n ----------\n mask : the condition to respect\n new : a ndarray/object\n align : boolean, perform alignment on other/cond, default is True\n inplace : perform inplace modification, default is False\n\n Returns\n -------\n a new block, the result of the putmask\n \"\"\"\n inplace = validate_bool_kwarg(inplace, 'inplace')\n\n # use block's copy logic.\n # .values may be an Index which does shallow copy by default\n new_values = self.values if inplace else 
self.copy().values\n new_values, new = self._try_coerce_args(new_values, new)\n\n if isinstance(new, np.ndarray) and len(new) == len(mask):\n new = new[mask]\n\n mask = _safe_reshape(mask, new_values.shape)\n\n new_values[mask] = new\n new_values = self._try_coerce_result(new_values)\n return [self.make_block(values=new_values)]", "response": "Putmask the data to the block."} {"instruction": "Create a Python 3 function to\nget the placement values and mask for a Block unstack.", "response": "def _get_unstack_items(self, unstacker, new_columns):\n \"\"\"\n Get the placement, values, and mask for a Block unstack.\n\n This is shared between ObjectBlock and ExtensionBlock. They\n differ in that ObjectBlock passes the values, while ExtensionBlock\n passes the dummy ndarray of positions to be used by a take\n later.\n\n Parameters\n ----------\n unstacker : pandas.core.reshape.reshape._Unstacker\n new_columns : Index\n All columns of the unstacked BlockManager.\n\n Returns\n -------\n new_placement : ndarray[int]\n The placement of the new columns in `new_columns`.\n new_values : Union[ndarray, ExtensionArray]\n The first return value from _Unstacker.get_new_values.\n mask : ndarray[bool]\n The second return value from _Unstacker.get_new_values.\n \"\"\"\n # shared with ExtensionBlock\n new_items = unstacker.get_new_columns()\n new_placement = new_columns.get_indexer(new_items)\n new_values, mask = unstacker.get_new_values()\n\n mask = mask.any(0)\n return new_placement, new_values, mask"} {"instruction": "Write a Python 3 script to\ncoerce the values to an ExtensionArray.", "response": "def _maybe_coerce_values(self, values):\n \"\"\"Unbox to an extension array.\n\n This will unbox an ExtensionArray stored in an Index or Series.\n ExtensionArrays pass through. 
No dtype coercion is done.\n\n Parameters\n ----------\n values : Index, Series, ExtensionArray\n\n Returns\n -------\n ExtensionArray\n \"\"\"\n if isinstance(values, (ABCIndexClass, ABCSeries)):\n values = values._values\n return values"} {"instruction": "Make a summary of the following Python 3 code\ndef setitem(self, indexer, value):\n if isinstance(indexer, tuple):\n # we are always 1-D\n indexer = indexer[0]\n\n check_setitem_lengths(indexer, value, self.values)\n self.values[indexer] = value\n return self", "response": "Set the value inplace returning a same - typed block."} {"instruction": "Implement a function in Python 3 to\ntake values according to indexer and return them as a block.", "response": "def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None):\n \"\"\"\n Take values according to indexer and return them as a block.\n \"\"\"\n if fill_tuple is None:\n fill_value = None\n else:\n fill_value = fill_tuple[0]\n\n # axis doesn't matter; we are really a single-dim object\n # but are passed the axis depending on the calling routing\n # if its REALLY axis 0, then this will be a reindex and not a take\n new_values = self.values.take(indexer, fill_value=fill_value,\n allow_fill=True)\n\n if self.ndim == 1 and new_mgr_locs is None:\n new_mgr_locs = [0]\n else:\n if new_mgr_locs is None:\n new_mgr_locs = self.mgr_locs\n\n return self.make_block_same_class(new_values, new_mgr_locs)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _slice(self, slicer):\n\n # slice the category\n # return same dims as we currently have\n\n if isinstance(slicer, tuple) and len(slicer) == 2:\n if not com.is_null_slice(slicer[0]):\n raise AssertionError(\"invalid slicing for a 1-ndim \"\n \"categorical\")\n slicer = slicer[1]\n\n return self.values[slicer]", "response": "return a slice of my values"} {"instruction": "How would you implement a function in Python 3 that\nconcatenates list of single blocks of the same type.", 
"response": "def concat_same_type(self, to_concat, placement=None):\n \"\"\"\n Concatenate list of single blocks of the same type.\n \"\"\"\n values = self._holder._concat_same_type(\n [blk.values for blk in to_concat])\n placement = placement or slice(0, len(values), 1)\n return self.make_block_same_class(values, ndim=self.ndim,\n placement=placement)"} {"instruction": "Create a Python 3 function for\nshifting the block by periods.", "response": "def shift(self,\n periods: int,\n axis: libinternals.BlockPlacement = 0,\n fill_value: Any = None) -> List['ExtensionBlock']:\n \"\"\"\n Shift the block by `periods`.\n\n Dispatches to underlying ExtensionArray and re-boxes in an\n ExtensionBlock.\n \"\"\"\n return [\n self.make_block_same_class(\n self.values.shift(periods=periods, fill_value=fill_value),\n placement=self.mgr_locs, ndim=self.ndim)\n ]"} {"instruction": "Create a Python 3 function for\nconverting the current object to our native types format slicing if desired", "response": "def to_native_types(self, slicer=None, na_rep='', float_format=None,\n decimal='.', quoting=None, **kwargs):\n \"\"\" convert to our native types format, slicing if desired \"\"\"\n\n values = self.values\n if slicer is not None:\n values = values[:, slicer]\n\n # see gh-13418: no special formatting is desired at the\n # output (important for appropriate 'quoting' behaviour),\n # so do not pass it through the FloatArrayFormatter\n if float_format is None and decimal == '.':\n mask = isna(values)\n\n if not quoting:\n values = values.astype(str)\n else:\n values = np.array(values, dtype='object')\n\n values[mask] = na_rep\n return values\n\n from pandas.io.formats.format import FloatArrayFormatter\n formatter = FloatArrayFormatter(values, na_rep=na_rep,\n float_format=float_format,\n decimal=decimal, quoting=quoting,\n fixed_width=False)\n return formatter.get_result_as_array()"} {"instruction": "Write a Python 3 function that can\nreturn the values as boxed values such as Timestamps 
and Timedelta objects", "response": "def get_values(self, dtype=None):\n \"\"\"\n return object dtype as boxed values, such as Timestamps/Timedelta\n \"\"\"\n if is_object_dtype(dtype):\n values = self.values.ravel()\n result = self._holder(values).astype(object)\n return result.reshape(self.values.shape)\n return self.values"} {"instruction": "Write a Python 3 function that can\ninput validation for values passed to __init__. Ensure that we have datetime64ns coercing if necessary.", "response": "def _maybe_coerce_values(self, values):\n \"\"\"Input validation for values passed to __init__. Ensure that\n we have datetime64ns, coercing if necessary.\n\n Parameters\n ----------\n values : array-like\n Must be convertible to datetime64\n\n Returns\n -------\n values : ndarray[datetime64ns]\n\n Overridden by DatetimeTZBlock.\n \"\"\"\n if values.dtype != _NS_DTYPE:\n values = conversion.ensure_datetime64ns(values)\n\n if isinstance(values, DatetimeArray):\n values = values._data\n\n assert isinstance(values, np.ndarray), type(values)\n return values"} {"instruction": "Write a Python 3 script for\nconverting to a new object with the specified dtype.", "response": "def _astype(self, dtype, **kwargs):\n \"\"\"\n these automatically copy, so copy=True has no effect\n raise on an except if raise == True\n \"\"\"\n dtype = pandas_dtype(dtype)\n\n # if we are passed a datetime64[ns, tz]\n if is_datetime64tz_dtype(dtype):\n values = self.values\n if getattr(values, 'tz', None) is None:\n values = DatetimeIndex(values).tz_localize('UTC')\n values = values.tz_convert(dtype.tz)\n return self.make_block(values)\n\n # delegate\n return super()._astype(dtype=dtype, **kwargs)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _try_coerce_args(self, values, other):\n\n values = values.view('i8')\n\n if isinstance(other, bool):\n raise TypeError\n elif is_null_datetimelike(other):\n other = tslibs.iNaT\n elif isinstance(other, (datetime, 
np.datetime64, date)):\n other = self._box_func(other)\n if getattr(other, 'tz') is not None:\n raise TypeError(\"cannot coerce a Timestamp with a tz on a \"\n \"naive Block\")\n other = other.asm8.view('i8')\n elif hasattr(other, 'dtype') and is_datetime64_dtype(other):\n other = other.astype('i8', copy=False).view('i8')\n else:\n # coercion issues\n # let higher levels handle\n raise TypeError(other)\n\n return values, other", "response": "Coerce values and other to dtype i8. NaN and NaT convert to smallest i8. NaN and NaT convert to NaT"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _try_coerce_result(self, result):\n if isinstance(result, np.ndarray):\n if result.dtype.kind in ['i', 'f']:\n result = result.astype('M8[ns]')\n\n elif isinstance(result, (np.integer, np.float, np.datetime64)):\n result = self._box_func(result)\n return result", "response": "try coercion of result to a list of the correct type"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef to_native_types(self, slicer=None, na_rep=None, date_format=None,\n quoting=None, **kwargs):\n \"\"\" convert to our native types format, slicing if desired \"\"\"\n\n values = self.values\n i8values = self.values.view('i8')\n\n if slicer is not None:\n values = values[..., slicer]\n i8values = i8values[..., slicer]\n\n from pandas.io.formats.format import _get_format_datetime64_from_values\n fmt = _get_format_datetime64_from_values(values, date_format)\n\n result = tslib.format_array_from_datetime(\n i8values.ravel(), tz=getattr(self.values, 'tz', None),\n format=fmt, na_rep=na_rep).reshape(i8values.shape)\n return np.atleast_2d(result)", "response": "convert to our native types format date_format if desired"} {"instruction": "Implement a Python 3 function for\nmodifying the value of a key in - place with new item values.", "response": "def set(self, locs, values):\n \"\"\"\n Modify Block in-place with new item value\n\n Returns\n 
-------\n None\n \"\"\"\n values = conversion.ensure_datetime64ns(values, copy=False)\n\n self.values[locs] = values"} {"instruction": "Create a Python 3 function to\ninput validation for values passed to __init__. Ensure that we have datetime64TZ coercing if necessary.", "response": "def _maybe_coerce_values(self, values):\n \"\"\"Input validation for values passed to __init__. Ensure that\n we have datetime64TZ, coercing if necessary.\n\n Parametetrs\n -----------\n values : array-like\n Must be convertible to datetime64\n\n Returns\n -------\n values : DatetimeArray\n \"\"\"\n if not isinstance(values, self._holder):\n values = self._holder(values)\n\n if values.tz is None:\n raise ValueError(\"cannot create a DatetimeTZBlock without a tz\")\n\n return values"} {"instruction": "How would you code a function in Python 3 to\nreturn an ndarray of values.", "response": "def get_values(self, dtype=None):\n \"\"\"\n Returns an ndarray of values.\n\n Parameters\n ----------\n dtype : np.dtype\n Only `object`-like dtypes are respected here (not sure\n why).\n\n Returns\n -------\n values : ndarray\n When ``dtype=object``, then and object-dtype ndarray of\n boxed values is returned. Otherwise, an M8[ns] ndarray\n is returned.\n\n DatetimeArray is always 1-d. 
``get_values`` will reshape\n the return value to be the same dimensionality as the\n block.\n \"\"\"\n values = self.values\n if is_object_dtype(dtype):\n values = values._box_values(values._data)\n\n values = np.asarray(values)\n\n if self.ndim == 2:\n # Ensure that our shape is correct for DataFrame.\n # ExtensionArrays are always 1-D, even in a DataFrame when\n # the analogous NumPy-backed column would be a 2-D ndarray.\n values = values.reshape(1, -1)\n return values"} {"instruction": "Create a Python 3 function to\nreturn a slice of my values", "response": "def _slice(self, slicer):\n \"\"\" return a slice of my values \"\"\"\n if isinstance(slicer, tuple):\n col, loc = slicer\n if not com.is_null_slice(col) and col != 0:\n raise IndexError(\"{0} only contains one item\".format(self))\n return self.values[loc]\n return self.values[slicer]"} {"instruction": "Write a Python 3 function for\ncoercing the values and other into the base - types of the values.", "response": "def _try_coerce_args(self, values, other):\n \"\"\"\n localize and return i8 for the values\n\n Parameters\n ----------\n values : ndarray-like\n other : ndarray-like or scalar\n\n Returns\n -------\n base-type values, base-type other\n \"\"\"\n # asi8 is a view, needs copy\n values = _block_shape(values.view(\"i8\"), ndim=self.ndim)\n\n if isinstance(other, ABCSeries):\n other = self._holder(other)\n\n if isinstance(other, bool):\n raise TypeError\n elif is_datetime64_dtype(other):\n # add the tz back\n other = self._holder(other, dtype=self.dtype)\n\n elif is_null_datetimelike(other):\n other = tslibs.iNaT\n elif isinstance(other, self._holder):\n if other.tz != self.values.tz:\n raise ValueError(\"incompatible or non tz-aware value\")\n other = _block_shape(other.asi8, ndim=self.ndim)\n elif isinstance(other, (np.datetime64, datetime, date)):\n other = tslibs.Timestamp(other)\n tz = getattr(other, 'tz', None)\n\n # test we can have an equal time zone\n if tz is None or str(tz) != 
str(self.values.tz):\n raise ValueError(\"incompatible or non tz-aware value\")\n other = other.value\n else:\n raise TypeError(other)\n\n return values, other"} {"instruction": "Make a summary of the following Python 3 code\ndef _try_coerce_result(self, result):\n if isinstance(result, np.ndarray):\n if result.dtype.kind in ['i', 'f']:\n result = result.astype('M8[ns]')\n\n elif isinstance(result, (np.integer, np.float, np.datetime64)):\n result = self._box_func(result)\n\n if isinstance(result, np.ndarray):\n # allow passing of > 1dim if its trivial\n\n if result.ndim > 1:\n result = result.reshape(np.prod(result.shape))\n # GH#24096 new values invalidates a frequency\n result = self._holder._simple_new(result, freq=None,\n dtype=self.values.dtype)\n\n return result", "response": "try coercion of result to a new object"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef diff(self, n, axis=0):\n if axis == 0:\n # Cannot currently calculate diff across multiple blocks since this\n # function is invoked via apply\n raise NotImplementedError\n new_values = (self.values - self.shift(n, axis=axis)[0].values).asi8\n\n # Reshape the new_values like how algos.diff does for timedelta data\n new_values = new_values.reshape(1, len(new_values))\n new_values = new_values.astype('timedelta64[ns]')\n return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)]", "response": "1st discrete difference between two time - series items."} {"instruction": "Explain what the following Python 3 code does\ndef _try_coerce_args(self, values, other):\n values = values.view('i8')\n\n if isinstance(other, bool):\n raise TypeError\n elif is_null_datetimelike(other):\n other = tslibs.iNaT\n elif isinstance(other, (timedelta, np.timedelta64)):\n other = Timedelta(other).value\n elif hasattr(other, 'dtype') and is_timedelta64_dtype(other):\n other = other.astype('i8', copy=False).view('i8')\n else:\n # coercion issues\n # let higher levels handle\n raise 
TypeError(other)\n\n return values, other", "response": "Coerce values and other to int64 with null values converted to\n iNaT."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _try_coerce_result(self, result):\n if isinstance(result, np.ndarray):\n mask = isna(result)\n if result.dtype.kind in ['i', 'f']:\n result = result.astype('m8[ns]')\n result[mask] = tslibs.iNaT\n\n elif isinstance(result, (np.integer, np.float)):\n result = self._box_func(result)\n\n return result", "response": "coerce the result to a list of the correct type"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef to_native_types(self, slicer=None, na_rep=None, quoting=None,\n **kwargs):\n \"\"\" convert to our native types format, slicing if desired \"\"\"\n\n values = self.values\n if slicer is not None:\n values = values[:, slicer]\n mask = isna(values)\n\n rvalues = np.empty(values.shape, dtype=object)\n if na_rep is None:\n na_rep = 'NaT'\n rvalues[mask] = na_rep\n imask = (~mask).ravel()\n\n # FIXME:\n # should use the formats.format.Timedelta64Formatter here\n # to figure what format to pass to the Timedelta\n # e.g. 
to not show the decimals say\n rvalues.flat[imask] = np.array([Timedelta(val)._repr_base(format='all')\n for val in values.ravel()[imask]],\n dtype=object)\n return rvalues", "response": "convert to our native types format slicing if desired"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef convert(self, *args, **kwargs):\n\n if args:\n raise NotImplementedError\n by_item = kwargs.get('by_item', True)\n\n new_inputs = ['coerce', 'datetime', 'numeric', 'timedelta']\n new_style = False\n for kw in new_inputs:\n new_style |= kw in kwargs\n\n if new_style:\n fn = soft_convert_objects\n fn_inputs = new_inputs\n else:\n fn = maybe_convert_objects\n fn_inputs = ['convert_dates', 'convert_numeric',\n 'convert_timedeltas']\n fn_inputs += ['copy']\n\n fn_kwargs = {key: kwargs[key] for key in fn_inputs if key in kwargs}\n\n # operate column-by-column\n def f(m, v, i):\n shape = v.shape\n values = fn(v.ravel(), **fn_kwargs)\n try:\n values = values.reshape(shape)\n values = _block_shape(values, ndim=self.ndim)\n except (AttributeError, NotImplementedError):\n pass\n\n return values\n\n if by_item and not self._is_single_block:\n blocks = self.split_and_operate(None, f, False)\n else:\n values = f(None, self.values.ravel(), None)\n blocks = [make_block(values, ndim=self.ndim,\n placement=self.mgr_locs)]\n\n return blocks", "response": "converts any object types to better types return a copy of the block"} {"instruction": "Make a summary of the following Python 3 code\ndef set(self, locs, values):\n try:\n self.values[locs] = values\n except (ValueError):\n\n # broadcasting error\n # see GH6171\n new_shape = list(values.shape)\n new_shape[0] = len(self.items)\n self.values = np.empty(tuple(new_shape), dtype=self.dtype)\n self.values.fill(np.nan)\n self.values[locs] = values", "response": "Modify Block in - place with new item value"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _try_coerce_args(self, values, 
other):\n\n if isinstance(other, ABCDatetimeIndex):\n # May get a DatetimeIndex here. Unbox it.\n other = other.array\n\n if isinstance(other, DatetimeArray):\n # hit in pandas/tests/indexing/test_coercion.py\n # ::TestWhereCoercion::test_where_series_datetime64[datetime64tz]\n # when falling back to ObjectBlock.where\n other = other.astype(object)\n\n return values, other", "response": "try to coerce the values and other to the correct type"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _replace_single(self, to_replace, value, inplace=False, filter=None,\n regex=False, convert=True, mask=None):\n \"\"\"\n Replace elements by the given value.\n\n Parameters\n ----------\n to_replace : object or pattern\n Scalar to replace or regular expression to match.\n value : object\n Replacement object.\n inplace : bool, default False\n Perform inplace modification.\n filter : list, optional\n regex : bool, default False\n If true, perform regular expression substitution.\n convert : bool, default True\n If true, try to coerce any object types to better types.\n mask : array-like of bool, optional\n True indicate corresponding element is ignored.\n\n Returns\n -------\n a new block, the result after replacing\n \"\"\"\n inplace = validate_bool_kwarg(inplace, 'inplace')\n\n # to_replace is regex compilable\n to_rep_re = regex and is_re_compilable(to_replace)\n\n # regex is regex compilable\n regex_re = is_re_compilable(regex)\n\n # only one will survive\n if to_rep_re and regex_re:\n raise AssertionError('only one of to_replace and regex can be '\n 'regex compilable')\n\n # if regex was passed as something that can be a regex (rather than a\n # boolean)\n if regex_re:\n to_replace = regex\n\n regex = regex_re or to_rep_re\n\n # try to get the pattern attribute (compiled re) or it's a string\n try:\n pattern = to_replace.pattern\n except AttributeError:\n pattern = to_replace\n\n # if the pattern is not empty and to_replace is either a 
string or a\n # regex\n if regex and pattern:\n rx = re.compile(to_replace)\n else:\n # if the thing to replace is not a string or compiled regex call\n # the superclass method -> to_replace is some kind of object\n return super().replace(to_replace, value, inplace=inplace,\n filter=filter, regex=regex)\n\n new_values = self.values if inplace else self.values.copy()\n\n # deal with replacing values with objects (strings) that match but\n # whose replacement is not a string (numeric, nan, object)\n if isna(value) or not isinstance(value, str):\n\n def re_replacer(s):\n try:\n return value if rx.search(s) is not None else s\n except TypeError:\n return s\n else:\n # value is guaranteed to be a string here, s can be either a string\n # or null if it's null it gets returned\n def re_replacer(s):\n try:\n return rx.sub(value, s)\n except TypeError:\n return s\n\n f = np.vectorize(re_replacer, otypes=[self.dtype])\n\n if filter is None:\n filt = slice(None)\n else:\n filt = self.mgr_locs.isin(filter).nonzero()[0]\n\n if mask is None:\n new_values[filt] = f(new_values[filt])\n else:\n new_values[filt][mask] = f(new_values[filt][mask])\n\n # convert\n block = self.make_block(new_values)\n if convert:\n block = block.convert(by_item=True, numeric=False)\n return block", "response": "Replaces the value of the object with the given value."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _replace_coerce(self, to_replace, value, inplace=True, regex=False,\n convert=False, mask=None):\n \"\"\"\n Replace value corresponding to the given boolean array with another\n value.\n\n Parameters\n ----------\n to_replace : object or pattern\n Scalar to replace or regular expression to match.\n value : object\n Replacement object.\n inplace : bool, default False\n Perform inplace modification.\n regex : bool, default False\n If true, perform regular expression substitution.\n convert : bool, default True\n If true, try to coerce any object types 
to better types.\n mask : array-like of bool, optional\n True indicate corresponding element is ignored.\n\n Returns\n -------\n A new block if there is anything to replace or the original block.\n \"\"\"\n if mask.any():\n block = super()._replace_coerce(\n to_replace=to_replace, value=value, inplace=inplace,\n regex=regex, convert=convert, mask=mask)\n if convert:\n block = [b.convert(by_item=True, numeric=False, copy=True)\n for b in block]\n return block\n return self", "response": "Replace value corresponding to the given boolean array with another boolean array."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _try_coerce_result(self, result):\n\n # GH12564: CategoricalBlock is 1-dim only\n # while returned results could be any dim\n if ((not is_categorical_dtype(result)) and\n isinstance(result, np.ndarray)):\n result = _block_shape(result, ndim=self.ndim)\n\n return result", "response": "try to coerce result to a CategoricalBlock"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):\n\n values = self.values\n if slicer is not None:\n # Categorical is always one dimension\n values = values[slicer]\n mask = isna(values)\n values = np.array(values, dtype='object')\n values[mask] = na_rep\n\n # we are expected to return a 2-d ndarray\n return values.reshape(1, len(values))", "response": "convert to our native types format slicing is desired"} {"instruction": "Explain what the following Python 3 code does\ndef _style_to_xlwt(cls, item, firstlevel=True, field_sep=',',\n line_sep=';'):\n \"\"\"helper which recursively generate an xlwt easy style string\n for example:\n\n hstyle = {\"font\": {\"bold\": True},\n \"border\": {\"top\": \"thin\",\n \"right\": \"thin\",\n \"bottom\": \"thin\",\n \"left\": \"thin\"},\n \"align\": {\"horiz\": \"center\"}}\n will be converted to\n font: bold on; \\\n border: top thin, 
right thin, bottom thin, left thin; \\\n align: horiz center;\n \"\"\"\n if hasattr(item, 'items'):\n if firstlevel:\n it = [\"{key}: {val}\"\n .format(key=key, val=cls._style_to_xlwt(value, False))\n for key, value in item.items()]\n out = \"{sep} \".format(sep=(line_sep).join(it))\n return out\n else:\n it = [\"{key} {val}\"\n .format(key=key, val=cls._style_to_xlwt(value, False))\n for key, value in item.items()]\n out = \"{sep} \".format(sep=(field_sep).join(it))\n return out\n else:\n item = \"{item}\".format(item=item)\n item = item.replace(\"True\", \"on\")\n item = item.replace(\"False\", \"off\")\n return item", "response": "helper which recursively generates an xlwt easy style string for example"} {"instruction": "Can you write a function in Python 3 where it\nconverts a style dictionary to an xlwt style object.", "response": "def _convert_to_style(cls, style_dict, num_format_str=None):\n \"\"\"\n converts a style_dict to an xlwt style object\n Parameters\n ----------\n style_dict : style dictionary to convert\n num_format_str : optional number format string\n \"\"\"\n import xlwt\n\n if style_dict:\n xlwt_stylestr = cls._style_to_xlwt(style_dict)\n style = xlwt.easyxf(xlwt_stylestr, field_sep=',', line_sep=';')\n else:\n style = xlwt.XFStyle()\n if num_format_str is not None:\n style.num_format_str = num_format_str\n\n return style"} {"instruction": "Write a Python 3 function for\nconverting a style dictionary to an xlsxwriter format dict.", "response": "def convert(cls, style_dict, num_format_str=None):\n \"\"\"\n converts a style_dict to an xlsxwriter format dict\n\n Parameters\n ----------\n style_dict : style dictionary to convert\n num_format_str : optional number format string\n \"\"\"\n\n # Create a XlsxWriter format object.\n props = {}\n\n if num_format_str is not None:\n props['num_format'] = num_format_str\n\n if style_dict is None:\n return props\n\n if 'borders' in style_dict:\n style_dict = style_dict.copy()\n style_dict['border'] = 
style_dict.pop('borders')\n\n for style_group_key, style_group in style_dict.items():\n for src, dst in cls.STYLE_MAPPING.get(style_group_key, []):\n # src is a sequence of keys into a nested dict\n # dst is a flat key\n if dst in props:\n continue\n v = style_group\n for k in src:\n try:\n v = v[k]\n except (KeyError, TypeError):\n break\n else:\n props[dst] = v\n\n if isinstance(props.get('pattern'), str):\n # TODO: support other fill patterns\n props['pattern'] = 0 if props['pattern'] == 'none' else 1\n\n for k in ['border', 'top', 'right', 'bottom', 'left']:\n if isinstance(props.get(k), str):\n try:\n props[k] = ['none', 'thin', 'medium', 'dashed', 'dotted',\n 'thick', 'double', 'hair', 'mediumDashed',\n 'dashDot', 'mediumDashDot', 'dashDotDot',\n 'mediumDashDotDot',\n 'slantDashDot'].index(props[k])\n except ValueError:\n props[k] = 2\n\n if isinstance(props.get('font_script'), str):\n props['font_script'] = ['baseline', 'superscript',\n 'subscript'].index(props['font_script'])\n\n if isinstance(props.get('underline'), str):\n props['underline'] = {'none': 0, 'single': 1, 'double': 2,\n 'singleAccounting': 33,\n 'doubleAccounting': 34}[props['underline']]\n\n return props"} {"instruction": "Write a Python 3 script to\nconvert DataFrame to Series with multi - level Index. Columns become the second level of the resulting hierarchical index Returns a Series with multi - level Index.", "response": "def stack(frame, level=-1, dropna=True):\n \"\"\"\n Convert DataFrame to Series with multi-level Index. 
Columns become the\n second level of the resulting hierarchical index\n\n Returns\n -------\n stacked : Series\n \"\"\"\n def factorize(index):\n if index.is_unique:\n return index, np.arange(len(index))\n codes, categories = _factorize_from_iterable(index)\n return categories, codes\n\n N, K = frame.shape\n\n # Will also convert negative level numbers and check if out of bounds.\n level_num = frame.columns._get_level_number(level)\n\n if isinstance(frame.columns, MultiIndex):\n return _stack_multi_columns(frame, level_num=level_num, dropna=dropna)\n elif isinstance(frame.index, MultiIndex):\n new_levels = list(frame.index.levels)\n new_codes = [lab.repeat(K) for lab in frame.index.codes]\n\n clev, clab = factorize(frame.columns)\n new_levels.append(clev)\n new_codes.append(np.tile(clab, N).ravel())\n\n new_names = list(frame.index.names)\n new_names.append(frame.columns.name)\n new_index = MultiIndex(levels=new_levels, codes=new_codes,\n names=new_names, verify_integrity=False)\n else:\n levels, (ilab, clab) = zip(*map(factorize, (frame.index,\n frame.columns)))\n codes = ilab.repeat(K), np.tile(clab, N).ravel()\n new_index = MultiIndex(levels=levels, codes=codes,\n names=[frame.index.name, frame.columns.name],\n verify_integrity=False)\n\n if frame._is_homogeneous_type:\n # For homogeneous EAs, frame.values will coerce to object. 
So\n # we concatenate instead.\n dtypes = list(frame.dtypes.values)\n dtype = dtypes[0]\n\n if is_extension_array_dtype(dtype):\n arr = dtype.construct_array_type()\n new_values = arr._concat_same_type([\n col._values for _, col in frame.iteritems()\n ])\n new_values = _reorder_for_extension_array_stack(new_values, N, K)\n else:\n # homogeneous, non-EA\n new_values = frame.values.ravel()\n\n else:\n # non-homogeneous\n new_values = frame.values.ravel()\n\n if dropna:\n mask = notna(new_values)\n new_values = new_values[mask]\n new_index = new_index[mask]\n\n return frame._constructor_sliced(new_values, index=new_index)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,\n columns=None, sparse=False, drop_first=False, dtype=None):\n \"\"\"\n Convert categorical variable into dummy/indicator variables.\n\n Parameters\n ----------\n data : array-like, Series, or DataFrame\n Data of which to get dummy indicators.\n prefix : str, list of str, or dict of str, default None\n String to append DataFrame column names.\n Pass a list with length equal to the number of columns\n when calling get_dummies on a DataFrame. Alternatively, `prefix`\n can be a dictionary mapping column names to prefixes.\n prefix_sep : str, default '_'\n If appending prefix, separator/delimiter to use. Or pass a\n list or dictionary as with `prefix`.\n dummy_na : bool, default False\n Add a column to indicate NaNs, if False NaNs are ignored.\n columns : list-like, default None\n Column names in the DataFrame to be encoded.\n If `columns` is None then all the columns with\n `object` or `category` dtype will be converted.\n sparse : bool, default False\n Whether the dummy-encoded columns should be backed by\n a :class:`SparseArray` (True) or a regular NumPy array (False).\n drop_first : bool, default False\n Whether to get k-1 dummies out of k categorical levels by removing the\n first level.\n\n .. 
versionadded:: 0.18.0\n\n dtype : dtype, default np.uint8\n Data type for new columns. Only a single dtype is allowed.\n\n .. versionadded:: 0.23.0\n\n Returns\n -------\n DataFrame\n Dummy-coded data.\n\n See Also\n --------\n Series.str.get_dummies : Convert Series to dummy codes.\n\n Examples\n --------\n >>> s = pd.Series(list('abca'))\n\n >>> pd.get_dummies(s)\n a b c\n 0 1 0 0\n 1 0 1 0\n 2 0 0 1\n 3 1 0 0\n\n >>> s1 = ['a', 'b', np.nan]\n\n >>> pd.get_dummies(s1)\n a b\n 0 1 0\n 1 0 1\n 2 0 0\n\n >>> pd.get_dummies(s1, dummy_na=True)\n a b NaN\n 0 1 0 0\n 1 0 1 0\n 2 0 0 1\n\n >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],\n ... 'C': [1, 2, 3]})\n\n >>> pd.get_dummies(df, prefix=['col1', 'col2'])\n C col1_a col1_b col2_a col2_b col2_c\n 0 1 1 0 0 1 0\n 1 2 0 1 1 0 0\n 2 3 1 0 0 0 1\n\n >>> pd.get_dummies(pd.Series(list('abcaa')))\n a b c\n 0 1 0 0\n 1 0 1 0\n 2 0 0 1\n 3 1 0 0\n 4 1 0 0\n\n >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)\n b c\n 0 0 0\n 1 1 0\n 2 0 1\n 3 0 0\n 4 0 0\n\n >>> pd.get_dummies(pd.Series(list('abc')), dtype=float)\n a b c\n 0 1.0 0.0 0.0\n 1 0.0 1.0 0.0\n 2 0.0 0.0 1.0\n \"\"\"\n from pandas.core.reshape.concat import concat\n from itertools import cycle\n\n dtypes_to_encode = ['object', 'category']\n\n if isinstance(data, DataFrame):\n # determine columns being encoded\n if columns is None:\n data_to_encode = data.select_dtypes(\n include=dtypes_to_encode)\n else:\n data_to_encode = data[columns]\n\n # validate prefixes and separator to avoid silently dropping cols\n def check_len(item, name):\n len_msg = (\"Length of '{name}' ({len_item}) did not match the \"\n \"length of the columns being encoded ({len_enc}).\")\n\n if is_list_like(item):\n if not len(item) == data_to_encode.shape[1]:\n len_msg = len_msg.format(name=name, len_item=len(item),\n len_enc=data_to_encode.shape[1])\n raise ValueError(len_msg)\n\n check_len(prefix, 'prefix')\n check_len(prefix_sep, 'prefix_sep')\n\n if 
isinstance(prefix, str):\n prefix = cycle([prefix])\n if isinstance(prefix, dict):\n prefix = [prefix[col] for col in data_to_encode.columns]\n\n if prefix is None:\n prefix = data_to_encode.columns\n\n # validate separators\n if isinstance(prefix_sep, str):\n prefix_sep = cycle([prefix_sep])\n elif isinstance(prefix_sep, dict):\n prefix_sep = [prefix_sep[col] for col in data_to_encode.columns]\n\n if data_to_encode.shape == data.shape:\n # Encoding the entire df, do not prepend any dropped columns\n with_dummies = []\n elif columns is not None:\n # Encoding only cols specified in columns. Get all cols not in\n # columns to prepend to result.\n with_dummies = [data.drop(columns, axis=1)]\n else:\n # Encoding only object and category dtype columns. Get remaining\n # columns to prepend to result.\n with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)]\n\n for (col, pre, sep) in zip(data_to_encode.iteritems(), prefix,\n prefix_sep):\n # col is (column_name, column), use just column data here\n dummy = _get_dummies_1d(col[1], prefix=pre, prefix_sep=sep,\n dummy_na=dummy_na, sparse=sparse,\n drop_first=drop_first, dtype=dtype)\n with_dummies.append(dummy)\n result = concat(with_dummies, axis=1)\n else:\n result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na,\n sparse=sparse,\n drop_first=drop_first,\n dtype=dtype)\n return result", "response": "Convert categorical variable into dummy - encoded indicators."} {"instruction": "Can you implement a function in Python 3 that\nconstructs 1 - 0 dummy variables corresponding to designated axis.", "response": "def make_axis_dummies(frame, axis='minor', transform=None):\n \"\"\"\n Construct 1-0 dummy variables corresponding to designated axis\n labels\n\n Parameters\n ----------\n frame : DataFrame\n axis : {'major', 'minor'}, default 'minor'\n transform : function, default None\n Function to apply to axis labels first. 
For example, to\n get \"day of week\" dummies in a time series regression\n you might call::\n\n make_axis_dummies(panel, axis='major',\n transform=lambda d: d.weekday())\n Returns\n -------\n dummies : DataFrame\n Column names taken from chosen axis\n \"\"\"\n numbers = {'major': 0, 'minor': 1}\n num = numbers.get(axis, axis)\n\n items = frame.index.levels[num]\n codes = frame.index.codes[num]\n if transform is not None:\n mapped_items = items.map(transform)\n codes, items = _factorize_from_iterable(mapped_items.take(codes))\n\n values = np.eye(len(items), dtype=float)\n values = values.take(codes, axis=0)\n\n return DataFrame(values, columns=items, index=frame.index)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _reorder_for_extension_array_stack(arr, n_rows, n_columns):\n # final take to get the order correct.\n # idx is an indexer like\n # [c0r0, c1r0, c2r0, ...,\n # c0r1, c1r1, c2r1, ...]\n idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel()\n return arr.take(idx)", "response": "Re - orders the values when stacking multiple extension - arrays."} {"instruction": "How would you implement a function in Python 3 that\nsplits a string into a set of key - value pairs.", "response": "def _split_line(s, parts):\n \"\"\"\n Parameters\n ----------\n s: string\n Fixed-length string to split\n parts: list of (name, length) pairs\n Used to break up string, name '_' will be filtered from output.\n\n Returns\n -------\n Dict of name:contents of string at given location.\n \"\"\"\n out = {}\n start = 0\n for name, length in parts:\n out[name] = s[start:start + length].strip()\n start += length\n del out['_']\n return out"} {"instruction": "Make a summary of the following Python 3 code\ndef _parse_float_vec(vec):\n\n dtype = np.dtype('>u4,>u4')\n vec1 = vec.view(dtype=dtype)\n xport1 = vec1['f0']\n xport2 = vec1['f1']\n\n # Start by setting first half of ieee number to first half of IBM\n # number sans 
exponent\n ieee1 = xport1 & 0x00ffffff\n\n # The fraction bit to the left of the binary point in the ieee\n # format was set and the number was shifted 0, 1, 2, or 3\n # places. This will tell us how to adjust the ibm exponent to be a\n # power of 2 ieee exponent and how to shift the fraction bits to\n # restore the correct magnitude.\n shift = np.zeros(len(vec), dtype=np.uint8)\n shift[np.where(xport1 & 0x00200000)] = 1\n shift[np.where(xport1 & 0x00400000)] = 2\n shift[np.where(xport1 & 0x00800000)] = 3\n\n # shift the ieee number down the correct number of places then\n # set the second half of the ieee number to be the second half\n # of the ibm number shifted appropriately, ored with the bits\n # from the first half that would have been shifted in if we\n # could shift a double. All we are worried about are the low\n # order 3 bits of the first half since we're only shifting by\n # 1, 2, or 3.\n ieee1 >>= shift\n ieee2 = (xport2 >> shift) | ((xport1 & 0x00000007) << (29 + (3 - shift)))\n\n # clear the 1 bit to the left of the binary point\n ieee1 &= 0xffefffff\n\n # set the exponent of the ieee number to be the actual exponent\n # plus the shift count + 1023. Or this into the first half of the\n # ieee number. The ibm exponent is excess 64 but is adjusted by 65\n # since during conversion to ibm format the exponent is\n # incremented by 1 and the fraction bits left 4 positions to the\n # right of the radix point. 
(had to add >> 24 because C treats &\n # 0x7f as 0x7f000000 and Python doesn't)\n ieee1 |= ((((((xport1 >> 24) & 0x7f) - 65) << 2) +\n shift + 1023) << 20) | (xport1 & 0x80000000)\n\n ieee = np.empty((len(ieee1),), dtype='>u4,>u4')\n ieee['f0'] = ieee1\n ieee['f1'] = ieee2\n ieee = ieee.view(dtype='>f8')\n ieee = ieee.astype('f8')\n\n return ieee", "response": "Parses a vector of float values representing IBM 8 byte floats into a base - 2 ieee sequence."} {"instruction": "Write a Python 3 script to\nreturn the number of records in the file.", "response": "def _record_count(self):\n \"\"\"\n Get number of records in file.\n\n This is maybe suboptimal because we have to seek to the end of\n the file.\n\n Side effect: returns file position to record_start.\n \"\"\"\n\n self.filepath_or_buffer.seek(0, 2)\n total_records_length = (self.filepath_or_buffer.tell() -\n self.record_start)\n\n if total_records_length % 80 != 0:\n warnings.warn(\"xport file may be corrupted\")\n\n if self.record_length > 80:\n self.filepath_or_buffer.seek(self.record_start)\n return total_records_length // self.record_length\n\n self.filepath_or_buffer.seek(-80, 2)\n last_card = self.filepath_or_buffer.read(80)\n last_card = np.frombuffer(last_card, dtype=np.uint64)\n\n # 8 byte blank\n ix = np.flatnonzero(last_card == 2314885530818453536)\n\n if len(ix) == 0:\n tail_pad = 0\n else:\n tail_pad = 8 * len(ix)\n\n self.filepath_or_buffer.seek(self.record_start)\n\n return (total_records_length - tail_pad) // self.record_length"} {"instruction": "Make a summary of the following Python 3 code\ndef get_chunk(self, size=None):\n if size is None:\n size = self._chunksize\n return self.read(nrows=size)", "response": "Reads lines from Xport file and returns as dataframe"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef construction_error(tot_items, block_shape, axes, e=None):\n passed = tuple(map(int, [tot_items] + list(block_shape)))\n # Correcting the user facing error 
message during dataframe construction\n if len(passed) <= 2:\n passed = passed[::-1]\n\n implied = tuple(len(ax) for ax in axes)\n # Correcting the user facing error message during dataframe construction\n if len(implied) <= 2:\n implied = implied[::-1]\n\n if passed == implied and e is not None:\n raise e\n if block_shape[0] == 0:\n raise ValueError(\"Empty data passed with indices specified.\")\n raise ValueError(\"Shape of passed values is {0}, indices imply {1}\".format(\n passed, implied))", "response": "raise a helpful message about our construction"} {"instruction": "Implement a Python 3 function for\nreturning a single array of a block that has a single dtype ; if dtype is None return a single array of a block that has a single dtype ; if dtype is None return a single array of a block that has a single dtype ; if dtype is None return a single array of a block that has a single dtype", "response": "def _simple_blockify(tuples, dtype):\n \"\"\" return a single array of a block that has a single dtype; if dtype is\n not None, coerce to this dtype\n \"\"\"\n values, placement = _stack_arrays(tuples, dtype)\n\n # CHECK DTYPE?\n if dtype is not None and values.dtype != dtype: # pragma: no cover\n values = values.astype(dtype)\n\n block = make_block(values, placement=placement)\n return [block]"} {"instruction": "How would you explain what the following Python 3 function does\ndef _multi_blockify(tuples, dtype=None):\n\n # group by dtype\n grouper = itertools.groupby(tuples, lambda x: x[2].dtype)\n\n new_blocks = []\n for dtype, tup_block in grouper:\n\n values, placement = _stack_arrays(list(tup_block), dtype)\n\n block = make_block(values, placement=placement)\n new_blocks.append(block)\n\n return new_blocks", "response": "return an array of blocks that potentially have different dtypes"} {"instruction": "Write a Python 3 function that can\nreturn an array of blocks that potentially have different dtypes", "response": "def _sparse_blockify(tuples, dtype=None):\n 
\"\"\" return an array of blocks that potentially have different dtypes (and\n are sparse)\n \"\"\"\n\n new_blocks = []\n for i, names, array in tuples:\n array = _maybe_to_sparse(array)\n block = make_block(array, placement=[i])\n new_blocks.append(block)\n\n return new_blocks"} {"instruction": "Write a Python 3 script to\nfind the common dtype for the given list of blocks.", "response": "def _interleaved_dtype(\n blocks: List[Block]\n) -> Optional[Union[np.dtype, ExtensionDtype]]:\n \"\"\"Find the common dtype for `blocks`.\n\n Parameters\n ----------\n blocks : List[Block]\n\n Returns\n -------\n dtype : Optional[Union[np.dtype, ExtensionDtype]]\n None is returned when `blocks` is empty.\n \"\"\"\n if not len(blocks):\n return None\n\n return find_common_type([b.dtype for b in blocks])"} {"instruction": "Create a Python 3 function to\nmerge blocks having same dtype exclude non - consolidating blocks", "response": "def _consolidate(blocks):\n \"\"\"\n Merge blocks having same dtype, exclude non-consolidating blocks\n \"\"\"\n\n # sort by _can_consolidate, dtype\n gkey = lambda x: x._consolidate_key\n grouper = itertools.groupby(sorted(blocks, key=gkey), gkey)\n\n new_blocks = []\n for (_can_consolidate, dtype), group_blocks in grouper:\n merged_blocks = _merge_blocks(list(group_blocks), dtype=dtype,\n _can_consolidate=_can_consolidate)\n new_blocks = _extend_blocks(merged_blocks, new_blocks)\n return new_blocks"} {"instruction": "How would you implement a function in Python 3 that\ncompares two array_like inputs of the same shape or two scalar values.", "response": "def _compare_or_regex_search(a, b, regex=False):\n \"\"\"\n Compare two array_like inputs of the same shape or two scalar values\n\n Calls operator.eq or re.search, depending on regex argument. 
If regex is\n True, perform an element-wise regex matching.\n\n Parameters\n ----------\n a : array_like or scalar\n b : array_like or scalar\n regex : bool, default False\n\n Returns\n -------\n mask : array_like of bool\n \"\"\"\n if not regex:\n op = lambda x: operator.eq(x, b)\n else:\n op = np.vectorize(lambda x: bool(re.search(b, x)) if isinstance(x, str)\n else False)\n\n is_a_array = isinstance(a, np.ndarray)\n is_b_array = isinstance(b, np.ndarray)\n\n # numpy deprecation warning to have i8 vs integer comparisons\n if is_datetimelike_v_numeric(a, b):\n result = False\n\n # numpy deprecation warning if comparing numeric vs string-like\n elif is_numeric_v_string_like(a, b):\n result = False\n else:\n result = op(a)\n\n if is_scalar(result) and (is_a_array or is_b_array):\n type_names = [type(a).__name__, type(b).__name__]\n\n if is_a_array:\n type_names[0] = 'ndarray(dtype={dtype})'.format(dtype=a.dtype)\n\n if is_b_array:\n type_names[1] = 'ndarray(dtype={dtype})'.format(dtype=b.dtype)\n\n raise TypeError(\n \"Cannot compare types {a!r} and {b!r}\".format(a=type_names[0],\n b=type_names[1]))\n return result"} {"instruction": "Make a summary of the following Python 3 code\ndef items_overlap_with_suffix(left, lsuffix, right, rsuffix):\n to_rename = left.intersection(right)\n if len(to_rename) == 0:\n return left, right\n else:\n if not lsuffix and not rsuffix:\n raise ValueError('columns overlap but no suffix specified: '\n '{rename}'.format(rename=to_rename))\n\n def renamer(x, suffix):\n \"\"\"Rename the left and right indices.\n\n If there is overlap, and suffix is not None, add\n suffix, otherwise, leave it as-is.\n\n Parameters\n ----------\n x : original column name\n suffix : str or None\n\n Returns\n -------\n x : renamed column name\n \"\"\"\n if x in to_rename and suffix is not None:\n return '{x}{suffix}'.format(x=x, suffix=suffix)\n return x\n\n lrenamer = partial(renamer, suffix=lsuffix)\n rrenamer = partial(renamer, suffix=rsuffix)\n\n return 
(_transform_index(left, lrenamer),\n _transform_index(right, rrenamer))", "response": "Returns two lists of items that overlap with the given suffix."} {"instruction": "How would you implement a function in Python 3 that\napplies function to all values found in index.", "response": "def _transform_index(index, func, level=None):\n \"\"\"\n Apply function to all values found in index.\n\n This includes transforming multiindex entries separately.\n Only apply function to one level of the MultiIndex if level is specified.\n\n \"\"\"\n if isinstance(index, MultiIndex):\n if level is not None:\n items = [tuple(func(y) if i == level else y\n for i, y in enumerate(x)) for x in index]\n else:\n items = [tuple(func(y) for y in x) for x in index]\n return MultiIndex.from_tuples(items, names=index.names)\n else:\n items = [func(x) for x in index]\n return Index(items, name=index.name, tupleize_cols=False)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):\n concat_plans = [get_mgr_concatenation_plan(mgr, indexers)\n for mgr, indexers in mgrs_indexers]\n concat_plan = combine_concat_plans(concat_plans, concat_axis)\n blocks = []\n\n for placement, join_units in concat_plan:\n\n if len(join_units) == 1 and not join_units[0].indexers:\n b = join_units[0].block\n values = b.values\n if copy:\n values = values.copy()\n elif not copy:\n values = values.view()\n b = b.make_block_same_class(values, placement=placement)\n elif is_uniform_join_units(join_units):\n b = join_units[0].block.concat_same_type(\n [ju.block for ju in join_units], placement=placement)\n else:\n b = make_block(\n concatenate_join_units(join_units, concat_axis, copy=copy),\n placement=placement)\n blocks.append(b)\n\n return BlockManager(blocks, axes)", "response": "Concatenate block managers into one."} {"instruction": "Write a Python 3 script for\nmaking an empty BlockManager with the items axis of len 0", 
"response": "def make_empty(self, axes=None):\n \"\"\" return an empty BlockManager with the items axis of len 0 \"\"\"\n if axes is None:\n axes = [ensure_index([])] + [ensure_index(a)\n for a in self.axes[1:]]\n\n # preserve dtype if possible\n if self.ndim == 1:\n blocks = np.array([], dtype=self.array_dtype)\n else:\n blocks = []\n return self.__class__(blocks, axes)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef rename_axis(self, mapper, axis, copy=True, level=None):\n obj = self.copy(deep=copy)\n obj.set_axis(axis, _transform_index(self.axes[axis], mapper, level))\n return obj", "response": "Returns a copy of the current object with the new axis renamed."} {"instruction": "Implement a function in Python 3 to\nrebuild the internal _blknos and _blklocs arrays.", "response": "def _rebuild_blknos_and_blklocs(self):\n \"\"\"\n Update mgr._blknos / mgr._blklocs.\n \"\"\"\n new_blknos = np.empty(self.shape[0], dtype=np.int64)\n new_blklocs = np.empty(self.shape[0], dtype=np.int64)\n new_blknos.fill(-1)\n new_blklocs.fill(-1)\n\n for blkno, blk in enumerate(self.blocks):\n rl = blk.mgr_locs\n new_blknos[rl.indexer] = blkno\n new_blklocs[rl.indexer] = np.arange(len(rl))\n\n if (new_blknos == -1).any():\n raise AssertionError(\"Gaps in blk ref_locs\")\n\n self._blknos = new_blknos\n self._blklocs = new_blklocs"} {"instruction": "Create a Python 3 function for\nreturning a dict of the counts of the function f in BlockManager", "response": "def _get_counts(self, f):\n \"\"\" return a dict of the counts of the function in BlockManager \"\"\"\n self._consolidate_inplace()\n counts = dict()\n for b in self.blocks:\n v = f(b)\n counts[v] = counts.get(v, 0) + b.shape[0]\n return counts"} {"instruction": "How would you explain what the following Python 3 function does\ndef apply(self, f, axes=None, filter=None, do_integrity_check=False,\n consolidate=True, **kwargs):\n \"\"\"\n iterate over the blocks, collect and create a new block 
manager\n\n Parameters\n ----------\n f : the callable or function name to operate on at the block level\n axes : optional (if not supplied, use self.axes)\n filter : list, if supplied, only call the block if the filter is in\n the block\n do_integrity_check : boolean, default False. Do the block manager\n integrity check\n consolidate: boolean, default True. Join together blocks having same\n dtype\n\n Returns\n -------\n Block Manager (new object)\n\n \"\"\"\n\n result_blocks = []\n\n # filter kwarg is used in replace-* family of methods\n if filter is not None:\n filter_locs = set(self.items.get_indexer_for(filter))\n if len(filter_locs) == len(self.items):\n # All items are included, as if there were no filtering\n filter = None\n else:\n kwargs['filter'] = filter_locs\n\n if consolidate:\n self._consolidate_inplace()\n\n if f == 'where':\n align_copy = True\n if kwargs.get('align', True):\n align_keys = ['other', 'cond']\n else:\n align_keys = ['cond']\n elif f == 'putmask':\n align_copy = False\n if kwargs.get('align', True):\n align_keys = ['new', 'mask']\n else:\n align_keys = ['mask']\n elif f == 'fillna':\n # fillna internally does putmask, maybe it's better to do this\n # at mgr, not block level?\n align_copy = False\n align_keys = ['value']\n else:\n align_keys = []\n\n # TODO(EA): may interfere with ExtensionBlock.setitem for blocks\n # with a .values attribute.\n aligned_args = {k: kwargs[k]\n for k in align_keys\n if hasattr(kwargs[k], 'values') and\n not isinstance(kwargs[k], ABCExtensionArray)}\n\n for b in self.blocks:\n if filter is not None:\n if not b.mgr_locs.isin(filter_locs).any():\n result_blocks.append(b)\n continue\n\n if aligned_args:\n b_items = self.items[b.mgr_locs.indexer]\n\n for k, obj in aligned_args.items():\n axis = getattr(obj, '_info_axis_number', 0)\n kwargs[k] = obj.reindex(b_items, axis=axis,\n copy=align_copy)\n\n applied = getattr(b, f)(**kwargs)\n result_blocks = _extend_blocks(applied, result_blocks)\n\n if 
len(result_blocks) == 0:\n return self.make_empty(axes or self.axes)\n bm = self.__class__(result_blocks, axes or self.axes,\n do_integrity_check=do_integrity_check)\n bm._consolidate_inplace()\n return bm", "response": "This function will iterate over the blocks and create a new block manager for the items in the block level."} {"instruction": "Given the following Python 3 function, write the documentation\ndef quantile(self, axis=0, consolidate=True, transposed=False,\n interpolation='linear', qs=None, numeric_only=None):\n \"\"\"\n Iterate over blocks applying quantile reduction.\n This routine is intended for reduction type operations and\n will do inference on the generated blocks.\n\n Parameters\n ----------\n axis: reduction axis, default 0\n consolidate: boolean, default True. Join together blocks having same\n dtype\n transposed: boolean, default False\n we are holding transposed data\n interpolation : type of interpolation, default 'linear'\n qs : a scalar or list of the quantiles to be computed\n numeric_only : ignored\n\n Returns\n -------\n Block Manager (new object)\n \"\"\"\n\n # Series dispatches to DataFrame for quantile, which allows us to\n # simplify some of the code here and in the blocks\n assert self.ndim >= 2\n\n if consolidate:\n self._consolidate_inplace()\n\n def get_axe(block, qs, axes):\n from pandas import Float64Index\n if is_list_like(qs):\n ax = Float64Index(qs)\n elif block.ndim == 1:\n ax = Float64Index([qs])\n else:\n ax = axes[0]\n return ax\n\n axes, blocks = [], []\n for b in self.blocks:\n block = b.quantile(axis=axis, qs=qs, interpolation=interpolation)\n\n axe = get_axe(b, qs, axes=self.axes)\n\n axes.append(axe)\n blocks.append(block)\n\n # note that some DatetimeTZ, Categorical are always ndim==1\n ndim = {b.ndim for b in blocks}\n assert 0 not in ndim, ndim\n\n if 2 in ndim:\n\n new_axes = list(self.axes)\n\n # multiple blocks that are reduced\n if len(blocks) > 1:\n new_axes[1] = axes[0]\n\n # reset the placement to the 
original\n for b, sb in zip(blocks, self.blocks):\n b.mgr_locs = sb.mgr_locs\n\n else:\n new_axes[axis] = Index(np.concatenate(\n [ax.values for ax in axes]))\n\n if transposed:\n new_axes = new_axes[::-1]\n blocks = [b.make_block(b.values.T,\n placement=np.arange(b.shape[1])\n ) for b in blocks]\n\n return self.__class__(blocks, new_axes)\n\n # single block, i.e. ndim == {1}\n values = _concat._concat_compat([b.values for b in blocks])\n\n # compute the orderings of our original data\n if len(self.blocks) > 1:\n\n indexer = np.empty(len(self.axes[0]), dtype=np.intp)\n i = 0\n for b in self.blocks:\n for j in b.mgr_locs:\n indexer[j] = i\n i = i + 1\n\n values = values.take(indexer)\n\n return SingleBlockManager(\n [make_block(values,\n ndim=1,\n placement=np.arange(len(values)))],\n axes[0])", "response": "Returns a new object with the values of the quantile of the current object."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef replace_list(self, src_list, dest_list, inplace=False, regex=False):\n\n inplace = validate_bool_kwarg(inplace, 'inplace')\n\n # figure out our mask a-priori to avoid repeated replacements\n values = self.as_array()\n\n def comp(s, regex=False):\n \"\"\"\n Generate a bool array by perform an equality check, or perform\n an element-wise regular expression matching\n \"\"\"\n if isna(s):\n return isna(values)\n if hasattr(s, 'asm8'):\n return _compare_or_regex_search(maybe_convert_objects(values),\n getattr(s, 'asm8'), regex)\n return _compare_or_regex_search(values, s, regex)\n\n masks = [comp(s, regex) for i, s in enumerate(src_list)]\n\n result_blocks = []\n src_len = len(src_list) - 1\n for blk in self.blocks:\n\n # its possible to get multiple result blocks here\n # replace ALWAYS will return a list\n rb = [blk if inplace else blk.copy()]\n for i, (s, d) in enumerate(zip(src_list, dest_list)):\n new_rb = []\n for b in rb:\n m = masks[i][b.mgr_locs.indexer]\n convert = i == src_len\n result = 
b._replace_coerce(mask=m, to_replace=s, value=d,\n inplace=inplace,\n convert=convert, regex=regex)\n if m.any():\n new_rb = _extend_blocks(result, new_rb)\n else:\n new_rb.append(b)\n rb = new_rb\n result_blocks.extend(rb)\n\n bm = self.__class__(result_blocks, self.axes)\n bm._consolidate_inplace()\n return bm", "response": "replace a list of items in src_list with dest_list"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef get_bool_data(self, copy=False):\n self._consolidate_inplace()\n return self.combine([b for b in self.blocks if b.is_bool], copy)", "response": "Returns a list of bool data."} {"instruction": "Write a Python 3 function for\nreturning a list of numeric data blocks.", "response": "def get_numeric_data(self, copy=False):\n \"\"\"\n Parameters\n ----------\n copy : boolean, default False\n Whether to copy the blocks\n \"\"\"\n self._consolidate_inplace()\n return self.combine([b for b in self.blocks if b.is_numeric], copy)"} {"instruction": "How would you explain what the following Python 3 function does\ndef combine(self, blocks, copy=True):\n if len(blocks) == 0:\n return self.make_empty()\n\n # FIXME: optimization potential\n indexer = np.sort(np.concatenate([b.mgr_locs.as_array\n for b in blocks]))\n inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0])\n\n new_blocks = []\n for b in blocks:\n b = b.copy(deep=copy)\n b.mgr_locs = algos.take_1d(inv_indexer, b.mgr_locs.as_array,\n axis=0, allow_fill=False)\n new_blocks.append(b)\n\n axes = list(self.axes)\n axes[0] = self.items.take(indexer)\n\n return self.__class__(new_blocks, axes, do_integrity_check=False)", "response": "return a new manager with the blocks"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef copy(self, deep=True):\n # this preserves the notion of view copying of axes\n if deep:\n if deep == 'all':\n copy = lambda ax: ax.copy(deep=True)\n else:\n copy = lambda ax: ax.view()\n 
new_axes = [copy(ax) for ax in self.axes]\n else:\n new_axes = list(self.axes)\n return self.apply('copy', axes=new_axes, deep=deep,\n do_integrity_check=False)", "response": "Make deep or shallow copy of the current object"} {"instruction": "Can you create a Python 3 function that\nconverts the blockmanager data into an array.", "response": "def as_array(self, transpose=False, items=None):\n \"\"\"Convert the blockmanager data into an numpy array.\n\n Parameters\n ----------\n transpose : boolean, default False\n If True, transpose the return array\n items : list of strings or None\n Names of block items that will be included in the returned\n array. ``None`` means that all block items will be used\n\n Returns\n -------\n arr : ndarray\n \"\"\"\n if len(self.blocks) == 0:\n arr = np.empty(self.shape, dtype=float)\n return arr.transpose() if transpose else arr\n\n if items is not None:\n mgr = self.reindex_axis(items, axis=0)\n else:\n mgr = self\n\n if self._is_single_block and mgr.blocks[0].is_datetimetz:\n # TODO(Block.get_values): Make DatetimeTZBlock.get_values\n # always be object dtype. Some callers seem to want the\n # DatetimeArray (previously DTI)\n arr = mgr.blocks[0].get_values(dtype=object)\n elif self._is_single_block or not self.is_mixed_type:\n arr = np.asarray(mgr.blocks[0].get_values())\n else:\n arr = mgr._interleave()\n\n return arr.transpose() if transpose else arr"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _interleave(self):\n from pandas.core.dtypes.common import is_sparse\n dtype = _interleaved_dtype(self.blocks)\n\n # TODO: https://github.com/pandas-dev/pandas/issues/22791\n # Give EAs some input on what happens here. 
Sparse needs this.\n if is_sparse(dtype):\n dtype = dtype.subtype\n elif is_extension_array_dtype(dtype):\n dtype = 'object'\n\n result = np.empty(self.shape, dtype=dtype)\n\n itemmask = np.zeros(self.shape[0])\n\n for blk in self.blocks:\n rl = blk.mgr_locs\n result[rl.indexer] = blk.get_values(dtype)\n itemmask[rl.indexer] = 1\n\n if not itemmask.all():\n raise AssertionError('Some items were not contained in blocks')\n\n return result", "response": "Interleave the items in the blocks with specified item order\n "} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef to_dict(self, copy=True):\n self._consolidate_inplace()\n\n bd = {}\n for b in self.blocks:\n bd.setdefault(str(b.dtype), []).append(b)\n\n return {dtype: self.combine(blocks, copy=copy)\n for dtype, blocks in bd.items()}", "response": "Returns a dict of dtype - > BlockManager"} {"instruction": "Create a Python 3 function to\nget a cross sectional for a given location in the items ; handle dups return the result, is *could* be a view in the case of a single block", "response": "def fast_xs(self, loc):\n \"\"\"\n get a cross sectional for a given location in the\n items ; handle dups\n\n return the result, is *could* be a view in the case of a\n single block\n \"\"\"\n if len(self.blocks) == 1:\n return self.blocks[0].iget((slice(None), loc))\n\n items = self.items\n\n # non-unique (GH4726)\n if not items.is_unique:\n result = self._interleave()\n if self.ndim == 2:\n result = result.T\n return result[loc]\n\n # unique\n dtype = _interleaved_dtype(self.blocks)\n\n n = len(items)\n if is_extension_array_dtype(dtype):\n # we'll eventually construct an ExtensionArray.\n result = np.empty(n, dtype=object)\n else:\n result = np.empty(n, dtype=dtype)\n\n for blk in self.blocks:\n # Such assignment may incorrectly coerce NaT to None\n # result[blk.mgr_locs] = blk._slice((slice(None), loc))\n for i, rl in enumerate(blk.mgr_locs):\n result[rl] = 
blk._try_coerce_result(blk.iget((i, loc)))\n\n if is_extension_array_dtype(dtype):\n result = dtype.construct_array_type()._from_sequence(\n result, dtype=dtype\n )\n\n return result"} {"instruction": "Given the following Python 3 function, write the documentation\ndef consolidate(self):\n if self.is_consolidated():\n return self\n\n bm = self.__class__(self.blocks, self.axes)\n bm._is_consolidated = False\n bm._consolidate_inplace()\n return bm", "response": "Returns a new object with the same blocks and axes."} {"instruction": "Given the following Python 3 function, write the documentation\ndef get(self, item, fastpath=True):\n if self.items.is_unique:\n\n if not isna(item):\n loc = self.items.get_loc(item)\n else:\n indexer = np.arange(len(self.items))[isna(self.items)]\n\n # allow a single nan location indexer\n if not is_scalar(indexer):\n if len(indexer) == 1:\n loc = indexer.item()\n else:\n raise ValueError(\"cannot label index with a null key\")\n\n return self.iget(loc, fastpath=fastpath)\n else:\n\n if isna(item):\n raise TypeError(\"cannot label index with a null key\")\n\n indexer = self.items.get_indexer_for([item])\n return self.reindex_indexer(new_axis=self.items[indexer],\n indexer=indexer, axis=0,\n allow_dups=True)", "response": "Return values for the specified item."} {"instruction": "How would you implement a function in Python 3 that\nreturns the data as a SingleBlockManager or ndarray.", "response": "def iget(self, i, fastpath=True):\n \"\"\"\n Return the data as a SingleBlockManager if fastpath=True and possible\n\n Otherwise return as a ndarray\n \"\"\"\n block = self.blocks[self._blknos[i]]\n values = block.iget(self._blklocs[i])\n if not fastpath or not block._box_to_block_values or values.ndim != 1:\n return values\n\n # fastpath shortcut for select a single-dim from a 2-dim BM\n return SingleBlockManager(\n [block.make_block_same_class(values,\n placement=slice(0, len(values)),\n ndim=1)],\n self.axes[1])"} {"instruction": "Make a 
summary of the following Python 3 code\ndef delete(self, item):\n indexer = self.items.get_loc(item)\n\n is_deleted = np.zeros(self.shape[0], dtype=np.bool_)\n is_deleted[indexer] = True\n ref_loc_offset = -is_deleted.cumsum()\n\n is_blk_deleted = [False] * len(self.blocks)\n\n if isinstance(indexer, int):\n affected_start = indexer\n else:\n affected_start = is_deleted.nonzero()[0][0]\n\n for blkno, _ in _fast_count_smallints(self._blknos[affected_start:]):\n blk = self.blocks[blkno]\n bml = blk.mgr_locs\n blk_del = is_deleted[bml.indexer].nonzero()[0]\n\n if len(blk_del) == len(bml):\n is_blk_deleted[blkno] = True\n continue\n elif len(blk_del) != 0:\n blk.delete(blk_del)\n bml = blk.mgr_locs\n\n blk.mgr_locs = bml.add(ref_loc_offset[bml.indexer])\n\n # FIXME: use Index.delete as soon as it uses fastpath=True\n self.axes[0] = self.items[~is_deleted]\n self.blocks = tuple(b for blkno, b in enumerate(self.blocks)\n if not is_blk_deleted[blkno])\n self._shape = None\n self._rebuild_blknos_and_blklocs()", "response": "Delete the item from the cache."} {"instruction": "Implement a Python 3 function for\nsetting new item in - place. Does not consolidate. Adds new Block if not contained in the current set.", "response": "def set(self, item, value):\n \"\"\"\n Set new item in-place. Does not consolidate. 
Adds new Block if not\n contained in the current set of items\n \"\"\"\n # FIXME: refactor, clearly separate broadcasting & zip-like assignment\n # can prob also fix the various if tests for sparse/categorical\n\n # TODO(EA): Remove an is_extension_ when all extension types satisfy\n # the interface\n value_is_extension_type = (is_extension_type(value) or\n is_extension_array_dtype(value))\n\n # categorical/spares/datetimetz\n if value_is_extension_type:\n\n def value_getitem(placement):\n return value\n else:\n if value.ndim == self.ndim - 1:\n value = _safe_reshape(value, (1,) + value.shape)\n\n def value_getitem(placement):\n return value\n else:\n\n def value_getitem(placement):\n return value[placement.indexer]\n\n if value.shape[1:] != self.shape[1:]:\n raise AssertionError('Shape of new values must be compatible '\n 'with manager shape')\n\n try:\n loc = self.items.get_loc(item)\n except KeyError:\n # This item wasn't present, just insert at end\n self.insert(len(self.items), item, value)\n return\n\n if isinstance(loc, int):\n loc = [loc]\n\n blknos = self._blknos[loc]\n blklocs = self._blklocs[loc].copy()\n\n unfit_mgr_locs = []\n unfit_val_locs = []\n removed_blknos = []\n for blkno, val_locs in libinternals.get_blkno_placements(blknos,\n self.nblocks,\n group=True):\n blk = self.blocks[blkno]\n blk_locs = blklocs[val_locs.indexer]\n if blk.should_store(value):\n blk.set(blk_locs, value_getitem(val_locs))\n else:\n unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])\n unfit_val_locs.append(val_locs)\n\n # If all block items are unfit, schedule the block for removal.\n if len(val_locs) == len(blk.mgr_locs):\n removed_blknos.append(blkno)\n else:\n self._blklocs[blk.mgr_locs.indexer] = -1\n blk.delete(blk_locs)\n self._blklocs[blk.mgr_locs.indexer] = np.arange(len(blk))\n\n if len(removed_blknos):\n # Remove blocks & update blknos accordingly\n is_deleted = np.zeros(self.nblocks, dtype=np.bool_)\n is_deleted[removed_blknos] = True\n\n new_blknos = 
np.empty(self.nblocks, dtype=np.int64)\n new_blknos.fill(-1)\n new_blknos[~is_deleted] = np.arange(self.nblocks -\n len(removed_blknos))\n self._blknos = algos.take_1d(new_blknos, self._blknos, axis=0,\n allow_fill=False)\n self.blocks = tuple(blk for i, blk in enumerate(self.blocks)\n if i not in set(removed_blknos))\n\n if unfit_val_locs:\n unfit_mgr_locs = np.concatenate(unfit_mgr_locs)\n unfit_count = len(unfit_mgr_locs)\n\n new_blocks = []\n if value_is_extension_type:\n # This code (ab-)uses the fact that sparse blocks contain only\n # one item.\n new_blocks.extend(\n make_block(values=value.copy(), ndim=self.ndim,\n placement=slice(mgr_loc, mgr_loc + 1))\n for mgr_loc in unfit_mgr_locs)\n\n self._blknos[unfit_mgr_locs] = (np.arange(unfit_count) +\n len(self.blocks))\n self._blklocs[unfit_mgr_locs] = 0\n\n else:\n # unfit_val_locs contains BlockPlacement objects\n unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:])\n\n new_blocks.append(\n make_block(values=value_getitem(unfit_val_items),\n ndim=self.ndim, placement=unfit_mgr_locs))\n\n self._blknos[unfit_mgr_locs] = len(self.blocks)\n self._blklocs[unfit_mgr_locs] = np.arange(unfit_count)\n\n self.blocks += tuple(new_blocks)\n\n # Newly created block's dtype may already be present.\n self._known_consolidated = False"} {"instruction": "Explain what the following Python 3 code does\ndef insert(self, loc, item, value, allow_duplicates=False):\n if not allow_duplicates and item in self.items:\n # Should this be a different kind of error??\n raise ValueError('cannot insert {}, already exists'.format(item))\n\n if not isinstance(loc, int):\n raise TypeError(\"loc must be int\")\n\n # insert to the axis; this could possibly raise a TypeError\n new_axis = self.items.insert(loc, item)\n\n block = make_block(values=value, ndim=self.ndim,\n placement=slice(loc, loc + 1))\n\n for blkno, count in _fast_count_smallints(self._blknos[loc:]):\n blk = self.blocks[blkno]\n if count == len(blk.mgr_locs):\n 
blk.mgr_locs = blk.mgr_locs.add(1)\n else:\n new_mgr_locs = blk.mgr_locs.as_array.copy()\n new_mgr_locs[new_mgr_locs >= loc] += 1\n blk.mgr_locs = new_mgr_locs\n\n if loc == self._blklocs.shape[0]:\n # np.append is a lot faster, let's use it if we can.\n self._blklocs = np.append(self._blklocs, 0)\n self._blknos = np.append(self._blknos, len(self.blocks))\n else:\n self._blklocs = np.insert(self._blklocs, loc, 0)\n self._blknos = np.insert(self._blknos, loc, len(self.blocks))\n\n self.axes[0] = new_axis\n self.blocks += (block,)\n self._shape = None\n\n self._known_consolidated = False\n\n if len(self.blocks) > 100:\n self._consolidate_inplace()", "response": "Insert an item at a specific location in the cache."} {"instruction": "Create a Python 3 function to\nconform block manager to new index.", "response": "def reindex_axis(self, new_index, axis, method=None, limit=None,\n fill_value=None, copy=True):\n \"\"\"\n Conform block manager to new index.\n \"\"\"\n new_index = ensure_index(new_index)\n new_index, indexer = self.axes[axis].reindex(new_index, method=method,\n limit=limit)\n\n return self.reindex_indexer(new_index, indexer, axis=axis,\n fill_value=fill_value, copy=copy)"} {"instruction": "Can you implement a function in Python 3 that\nreturns a new object with new_axis indexed by indexer.", "response": "def reindex_indexer(self, new_axis, indexer, axis, fill_value=None,\n allow_dups=False, copy=True):\n \"\"\"\n Parameters\n ----------\n new_axis : Index\n indexer : ndarray of int64 or None\n axis : int\n fill_value : object\n allow_dups : bool\n\n pandas-indexer with -1's only.\n \"\"\"\n if indexer is None:\n if new_axis is self.axes[axis] and not copy:\n return self\n\n result = self.copy(deep=copy)\n result.axes = list(self.axes)\n result.axes[axis] = new_axis\n return result\n\n self._consolidate_inplace()\n\n # some axes don't allow reindexing with dups\n if not allow_dups:\n self.axes[axis]._can_reindex(indexer)\n\n if axis >= self.ndim:\n raise 
IndexError(\"Requested axis not found in manager\")\n\n if axis == 0:\n new_blocks = self._slice_take_blocks_ax0(indexer,\n fill_tuple=(fill_value,))\n else:\n new_blocks = [blk.take_nd(indexer, axis=axis, fill_tuple=(\n fill_value if fill_value is not None else blk.fill_value,))\n for blk in self.blocks]\n\n new_axes = list(self.axes)\n new_axes[axis] = new_axis\n return self.__class__(new_blocks, new_axes)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None):\n\n allow_fill = fill_tuple is not None\n\n sl_type, slobj, sllen = _preprocess_slice_or_indexer(\n slice_or_indexer, self.shape[0], allow_fill=allow_fill)\n\n if self._is_single_block:\n blk = self.blocks[0]\n\n if sl_type in ('slice', 'mask'):\n return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))]\n elif not allow_fill or self.ndim == 1:\n if allow_fill and fill_tuple[0] is None:\n _, fill_value = maybe_promote(blk.dtype)\n fill_tuple = (fill_value, )\n\n return [blk.take_nd(slobj, axis=0,\n new_mgr_locs=slice(0, sllen),\n fill_tuple=fill_tuple)]\n\n if sl_type in ('slice', 'mask'):\n blknos = self._blknos[slobj]\n blklocs = self._blklocs[slobj]\n else:\n blknos = algos.take_1d(self._blknos, slobj, fill_value=-1,\n allow_fill=allow_fill)\n blklocs = algos.take_1d(self._blklocs, slobj, fill_value=-1,\n allow_fill=allow_fill)\n\n # When filling blknos, make sure blknos is updated before appending to\n # blocks list, that way new blkno is exactly len(blocks).\n #\n # FIXME: mgr_groupby_blknos must return mgr_locs in ascending order,\n # pytables serialization will break otherwise.\n blocks = []\n for blkno, mgr_locs in libinternals.get_blkno_placements(blknos,\n self.nblocks,\n group=True):\n if blkno == -1:\n # If we've got here, fill_tuple was not None.\n fill_value = fill_tuple[0]\n\n blocks.append(self._make_na_block(placement=mgr_locs,\n fill_value=fill_value))\n else:\n blk = self.blocks[blkno]\n\n # 
Otherwise, slicing along items axis is necessary.\n if not blk._can_consolidate:\n # A non-consolidatable block, it's easy, because there's\n # only one item and each mgr loc is a copy of that single\n # item.\n for mgr_loc in mgr_locs:\n newblk = blk.copy(deep=True)\n newblk.mgr_locs = slice(mgr_loc, mgr_loc + 1)\n blocks.append(newblk)\n\n else:\n blocks.append(blk.take_nd(blklocs[mgr_locs.indexer],\n axis=0, new_mgr_locs=mgr_locs,\n fill_tuple=None))\n\n return blocks", "response": "Slice and take blocks along axis = 0."} {"instruction": "How would you explain what the following Python 3 function does\ndef take(self, indexer, axis=1, verify=True, convert=True):\n self._consolidate_inplace()\n indexer = (np.arange(indexer.start, indexer.stop, indexer.step,\n dtype='int64')\n if isinstance(indexer, slice)\n else np.asanyarray(indexer, dtype='int64'))\n\n n = self.shape[axis]\n if convert:\n indexer = maybe_convert_indices(indexer, n)\n\n if verify:\n if ((indexer == -1) | (indexer >= n)).any():\n raise Exception('Indices must be nonzero and less than '\n 'the axis length')\n\n new_labels = self.axes[axis].take(indexer)\n return self.reindex_indexer(new_axis=new_labels, indexer=indexer,\n axis=axis, allow_dups=True)", "response": "Take items along any axis."} {"instruction": "Write a Python 3 script to\nreturn a new blockmanager with all blocks unstacked.", "response": "def unstack(self, unstacker_func, fill_value):\n \"\"\"Return a blockmanager with all blocks unstacked.\n\n Parameters\n ----------\n unstacker_func : callable\n A (partially-applied) ``pd.core.reshape._Unstacker`` class.\n fill_value : Any\n fill_value for newly introduced missing values.\n\n Returns\n -------\n unstacked : BlockManager\n \"\"\"\n n_rows = self.shape[-1]\n dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items)\n new_columns = dummy.get_new_columns()\n new_index = dummy.get_new_index()\n new_blocks = []\n columns_mask = []\n\n for blk in self.blocks:\n blocks, mask = 
blk._unstack(\n partial(unstacker_func,\n value_columns=self.items[blk.mgr_locs.indexer]),\n new_columns,\n n_rows,\n fill_value\n )\n\n new_blocks.extend(blocks)\n columns_mask.extend(mask)\n\n new_columns = new_columns[columns_mask]\n\n bm = BlockManager(new_blocks, [new_columns, new_index])\n return bm"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef delete(self, item):\n loc = self.items.get_loc(item)\n self._block.delete(loc)\n self.axes[0] = self.axes[0].delete(loc)", "response": "Delete single item from SingleBlockManager."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef concat(self, to_concat, new_axis):\n non_empties = [x for x in to_concat if len(x) > 0]\n\n # check if all series are of the same block type:\n if len(non_empties) > 0:\n blocks = [obj.blocks[0] for obj in non_empties]\n if len({b.dtype for b in blocks}) == 1:\n new_block = blocks[0].concat_same_type(blocks)\n else:\n values = [x.values for x in blocks]\n values = _concat._concat_compat(values)\n new_block = make_block(\n values, placement=slice(0, len(values), 1))\n else:\n values = [x._block.values for x in to_concat]\n values = _concat._concat_compat(values)\n new_block = make_block(\n values, placement=slice(0, len(values), 1))\n\n mgr = SingleBlockManager(new_block, new_axis)\n return mgr", "response": "Concatenate a list of SingleBlockManagers into a single - block manager."} {"instruction": "How would you code a function in Python 3 to\nconstruct a SparseSeries from a NumPy array.", "response": "def from_array(cls, arr, index=None, name=None, copy=False,\n fill_value=None, fastpath=False):\n \"\"\"Construct SparseSeries from array.\n\n .. deprecated:: 0.23.0\n Use the pd.SparseSeries(..) constructor instead.\n \"\"\"\n warnings.warn(\"'from_array' is deprecated and will be removed in a \"\n \"future version. Please use the pd.SparseSeries(..) 
\"\n \"constructor instead.\", FutureWarning, stacklevel=2)\n return cls(arr, index=index, name=name, copy=copy,\n fill_value=fill_value, fastpath=fastpath)"} {"instruction": "Write a Python 3 script for\nreturning my self as a sparse array", "response": "def as_sparse_array(self, kind=None, fill_value=None, copy=False):\n \"\"\" return my self as a sparse array, do not copy by default \"\"\"\n\n if fill_value is None:\n fill_value = self.fill_value\n if kind is None:\n kind = self.kind\n return SparseArray(self.values, sparse_index=self.sp_index,\n fill_value=fill_value, kind=kind, copy=copy)"} {"instruction": "Can you create a Python 3 function that\nperforms a reduction operation on the object", "response": "def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,\n filter_type=None, **kwds):\n \"\"\" perform a reduction operation \"\"\"\n return op(self.get_values(), skipna=skipna, **kwds)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _ixs(self, i, axis=0):\n label = self.index[i]\n if isinstance(label, Index):\n return self.take(i, axis=axis)\n else:\n return self._get_val_at(i)", "response": "Return the i - th value or values in the SparseSeries by location"} {"instruction": "Can you create a Python 3 function that\nreturns an object with absolute value taken. Only applicable to objects that are all numeric", "response": "def abs(self):\n \"\"\"\n Return an object with absolute value taken. Only applicable to objects\n that are all numeric\n\n Returns\n -------\n abs: same type as caller\n \"\"\"\n return self._constructor(np.abs(self.values),\n index=self.index).__finalize__(self)"} {"instruction": "Can you write a function in Python 3 where it\nreturns the value occupying the requested label default to specified Value if not present.", "response": "def get(self, label, default=None):\n \"\"\"\n Returns value occupying requested label, default to specified\n missing value if not present. 
Analogous to dict.get\n\n Parameters\n ----------\n label : object\n Label value looking for\n default : object, optional\n Value to return if label not in index\n\n Returns\n -------\n y : scalar\n \"\"\"\n if label in self.index:\n loc = self.index.get_loc(label)\n return self._get_val_at(loc)\n else:\n return default"} {"instruction": "Create a Python 3 function for\ngetting a value from the cache at the specified label.", "response": "def get_value(self, label, takeable=False):\n \"\"\"\n Retrieve single value at passed index label\n\n .. deprecated:: 0.21.0\n\n Please use .at[] or .iat[] accessors.\n\n Parameters\n ----------\n index : label\n takeable : interpret the index as indexers, default False\n\n Returns\n -------\n value : scalar value\n \"\"\"\n warnings.warn(\"get_value is deprecated and will be removed \"\n \"in a future release. Please use \"\n \".at[] or .iat[] accessors instead\", FutureWarning,\n stacklevel=2)\n\n return self._get_value(label, takeable=takeable)"} {"instruction": "Implement a Python 3 function for\nconverting SparseSeries to a Series.", "response": "def to_dense(self):\n \"\"\"\n Convert SparseSeries to a Series.\n\n Returns\n -------\n s : Series\n \"\"\"\n return Series(self.values.to_dense(), index=self.index,\n name=self.name)"} {"instruction": "Write a Python 3 function that can\nmake a copy of the SparseSeries.", "response": "def copy(self, deep=True):\n \"\"\"\n Make a copy of the SparseSeries. 
Only the actual sparse values need to\n be copied\n \"\"\"\n # TODO: https://github.com/pandas-dev/pandas/issues/22314\n # We skip the block manager till that is resolved.\n new_data = self.values.copy(deep=deep)\n return self._constructor(new_data, sparse_index=self.sp_index,\n fill_value=self.fill_value,\n index=self.index.copy(),\n name=self.name).__finalize__(self)"} {"instruction": "Write a Python 3 function for\nconforming sparse values to new SparseIndex", "response": "def sparse_reindex(self, new_index):\n \"\"\"\n Conform sparse values to new SparseIndex\n\n Parameters\n ----------\n new_index : {BlockIndex, IntIndex}\n\n Returns\n -------\n reindexed : SparseSeries\n \"\"\"\n if not isinstance(new_index, splib.SparseIndex):\n raise TypeError(\"new index must be a SparseIndex\")\n values = self.values\n values = values.sp_index.to_int_index().reindex(\n values.sp_values.astype('float64'), values.fill_value, new_index)\n values = SparseArray(values,\n sparse_index=new_index,\n fill_value=self.values.fill_value)\n return self._constructor(values, index=self.index).__finalize__(self)"} {"instruction": "Make a summary of the following Python 3 code\ndef combine_first(self, other):\n if isinstance(other, SparseSeries):\n other = other.to_dense()\n\n dense_combined = self.to_dense().combine_first(other)\n return dense_combined.to_sparse(fill_value=self.fill_value)", "response": "Combine two Series values choosing the calling Series s values\n first."} {"instruction": "Write a Python 3 function that can\ncreate a cache of unique dates from an array of dates.", "response": "def _maybe_cache(arg, format, cache, convert_listlike):\n \"\"\"\n Create a cache of unique dates from an array of dates\n\n Parameters\n ----------\n arg : integer, float, string, datetime, list, tuple, 1-d array, Series\n format : string\n Strftime format to parse time\n cache : boolean\n True attempts to create a cache of converted values\n convert_listlike : function\n Conversion function 
to apply on dates\n\n Returns\n -------\n cache_array : Series\n Cache of converted, unique dates. Can be empty\n \"\"\"\n from pandas import Series\n cache_array = Series()\n if cache:\n # Perform a quicker unique check\n from pandas import Index\n unique_dates = Index(arg).unique()\n if len(unique_dates) < len(arg):\n cache_dates = convert_listlike(unique_dates.to_numpy(),\n True, format)\n cache_array = Series(cache_dates, index=unique_dates)\n return cache_array"} {"instruction": "How would you implement a function in Python 3 that\nconverts array of dates with a cache and box the result", "response": "def _convert_and_box_cache(arg, cache_array, box, errors, name=None):\n \"\"\"\n Convert array of dates with a cache and box the result\n\n Parameters\n ----------\n arg : integer, float, string, datetime, list, tuple, 1-d array, Series\n cache_array : Series\n Cache of converted, unique dates\n box : boolean\n True boxes result as an Index-like, False returns an ndarray\n errors : string\n 'ignore' plus box=True will convert result to Index\n name : string, default None\n Name for a DatetimeIndex\n\n Returns\n -------\n result : datetime of converted dates\n Returns:\n\n - Index-like if box=True\n - ndarray if box=False\n \"\"\"\n from pandas import Series, DatetimeIndex, Index\n result = Series(arg).map(cache_array)\n if box:\n if errors == 'ignore':\n return Index(result, name=name)\n else:\n return DatetimeIndex(result, name=name)\n return result.values"} {"instruction": "How would you implement a function in Python 3 that\nreturns parsed timezone information from array_strptime.", "response": "def _return_parsed_timezone_results(result, timezones, box, tz, name):\n \"\"\"\n Return results from array_strptime if a %z or %Z directive was passed.\n\n Parameters\n ----------\n result : ndarray\n int64 date representations of the dates\n timezones : ndarray\n pytz timezone objects\n box : boolean\n True boxes result as an Index-like, False returns an ndarray\n tz 
: object\n None or pytz timezone object\n name : string, default None\n Name for a DatetimeIndex\n\n Returns\n -------\n tz_result : ndarray of parsed dates with timezone\n Returns:\n\n - Index-like if box=True\n - ndarray of Timestamps if box=False\n\n \"\"\"\n if tz is not None:\n raise ValueError(\"Cannot pass a tz argument when \"\n \"parsing strings with timezone \"\n \"information.\")\n tz_results = np.array([Timestamp(res).tz_localize(zone) for res, zone\n in zip(result, timezones)])\n if box:\n from pandas import Index\n return Index(tz_results, name=name)\n return tz_results"} {"instruction": "Make a summary of the following Python 3 code\ndef _convert_listlike_datetimes(arg, box, format, name=None, tz=None,\n unit=None, errors=None,\n infer_datetime_format=None, dayfirst=None,\n yearfirst=None, exact=None):\n \"\"\"\n Helper function for to_datetime. Performs the conversions of 1D listlike\n of dates\n\n Parameters\n ----------\n arg : list, tuple, ndarray, Series, Index\n date to be parced\n box : boolean\n True boxes result as an Index-like, False returns an ndarray\n name : object\n None or string for the Index name\n tz : object\n None or 'utc'\n unit : string\n None or string of the frequency of the passed data\n errors : string\n error handing behaviors from to_datetime, 'raise', 'coerce', 'ignore'\n infer_datetime_format : boolean\n inferring format behavior from to_datetime\n dayfirst : boolean\n dayfirst parsing behavior from to_datetime\n yearfirst : boolean\n yearfirst parsing behavior from to_datetime\n exact : boolean\n exact format matching behavior from to_datetime\n\n Returns\n -------\n ndarray of parsed dates\n Returns:\n\n - Index-like if box=True\n - ndarray of Timestamps if box=False\n \"\"\"\n from pandas import DatetimeIndex\n from pandas.core.arrays import DatetimeArray\n from pandas.core.arrays.datetimes import (\n maybe_convert_dtype, objects_to_datetime64ns)\n\n if isinstance(arg, (list, tuple)):\n arg = np.array(arg, 
dtype='O')\n\n # these are shortcutable\n if is_datetime64tz_dtype(arg):\n if not isinstance(arg, (DatetimeArray, DatetimeIndex)):\n return DatetimeIndex(arg, tz=tz, name=name)\n if tz == 'utc':\n arg = arg.tz_convert(None).tz_localize(tz)\n return arg\n\n elif is_datetime64_ns_dtype(arg):\n if box and not isinstance(arg, (DatetimeArray, DatetimeIndex)):\n try:\n return DatetimeIndex(arg, tz=tz, name=name)\n except ValueError:\n pass\n\n return arg\n\n elif unit is not None:\n if format is not None:\n raise ValueError(\"cannot specify both format and unit\")\n arg = getattr(arg, 'values', arg)\n result = tslib.array_with_unit_to_datetime(arg, unit,\n errors=errors)\n if box:\n if errors == 'ignore':\n from pandas import Index\n result = Index(result, name=name)\n # GH 23758: We may still need to localize the result with tz\n try:\n return result.tz_localize(tz)\n except AttributeError:\n return result\n\n return DatetimeIndex(result, tz=tz, name=name)\n return result\n elif getattr(arg, 'ndim', 1) > 1:\n raise TypeError('arg must be a string, datetime, list, tuple, '\n '1-d array, or Series')\n\n # warn if passing timedelta64, raise for PeriodDtype\n # NB: this must come after unit transformation\n orig_arg = arg\n arg, _ = maybe_convert_dtype(arg, copy=False)\n\n arg = ensure_object(arg)\n require_iso8601 = False\n\n if infer_datetime_format and format is None:\n format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)\n\n if format is not None:\n # There is a special fast-path for iso8601 formatted\n # datetime strings, so in those cases don't use the inferred\n # format because this path makes process slower in this\n # special case\n format_is_iso8601 = _format_is_iso(format)\n if format_is_iso8601:\n require_iso8601 = not infer_datetime_format\n format = None\n\n tz_parsed = None\n result = None\n\n if format is not None:\n try:\n # shortcut formatting here\n if format == '%Y%m%d':\n try:\n # pass orig_arg as float-dtype may have been converted to\n # 
datetime64[ns]\n orig_arg = ensure_object(orig_arg)\n result = _attempt_YYYYMMDD(orig_arg, errors=errors)\n except (ValueError, TypeError, tslibs.OutOfBoundsDatetime):\n raise ValueError(\"cannot convert the input to \"\n \"'%Y%m%d' date format\")\n\n # fallback\n if result is None:\n try:\n result, timezones = array_strptime(\n arg, format, exact=exact, errors=errors)\n if '%Z' in format or '%z' in format:\n return _return_parsed_timezone_results(\n result, timezones, box, tz, name)\n except tslibs.OutOfBoundsDatetime:\n if errors == 'raise':\n raise\n elif errors == 'coerce':\n result = np.empty(arg.shape, dtype='M8[ns]')\n iresult = result.view('i8')\n iresult.fill(tslibs.iNaT)\n else:\n result = arg\n except ValueError:\n # if format was inferred, try falling back\n # to array_to_datetime - terminate here\n # for specified formats\n if not infer_datetime_format:\n if errors == 'raise':\n raise\n elif errors == 'coerce':\n result = np.empty(arg.shape, dtype='M8[ns]')\n iresult = result.view('i8')\n iresult.fill(tslibs.iNaT)\n else:\n result = arg\n except ValueError as e:\n # Fallback to try to convert datetime objects if timezone-aware\n # datetime objects are found without passing `utc=True`\n try:\n values, tz = conversion.datetime_to_datetime64(arg)\n return DatetimeIndex._simple_new(values, name=name, tz=tz)\n except (ValueError, TypeError):\n raise e\n\n if result is None:\n assert format is None or infer_datetime_format\n utc = tz == 'utc'\n result, tz_parsed = objects_to_datetime64ns(\n arg, dayfirst=dayfirst, yearfirst=yearfirst,\n utc=utc, errors=errors, require_iso8601=require_iso8601,\n allow_object=True)\n\n if tz_parsed is not None:\n if box:\n # We can take a shortcut since the datetime64 numpy array\n # is in UTC\n return DatetimeIndex._simple_new(result, name=name,\n tz=tz_parsed)\n else:\n # Convert the datetime64 numpy array to an numpy array\n # of datetime objects\n result = [Timestamp(ts, tz=tz_parsed).to_pydatetime()\n for ts in result]\n 
return np.array(result, dtype=object)\n\n if box:\n # Ensure we return an Index in all cases where box=True\n if is_datetime64_dtype(result):\n return DatetimeIndex(result, tz=tz, name=name)\n elif is_object_dtype(result):\n # e.g. an Index of datetime objects\n from pandas import Index\n return Index(result, name=name)\n return result", "response": "Helper function for to_datetime."} {"instruction": "Given the following Python 3 function, write the documentation\ndef _adjust_to_origin(arg, origin, unit):\n if origin == 'julian':\n original = arg\n j0 = Timestamp(0).to_julian_date()\n if unit != 'D':\n raise ValueError(\"unit must be 'D' for origin='julian'\")\n try:\n arg = arg - j0\n except TypeError:\n raise ValueError(\"incompatible 'arg' type for given \"\n \"'origin'='julian'\")\n\n # premptively check this for a nice range\n j_max = Timestamp.max.to_julian_date() - j0\n j_min = Timestamp.min.to_julian_date() - j0\n if np.any(arg > j_max) or np.any(arg < j_min):\n raise tslibs.OutOfBoundsDatetime(\n \"{original} is Out of Bounds for \"\n \"origin='julian'\".format(original=original))\n else:\n # arg must be numeric\n if not ((is_scalar(arg) and (is_integer(arg) or is_float(arg))) or\n is_numeric_dtype(np.asarray(arg))):\n raise ValueError(\n \"'{arg}' is not compatible with origin='{origin}'; \"\n \"it must be numeric with a unit specified \".format(\n arg=arg,\n origin=origin))\n\n # we are going to offset back to unix / epoch time\n try:\n offset = Timestamp(origin)\n except tslibs.OutOfBoundsDatetime:\n raise tslibs.OutOfBoundsDatetime(\n \"origin {origin} is Out of Bounds\".format(origin=origin))\n except ValueError:\n raise ValueError(\"origin {origin} cannot be converted \"\n \"to a Timestamp\".format(origin=origin))\n\n if offset.tz is not None:\n raise ValueError(\n \"origin offset {} must be tz-naive\".format(offset))\n offset -= Timestamp(0)\n\n # convert the offset to the unit of the arg\n # this should be lossless in terms of precision\n offset = 
offset // tslibs.Timedelta(1, unit=unit)\n\n # scalars & ndarray-like can handle the addition\n if is_list_like(arg) and not isinstance(\n arg, (ABCSeries, ABCIndexClass, np.ndarray)):\n arg = np.asarray(arg)\n arg = arg + offset\n return arg", "response": "Adjusts the input argument to the specified origin"} {"instruction": "Implement a function in Python 3 to\nconvert argument to datetime. datetime.", "response": "def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,\n utc=None, box=True, format=None, exact=True,\n unit=None, infer_datetime_format=False, origin='unix',\n cache=False):\n \"\"\"\n Convert argument to datetime.\n\n Parameters\n ----------\n arg : integer, float, string, datetime, list, tuple, 1-d array, Series\n\n .. versionadded:: 0.18.1\n\n or DataFrame/dict-like\n\n errors : {'ignore', 'raise', 'coerce'}, default 'raise'\n\n - If 'raise', then invalid parsing will raise an exception\n - If 'coerce', then invalid parsing will be set as NaT\n - If 'ignore', then invalid parsing will return the input\n dayfirst : boolean, default False\n Specify a date parse order if `arg` is str or its list-likes.\n If True, parses dates with the day first, eg 10/11/12 is parsed as\n 2012-11-10.\n Warning: dayfirst=True is not strict, but will prefer to parse\n with day first (this is a known bug, based on dateutil behavior).\n yearfirst : boolean, default False\n Specify a date parse order if `arg` is str or its list-likes.\n\n - If True parses dates with the year first, eg 10/11/12 is parsed as\n 2010-11-12.\n - If both dayfirst and yearfirst are True, yearfirst is preceded (same\n as dateutil).\n\n Warning: yearfirst=True is not strict, but will prefer to parse\n with year first (this is a known bug, based on dateutil behavior).\n\n .. 
versionadded:: 0.16.1\n\n utc : boolean, default None\n Return UTC DatetimeIndex if True (converting any tz-aware\n datetime.datetime objects as well).\n box : boolean, default True\n\n - If True returns a DatetimeIndex or Index-like object\n - If False returns ndarray of values.\n\n .. deprecated:: 0.25.0\n Use :meth:`.to_numpy` or :meth:`Timestamp.to_datetime64`\n instead to get an ndarray of values or numpy.datetime64,\n respectively.\n\n format : string, default None\n strftime to parse time, eg \"%d/%m/%Y\", note that \"%f\" will parse\n all the way up to nanoseconds.\n See strftime documentation for more information on choices:\n https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior\n exact : boolean, True by default\n\n - If True, require an exact format match.\n - If False, allow the format to match anywhere in the target string.\n\n unit : string, default 'ns'\n unit of the arg (D,s,ms,us,ns) denote the unit, which is an\n integer or float number. This will be based off the origin.\n Example, with unit='ms' and origin='unix' (the default), this\n would calculate the number of milliseconds to the unix epoch start.\n infer_datetime_format : boolean, default False\n If True and no `format` is given, attempt to infer the format of the\n datetime strings, and if it can be inferred, switch to a faster\n method of parsing them. In some cases this can increase the parsing\n speed by ~5-10x.\n origin : scalar, default is 'unix'\n Define the reference date. The numeric values would be parsed as number\n of units (defined by `unit`) since this reference date.\n\n - If 'unix' (or POSIX) time; origin is set to 1970-01-01.\n - If 'julian', unit must be 'D', and origin is set to beginning of\n Julian Calendar. Julian day number 0 is assigned to the day starting\n at noon on January 1, 4713 BC.\n - If Timestamp convertible, origin is set to Timestamp identified by\n origin.\n\n .. 
versionadded:: 0.20.0\n cache : boolean, default False\n If True, use a cache of unique, converted dates to apply the datetime\n conversion. May produce significant speed-up when parsing duplicate\n date strings, especially ones with timezone offsets.\n\n .. versionadded:: 0.23.0\n\n Returns\n -------\n ret : datetime if parsing succeeded.\n Return type depends on input:\n\n - list-like: DatetimeIndex\n - Series: Series of datetime64 dtype\n - scalar: Timestamp\n\n In case when it is not possible to return designated types (e.g. when\n any element of input is before Timestamp.min or after Timestamp.max)\n return will have datetime.datetime type (or corresponding\n array/Series).\n\n See Also\n --------\n DataFrame.astype : Cast argument to a specified dtype.\n to_timedelta : Convert argument to timedelta.\n\n Examples\n --------\n Assembling a datetime from multiple columns of a DataFrame. The keys can be\n common abbreviations like ['year', 'month', 'day', 'minute', 'second',\n 'ms', 'us', 'ns']) or plurals of the same\n\n >>> df = pd.DataFrame({'year': [2015, 2016],\n 'month': [2, 3],\n 'day': [4, 5]})\n >>> pd.to_datetime(df)\n 0 2015-02-04\n 1 2016-03-05\n dtype: datetime64[ns]\n\n If a date does not meet the `timestamp limitations\n `_, passing errors='ignore'\n will return the original input instead of raising any exception.\n\n Passing errors='coerce' will force an out-of-bounds date to NaT,\n in addition to forcing non-dates (or non-parseable dates) to NaT.\n\n >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore')\n datetime.datetime(1300, 1, 1, 0, 0)\n >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce')\n NaT\n\n Passing infer_datetime_format=True can often-times speedup a parsing\n if its not an ISO8601 format exactly, but in a regular format.\n\n >>> s = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000']*1000)\n\n >>> s.head()\n 0 3/11/2000\n 1 3/12/2000\n 2 3/13/2000\n 3 3/11/2000\n 4 3/12/2000\n dtype: object\n\n >>> %timeit 
pd.to_datetime(s,infer_datetime_format=True)\n 100 loops, best of 3: 10.4 ms per loop\n\n >>> %timeit pd.to_datetime(s,infer_datetime_format=False)\n 1 loop, best of 3: 471 ms per loop\n\n Using a unix epoch time\n\n >>> pd.to_datetime(1490195805, unit='s')\n Timestamp('2017-03-22 15:16:45')\n >>> pd.to_datetime(1490195805433502912, unit='ns')\n Timestamp('2017-03-22 15:16:45.433502912')\n\n .. warning:: For float arg, precision rounding might happen. To prevent\n unexpected behavior use a fixed-width exact type.\n\n Using a non-unix epoch origin\n\n >>> pd.to_datetime([1, 2, 3], unit='D',\n origin=pd.Timestamp('1960-01-01'))\n 0 1960-01-02\n 1 1960-01-03\n 2 1960-01-04\n \"\"\"\n if arg is None:\n return None\n\n if origin != 'unix':\n arg = _adjust_to_origin(arg, origin, unit)\n\n tz = 'utc' if utc else None\n convert_listlike = partial(_convert_listlike_datetimes, tz=tz, unit=unit,\n dayfirst=dayfirst, yearfirst=yearfirst,\n errors=errors, exact=exact,\n infer_datetime_format=infer_datetime_format)\n\n if isinstance(arg, Timestamp):\n result = arg\n if tz is not None:\n if arg.tz is not None:\n result = result.tz_convert(tz)\n else:\n result = result.tz_localize(tz)\n elif isinstance(arg, ABCSeries):\n cache_array = _maybe_cache(arg, format, cache, convert_listlike)\n if not cache_array.empty:\n result = arg.map(cache_array)\n else:\n values = convert_listlike(arg._values, True, format)\n result = arg._constructor(values, index=arg.index, name=arg.name)\n elif isinstance(arg, (ABCDataFrame, abc.MutableMapping)):\n result = _assemble_from_unit_mappings(arg, errors, box, tz)\n elif isinstance(arg, ABCIndexClass):\n cache_array = _maybe_cache(arg, format, cache, convert_listlike)\n if not cache_array.empty:\n result = _convert_and_box_cache(arg, cache_array, box, errors,\n name=arg.name)\n else:\n convert_listlike = partial(convert_listlike, name=arg.name)\n result = convert_listlike(arg, box, format)\n elif is_list_like(arg):\n cache_array = _maybe_cache(arg, 
format, cache, convert_listlike)\n if not cache_array.empty:\n result = _convert_and_box_cache(arg, cache_array, box, errors)\n else:\n result = convert_listlike(arg, box, format)\n else:\n result = convert_listlike(np.array([arg]), box, format)[0]\n\n return result"} {"instruction": "How would you implement a function in Python 3 that\nassembles the unit specified fields from the arg.", "response": "def _assemble_from_unit_mappings(arg, errors, box, tz):\n \"\"\"\n assemble the unit specified fields from the arg (DataFrame)\n Return a Series for actual parsing\n\n Parameters\n ----------\n arg : DataFrame\n errors : {'ignore', 'raise', 'coerce'}, default 'raise'\n\n - If 'raise', then invalid parsing will raise an exception\n - If 'coerce', then invalid parsing will be set as NaT\n - If 'ignore', then invalid parsing will return the input\n box : boolean\n\n - If True, return a DatetimeIndex\n - If False, return an array\n tz : None or 'utc'\n\n Returns\n -------\n Series\n \"\"\"\n from pandas import to_timedelta, to_numeric, DataFrame\n arg = DataFrame(arg)\n if not arg.columns.is_unique:\n raise ValueError(\"cannot assemble with duplicate keys\")\n\n # replace passed unit with _unit_map\n def f(value):\n if value in _unit_map:\n return _unit_map[value]\n\n # m is case significant\n if value.lower() in _unit_map:\n return _unit_map[value.lower()]\n\n return value\n\n unit = {k: f(k) for k in arg.keys()}\n unit_rev = {v: k for k, v in unit.items()}\n\n # we require at least Ymd\n required = ['year', 'month', 'day']\n req = sorted(list(set(required) - set(unit_rev.keys())))\n if len(req):\n raise ValueError(\"to assemble mappings requires at least that \"\n \"[year, month, day] be specified: [{required}] \"\n \"is missing\".format(required=','.join(req)))\n\n # keys we don't recognize\n excess = sorted(list(set(unit_rev.keys()) - set(_unit_map.values())))\n if len(excess):\n raise ValueError(\"extra keys have been passed \"\n \"to the datetime assemblage: \"\n 
\"[{excess}]\".format(excess=','.join(excess)))\n\n def coerce(values):\n # we allow coercion to if errors allows\n values = to_numeric(values, errors=errors)\n\n # prevent overflow in case of int8 or int16\n if is_integer_dtype(values):\n values = values.astype('int64', copy=False)\n return values\n\n values = (coerce(arg[unit_rev['year']]) * 10000 +\n coerce(arg[unit_rev['month']]) * 100 +\n coerce(arg[unit_rev['day']]))\n try:\n values = to_datetime(values, format='%Y%m%d', errors=errors, utc=tz)\n except (TypeError, ValueError) as e:\n raise ValueError(\"cannot assemble the \"\n \"datetimes: {error}\".format(error=e))\n\n for u in ['h', 'm', 's', 'ms', 'us', 'ns']:\n value = unit_rev.get(u)\n if value is not None and value in arg:\n try:\n values += to_timedelta(coerce(arg[value]),\n unit=u,\n errors=errors)\n except (TypeError, ValueError) as e:\n raise ValueError(\"cannot assemble the datetimes [{value}]: \"\n \"{error}\".format(value=value, error=e))\n if not box:\n return values.values\n return values"} {"instruction": "Explain what the following Python 3 code does\ndef _attempt_YYYYMMDD(arg, errors):\n\n def calc(carg):\n # calculate the actual result\n carg = carg.astype(object)\n parsed = parsing.try_parse_year_month_day(carg / 10000,\n carg / 100 % 100,\n carg % 100)\n return tslib.array_to_datetime(parsed, errors=errors)[0]\n\n def calc_with_mask(carg, mask):\n result = np.empty(carg.shape, dtype='M8[ns]')\n iresult = result.view('i8')\n iresult[~mask] = tslibs.iNaT\n\n masked_result = calc(carg[mask].astype(np.float64).astype(np.int64))\n result[mask] = masked_result.astype('M8[ns]')\n return result\n\n # try intlike / strings that are ints\n try:\n return calc(arg.astype(np.int64))\n except ValueError:\n pass\n\n # a float with actual np.nan\n try:\n carg = arg.astype(np.float64)\n return calc_with_mask(carg, notna(carg))\n except ValueError:\n pass\n\n # string with NaN-like\n try:\n mask = ~algorithms.isin(arg, list(tslib.nat_strings))\n return 
calc_with_mask(arg, mask)\n except ValueError:\n pass\n\n return None", "response": "Attempts to parse the YYYYMMDD format and returns the object that represents the object."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef to_time(arg, format=None, infer_time_format=False, errors='raise'):\n\n def _convert_listlike(arg, format):\n\n if isinstance(arg, (list, tuple)):\n arg = np.array(arg, dtype='O')\n\n elif getattr(arg, 'ndim', 1) > 1:\n raise TypeError('arg must be a string, datetime, list, tuple, '\n '1-d array, or Series')\n\n arg = ensure_object(arg)\n\n if infer_time_format and format is None:\n format = _guess_time_format_for_array(arg)\n\n times = []\n if format is not None:\n for element in arg:\n try:\n times.append(datetime.strptime(element, format).time())\n except (ValueError, TypeError):\n if errors == 'raise':\n msg = (\"Cannot convert {element} to a time with given \"\n \"format {format}\").format(element=element,\n format=format)\n raise ValueError(msg)\n elif errors == 'ignore':\n return arg\n else:\n times.append(None)\n else:\n formats = _time_formats[:]\n format_found = False\n for element in arg:\n time_object = None\n for time_format in formats:\n try:\n time_object = datetime.strptime(element,\n time_format).time()\n if not format_found:\n # Put the found format in front\n fmt = formats.pop(formats.index(time_format))\n formats.insert(0, fmt)\n format_found = True\n break\n except (ValueError, TypeError):\n continue\n\n if time_object is not None:\n times.append(time_object)\n elif errors == 'raise':\n raise ValueError(\"Cannot convert arg {arg} to \"\n \"a time\".format(arg=arg))\n elif errors == 'ignore':\n return arg\n else:\n times.append(None)\n\n return times\n\n if arg is None:\n return arg\n elif isinstance(arg, time):\n return arg\n elif isinstance(arg, ABCSeries):\n values = _convert_listlike(arg._values, format)\n return arg._constructor(values, index=arg.index, name=arg.name)\n elif 
isinstance(arg, ABCIndexClass):\n return _convert_listlike(arg, format)\n elif is_list_like(arg):\n return _convert_listlike(arg, format)\n\n return _convert_listlike(np.array([arg]), format)[0]", "response": "Convert a string in time format to a time object."} {"instruction": "Given the following Python 3 function, write the documentation\ndef deprecate(name, alternative, version, alt_name=None,\n klass=None, stacklevel=2, msg=None):\n \"\"\"\n Return a new function that emits a deprecation warning on use.\n\n To use this method for a deprecated function, another function\n `alternative` with the same signature must exist. The deprecated\n function will emit a deprecation warning, and in the docstring\n it will contain the deprecation directive with the provided version\n so it can be detected for future removal.\n\n Parameters\n ----------\n name : str\n Name of function to deprecate.\n alternative : func\n Function to use instead.\n version : str\n Version of pandas in which the method has been deprecated.\n alt_name : str, optional\n Name to use in preference of alternative.__name__.\n klass : Warning, default FutureWarning\n stacklevel : int, default 2\n msg : str\n The message to display in the warning.\n Default is '{name} is deprecated. Use {alt_name} instead.'\n \"\"\"\n\n alt_name = alt_name or alternative.__name__\n klass = klass or FutureWarning\n warning_msg = msg or '{} is deprecated, use {} instead'.format(name,\n alt_name)\n\n @wraps(alternative)\n def wrapper(*args, **kwargs):\n warnings.warn(warning_msg, klass, stacklevel=stacklevel)\n return alternative(*args, **kwargs)\n\n # adding deprecated directive to the docstring\n msg = msg or 'Use `{alt_name}` instead.'.format(alt_name=alt_name)\n doc_error_msg = ('deprecate needs a correctly formatted docstring in '\n 'the target function (should have a one liner short '\n 'summary, and opening quotes should be in their own '\n 'line). 
Found:\\n{}'.format(alternative.__doc__))\n\n # when python is running in optimized mode (i.e. `-OO`), docstrings are\n # removed, so we check that a docstring with correct formatting is used\n # but we allow empty docstrings\n if alternative.__doc__:\n if alternative.__doc__.count('\\n') < 3:\n raise AssertionError(doc_error_msg)\n empty1, summary, empty2, doc = alternative.__doc__.split('\\n', 3)\n if empty1 or empty2 and not summary:\n raise AssertionError(doc_error_msg)\n wrapper.__doc__ = dedent(\"\"\"\n {summary}\n\n .. deprecated:: {depr_version}\n {depr_msg}\n\n {rest_of_docstring}\"\"\").format(summary=summary.strip(),\n depr_version=version,\n depr_msg=msg,\n rest_of_docstring=dedent(doc))\n\n return wrapper", "response": "Deprecate a function or function."} {"instruction": "Make a summary of the following Python 3 code\ndef deprecate_kwarg(old_arg_name, new_arg_name, mapping=None, stacklevel=2):\n\n if mapping is not None and not hasattr(mapping, 'get') and \\\n not callable(mapping):\n raise TypeError(\"mapping from old to new argument values \"\n \"must be dict or callable!\")\n\n def _deprecate_kwarg(func):\n @wraps(func)\n def wrapper(*args, **kwargs):\n old_arg_value = kwargs.pop(old_arg_name, None)\n\n if new_arg_name is None and old_arg_value is not None:\n msg = (\n \"the '{old_name}' keyword is deprecated and will be \"\n \"removed in a future version. 
\"\n \"Please take steps to stop the use of '{old_name}'\"\n ).format(old_name=old_arg_name)\n warnings.warn(msg, FutureWarning, stacklevel=stacklevel)\n kwargs[old_arg_name] = old_arg_value\n return func(*args, **kwargs)\n\n if old_arg_value is not None:\n if mapping is not None:\n if hasattr(mapping, 'get'):\n new_arg_value = mapping.get(old_arg_value,\n old_arg_value)\n else:\n new_arg_value = mapping(old_arg_value)\n msg = (\"the {old_name}={old_val!r} keyword is deprecated, \"\n \"use {new_name}={new_val!r} instead\"\n ).format(old_name=old_arg_name,\n old_val=old_arg_value,\n new_name=new_arg_name,\n new_val=new_arg_value)\n else:\n new_arg_value = old_arg_value\n msg = (\"the '{old_name}' keyword is deprecated, \"\n \"use '{new_name}' instead\"\n ).format(old_name=old_arg_name,\n new_name=new_arg_name)\n\n warnings.warn(msg, FutureWarning, stacklevel=stacklevel)\n if kwargs.get(new_arg_name, None) is not None:\n msg = (\"Can only specify '{old_name}' or '{new_name}', \"\n \"not both\").format(old_name=old_arg_name,\n new_name=new_arg_name)\n raise TypeError(msg)\n else:\n kwargs[new_arg_name] = new_arg_value\n return func(*args, **kwargs)\n return wrapper\n return _deprecate_kwarg", "response": "Deprecate a keyword argument of a function."} {"instruction": "Make a summary of the following Python 3 code\ndef make_signature(func):\n\n spec = inspect.getfullargspec(func)\n if spec.defaults is None:\n n_wo_defaults = len(spec.args)\n defaults = ('',) * n_wo_defaults\n else:\n n_wo_defaults = len(spec.args) - len(spec.defaults)\n defaults = ('',) * n_wo_defaults + tuple(spec.defaults)\n args = []\n for var, default in zip(spec.args, defaults):\n args.append(var if default == '' else var + '=' + repr(default))\n if spec.varargs:\n args.append('*' + spec.varargs)\n if spec.varkw:\n args.append('**' + spec.varkw)\n return args, spec.args", "response": "Returns a tuple containing the paramenter list with defaults\n and parameter list."} {"instruction": "Implement a 
function in Python 3 to\nreturn a fixed frequency PeriodIndex with day ( calendar ) as the default frequency for the given start and end dates.", "response": "def period_range(start=None, end=None, periods=None, freq=None, name=None):\n \"\"\"\n Return a fixed frequency PeriodIndex, with day (calendar) as the default\n frequency\n\n Parameters\n ----------\n start : string or period-like, default None\n Left bound for generating periods\n end : string or period-like, default None\n Right bound for generating periods\n periods : integer, default None\n Number of periods to generate\n freq : string or DateOffset, optional\n Frequency alias. By default the freq is taken from `start` or `end`\n if those are Period objects. Otherwise, the default is ``\"D\"`` for\n daily frequency.\n\n name : string, default None\n Name of the resulting PeriodIndex\n\n Returns\n -------\n prng : PeriodIndex\n\n Notes\n -----\n Of the three parameters: ``start``, ``end``, and ``periods``, exactly two\n must be specified.\n\n To learn more about the frequency strings, please see `this link\n `__.\n\n Examples\n --------\n\n >>> pd.period_range(start='2017-01-01', end='2018-01-01', freq='M')\n PeriodIndex(['2017-01', '2017-02', '2017-03', '2017-04', '2017-05',\n '2017-06', '2017-06', '2017-07', '2017-08', '2017-09',\n '2017-10', '2017-11', '2017-12', '2018-01'],\n dtype='period[M]', freq='M')\n\n If ``start`` or ``end`` are ``Period`` objects, they will be used as anchor\n endpoints for a ``PeriodIndex`` with frequency matching that of the\n ``period_range`` constructor.\n\n >>> pd.period_range(start=pd.Period('2017Q1', freq='Q'),\n ... 
end=pd.Period('2017Q2', freq='Q'), freq='M')\n PeriodIndex(['2017-03', '2017-04', '2017-05', '2017-06'],\n dtype='period[M]', freq='M')\n \"\"\"\n if com.count_not_none(start, end, periods) != 2:\n raise ValueError('Of the three parameters: start, end, and periods, '\n 'exactly two must be specified')\n if freq is None and (not isinstance(start, Period)\n and not isinstance(end, Period)):\n freq = 'D'\n\n data, freq = PeriodArray._generate_range(start, end, periods, freq,\n fields={})\n data = PeriodArray(data, freq=freq)\n return PeriodIndex(data, name=name)"} {"instruction": "How would you implement a function in Python 3 that\ncreates a RangeIndex from a range object.", "response": "def from_range(cls, data, name=None, dtype=None, **kwargs):\n \"\"\" Create RangeIndex from a range object. \"\"\"\n if not isinstance(data, range):\n raise TypeError(\n '{0}(...) must be called with object coercible to a '\n 'range, {1} was passed'.format(cls.__name__, repr(data)))\n\n start, stop, step = data.start, data.stop, data.step\n return RangeIndex(start, stop, step, dtype=dtype, name=name, **kwargs)"} {"instruction": "How would you explain what the following Python 3 function does\ndef _format_attrs(self):\n attrs = self._get_data_as_items()\n if self.name is not None:\n attrs.append(('name', ibase.default_pprint(self.name)))\n return attrs", "response": "Return a list of tuples of the attr and formatted_value"} {"instruction": "Make a summary of the following Python 3 code\ndef max(self, axis=None, skipna=True):\n nv.validate_minmax_axis(axis)\n return self._minmax('max')", "response": "The maximum value of the RangeIndex"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef argsort(self, *args, **kwargs):\n nv.validate_argsort(args, kwargs)\n\n if self._step > 0:\n return np.arange(len(self))\n else:\n return np.arange(len(self) - 1, -1, -1)", "response": "Returns the indices that would sort the index and its\n underlying data."} 
{"instruction": "Explain what the following Python 3 code does\ndef equals(self, other):\n if isinstance(other, RangeIndex):\n ls = len(self)\n lo = len(other)\n return (ls == lo == 0 or\n ls == lo == 1 and\n self._start == other._start or\n ls == lo and\n self._start == other._start and\n self._step == other._step)\n\n return super().equals(other)", "response": "Determines if two Index objects contain the same elements."} {"instruction": "How would you code a function in Python 3 to\nreturn the intersection of two Index objects.", "response": "def intersection(self, other, sort=False):\n \"\"\"\n Form the intersection of two Index objects.\n\n Parameters\n ----------\n other : Index or array-like\n sort : False or None, default False\n Sort the resulting index if possible\n\n .. versionadded:: 0.24.0\n\n .. versionchanged:: 0.24.1\n\n Changed the default to ``False`` to match the behaviour\n from before 0.24.0.\n\n Returns\n -------\n intersection : Index\n \"\"\"\n self._validate_sort_keyword(sort)\n\n if self.equals(other):\n return self._get_reconciled_name_object(other)\n\n if not isinstance(other, RangeIndex):\n return super().intersection(other, sort=sort)\n\n if not len(self) or not len(other):\n return RangeIndex._simple_new(None)\n\n first = self[::-1] if self._step < 0 else self\n second = other[::-1] if other._step < 0 else other\n\n # check whether intervals intersect\n # deals with in- and decreasing ranges\n int_low = max(first._start, second._start)\n int_high = min(first._stop, second._stop)\n if int_high <= int_low:\n return RangeIndex._simple_new(None)\n\n # Method hint: linear Diophantine equation\n # solve intersection problem\n # performance hint: for identical step sizes, could use\n # cheaper alternative\n gcd, s, t = first._extended_gcd(first._step, second._step)\n\n # check whether element sets intersect\n if (first._start - second._start) % gcd:\n return RangeIndex._simple_new(None)\n\n # calculate parameters for the RangeIndex describing 
the\n # intersection disregarding the lower bounds\n tmp_start = first._start + (second._start - first._start) * \\\n first._step // gcd * s\n new_step = first._step * second._step // gcd\n new_index = RangeIndex._simple_new(tmp_start, int_high, new_step)\n\n # adjust index to limiting interval\n new_index._start = new_index._min_fitting_element(int_low)\n\n if (self._step < 0 and other._step < 0) is not (new_index._step < 0):\n new_index = new_index[::-1]\n if sort is None:\n new_index = new_index.sort_values()\n return new_index"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _min_fitting_element(self, lower_limit):\n no_steps = -(-(lower_limit - self._start) // abs(self._step))\n return self._start + abs(self._step) * no_steps", "response": "Returns the smallest element greater than or equal to the limit"} {"instruction": "How would you explain what the following Python 3 function does\ndef _max_fitting_element(self, upper_limit):\n no_steps = (upper_limit - self._start) // abs(self._step)\n return self._start + abs(self._step) * no_steps", "response": "Returns the largest element smaller than or equal to the limit"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _extended_gcd(self, a, b):\n s, old_s = 0, 1\n t, old_t = 1, 0\n r, old_r = b, a\n while r:\n quotient = old_r // r\n old_r, r = r, old_r - quotient * r\n old_s, s = s, old_s - quotient * s\n old_t, t = t, old_t - quotient * t\n return old_r, old_s, old_t", "response": "This function is used to solve the extended Euclidean distance algorithm to solve Bezout s identity."} {"instruction": "Make a summary of the following Python 3 code\ndef union(self, other, sort=None):\n self._assert_can_do_setop(other)\n if len(other) == 0 or self.equals(other) or len(self) == 0:\n return super().union(other, sort=sort)\n\n if isinstance(other, RangeIndex) and sort is None:\n start_s, step_s = self._start, self._step\n end_s = self._start + 
self._step * (len(self) - 1)\n start_o, step_o = other._start, other._step\n end_o = other._start + other._step * (len(other) - 1)\n if self._step < 0:\n start_s, step_s, end_s = end_s, -step_s, start_s\n if other._step < 0:\n start_o, step_o, end_o = end_o, -step_o, start_o\n if len(self) == 1 and len(other) == 1:\n step_s = step_o = abs(self._start - other._start)\n elif len(self) == 1:\n step_s = step_o\n elif len(other) == 1:\n step_o = step_s\n start_r = min(start_s, start_o)\n end_r = max(end_s, end_o)\n if step_o == step_s:\n if ((start_s - start_o) % step_s == 0 and\n (start_s - end_o) <= step_s and\n (start_o - end_s) <= step_s):\n return RangeIndex(start_r, end_r + step_s, step_s)\n if ((step_s % 2 == 0) and\n (abs(start_s - start_o) <= step_s / 2) and\n (abs(end_s - end_o) <= step_s / 2)):\n return RangeIndex(start_r, end_r + step_s / 2, step_s / 2)\n elif step_o % step_s == 0:\n if ((start_o - start_s) % step_s == 0 and\n (start_o + step_s >= start_s) and\n (end_o - step_s <= end_s)):\n return RangeIndex(start_r, end_r + step_s, step_s)\n elif step_s % step_o == 0:\n if ((start_s - start_o) % step_o == 0 and\n (start_s + step_o >= start_o) and\n (end_s - step_o <= end_o)):\n return RangeIndex(start_r, end_r + step_o, step_o)\n\n return self._int64index.union(other, sort=sort)", "response": "Return the union of two Index objects and sorts if possible."} {"instruction": "Implement a Python 3 function for\nadding in numeric methods specialized to RangeIndex", "response": "def _add_numeric_methods_binary(cls):\n \"\"\" add in numeric methods, specialized to RangeIndex \"\"\"\n\n def _make_evaluate_binop(op, step=False):\n \"\"\"\n Parameters\n ----------\n op : callable that accepts 2 parms\n perform the binary op\n step : callable, optional, default to False\n op to apply to the step parm if not None\n if False, use the existing step\n \"\"\"\n\n def _evaluate_numeric_binop(self, other):\n if isinstance(other, (ABCSeries, ABCDataFrame)):\n return 
NotImplemented\n elif isinstance(other, ABCTimedeltaIndex):\n # Defer to TimedeltaIndex implementation\n return NotImplemented\n elif isinstance(other, (timedelta, np.timedelta64)):\n # GH#19333 is_integer evaluated True on timedelta64,\n # so we need to catch these explicitly\n return op(self._int64index, other)\n elif is_timedelta64_dtype(other):\n # Must be an np.ndarray; GH#22390\n return op(self._int64index, other)\n\n other = self._validate_for_numeric_binop(other, op)\n attrs = self._get_attributes_dict()\n attrs = self._maybe_update_attributes(attrs)\n\n left, right = self, other\n\n try:\n # apply if we have an override\n if step:\n with np.errstate(all='ignore'):\n rstep = step(left._step, right)\n\n # we don't have a representable op\n # so return a base index\n if not is_integer(rstep) or not rstep:\n raise ValueError\n\n else:\n rstep = left._step\n\n with np.errstate(all='ignore'):\n rstart = op(left._start, right)\n rstop = op(left._stop, right)\n\n result = RangeIndex(rstart,\n rstop,\n rstep,\n **attrs)\n\n # for compat with numpy / Int64Index\n # even if we can represent as a RangeIndex, return\n # as a Float64Index if we have float-like descriptors\n if not all(is_integer(x) for x in\n [rstart, rstop, rstep]):\n result = result.astype('float64')\n\n return result\n\n except (ValueError, TypeError, ZeroDivisionError):\n # Defer to Int64Index implementation\n return op(self._int64index, other)\n # TODO: Do attrs get handled reliably?\n\n name = '__{name}__'.format(name=op.__name__)\n return compat.set_function_name(_evaluate_numeric_binop, name, cls)\n\n cls.__add__ = _make_evaluate_binop(operator.add)\n cls.__radd__ = _make_evaluate_binop(ops.radd)\n cls.__sub__ = _make_evaluate_binop(operator.sub)\n cls.__rsub__ = _make_evaluate_binop(ops.rsub)\n cls.__mul__ = _make_evaluate_binop(operator.mul, step=operator.mul)\n cls.__rmul__ = _make_evaluate_binop(ops.rmul, step=ops.rmul)\n cls.__truediv__ = _make_evaluate_binop(operator.truediv,\n 
step=operator.truediv)\n cls.__rtruediv__ = _make_evaluate_binop(ops.rtruediv,\n step=ops.rtruediv)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef to_numpy(self, dtype=None, copy=False):\n result = np.asarray(self._ndarray, dtype=dtype)\n if copy and result is self._ndarray:\n result = result.copy()\n\n return result", "response": "Convert the PandasArray to a NumPy array."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef adjoin(space, *lists, **kwargs):\n strlen = kwargs.pop('strlen', len)\n justfunc = kwargs.pop('justfunc', justify)\n\n out_lines = []\n newLists = []\n lengths = [max(map(strlen, x)) + space for x in lists[:-1]]\n # not the last one\n lengths.append(max(map(len, lists[-1])))\n maxLen = max(map(len, lists))\n for i, lst in enumerate(lists):\n nl = justfunc(lst, lengths[i], mode='left')\n nl.extend([' ' * lengths[i]] * (maxLen - len(lst)))\n newLists.append(nl)\n toJoin = zip(*newLists)\n for lines in toJoin:\n out_lines.append(_join_unicode(lines))\n return _join_unicode(out_lines, sep='\\n')", "response": "Glues together two sets of strings using the amount of space requested."} {"instruction": "How would you implement a function in Python 3 that\nreturns a list of strings with the given length.", "response": "def justify(texts, max_len, mode='right'):\n \"\"\"\n Perform ljust, center, rjust against string or list-like\n \"\"\"\n if mode == 'left':\n return [x.ljust(max_len) for x in texts]\n elif mode == 'center':\n return [x.center(max_len) for x in texts]\n else:\n return [x.rjust(max_len) for x in texts]"} {"instruction": "How would you explain what the following Python 3 function does\ndef _pprint_seq(seq, _nest_lvl=0, max_seq_items=None, **kwds):\n if isinstance(seq, set):\n fmt = \"{{{body}}}\"\n else:\n fmt = \"[{body}]\" if hasattr(seq, '__setitem__') else \"({body})\"\n\n if max_seq_items is False:\n nitems = len(seq)\n else:\n nitems = 
max_seq_items or get_option(\"max_seq_items\") or len(seq)\n\n s = iter(seq)\n # handle sets, no slicing\n r = [pprint_thing(next(s),\n _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)\n for i in range(min(nitems, len(seq)))]\n body = \", \".join(r)\n\n if nitems < len(seq):\n body += \", ...\"\n elif isinstance(seq, tuple) and len(seq) == 1:\n body += ','\n\n return fmt.format(body=body)", "response": "internal. pprinter for iterables"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _pprint_dict(seq, _nest_lvl=0, max_seq_items=None, **kwds):\n fmt = \"{{{things}}}\"\n pairs = []\n\n pfmt = \"{key}: {val}\"\n\n if max_seq_items is False:\n nitems = len(seq)\n else:\n nitems = max_seq_items or get_option(\"max_seq_items\") or len(seq)\n\n for k, v in list(seq.items())[:nitems]:\n pairs.append(\n pfmt.format(\n key=pprint_thing(k, _nest_lvl + 1,\n max_seq_items=max_seq_items, **kwds),\n val=pprint_thing(v, _nest_lvl + 1,\n max_seq_items=max_seq_items, **kwds)))\n\n if nitems < len(seq):\n return fmt.format(things=\", \".join(pairs) + \", ...\")\n else:\n return fmt.format(things=\", \".join(pairs))", "response": "internal. pprinter for dicts."} {"instruction": "Given the following Python 3 function, write the documentation\ndef pprint_thing(thing, _nest_lvl=0, escape_chars=None, default_escapes=False,\n quote_strings=False, max_seq_items=None):\n \"\"\"\n This function is the sanctioned way of converting objects\n to a unicode representation.\n\n properly handles nested sequences containing unicode strings\n (unicode(object) does not)\n\n Parameters\n ----------\n thing : anything to be formatted\n _nest_lvl : internal use only. pprint_thing() is mutually-recursive\n with pprint_sequence, this argument is used to keep track of the\n current nesting level, and limit it.\n escape_chars : list or dict, optional\n Characters to escape. 
If a dict is passed the values are the\n replacements\n default_escapes : bool, default False\n Whether the input escape characters replaces or adds to the defaults\n max_seq_items : False, int, default None\n Pass thru to other pretty printers to limit sequence printing\n\n Returns\n -------\n result - unicode str\n\n \"\"\"\n\n def as_escaped_unicode(thing, escape_chars=escape_chars):\n # Unicode is fine, else we try to decode using utf-8 and 'replace'\n # if that's not it either, we have no way of knowing and the user\n # should deal with it himself.\n\n try:\n result = str(thing) # we should try this first\n except UnicodeDecodeError:\n # either utf-8 or we replace errors\n result = str(thing).decode('utf-8', \"replace\")\n\n translate = {'\\t': r'\\t', '\\n': r'\\n', '\\r': r'\\r', }\n if isinstance(escape_chars, dict):\n if default_escapes:\n translate.update(escape_chars)\n else:\n translate = escape_chars\n escape_chars = list(escape_chars.keys())\n else:\n escape_chars = escape_chars or tuple()\n for c in escape_chars:\n result = result.replace(c, translate[c])\n\n return str(result)\n\n if hasattr(thing, '__next__'):\n return str(thing)\n elif (isinstance(thing, dict) and\n _nest_lvl < get_option(\"display.pprint_nest_depth\")):\n result = _pprint_dict(thing, _nest_lvl, quote_strings=True,\n max_seq_items=max_seq_items)\n elif (is_sequence(thing) and\n _nest_lvl < get_option(\"display.pprint_nest_depth\")):\n result = _pprint_seq(thing, _nest_lvl, escape_chars=escape_chars,\n quote_strings=quote_strings,\n max_seq_items=max_seq_items)\n elif isinstance(thing, str) and quote_strings:\n result = \"'{thing}'\".format(thing=as_escaped_unicode(thing))\n else:\n result = as_escaped_unicode(thing)\n\n return str(result)", "response": "Pretty print a single object or nested sequence of objects."} {"instruction": "Can you create a Python 3 function that\nreturns the formatted obj as a unicode string", "response": "def format_object_summary(obj, formatter, 
is_justify=True, name=None,\n indent_for_name=True):\n \"\"\"\n Return the formatted obj as a unicode string\n\n Parameters\n ----------\n obj : object\n must be iterable and support __getitem__\n formatter : callable\n string formatter for an element\n is_justify : boolean\n should justify the display\n name : name, optional\n defaults to the class name of the obj\n indent_for_name : bool, default True\n Whether subsequent lines should be be indented to\n align with the name.\n\n Returns\n -------\n summary string\n\n \"\"\"\n from pandas.io.formats.console import get_console_size\n from pandas.io.formats.format import _get_adjustment\n\n display_width, _ = get_console_size()\n if display_width is None:\n display_width = get_option('display.width') or 80\n if name is None:\n name = obj.__class__.__name__\n\n if indent_for_name:\n name_len = len(name)\n space1 = \"\\n%s\" % (' ' * (name_len + 1))\n space2 = \"\\n%s\" % (' ' * (name_len + 2))\n else:\n space1 = \"\\n\"\n space2 = \"\\n \" # space for the opening '['\n\n n = len(obj)\n sep = ','\n max_seq_items = get_option('display.max_seq_items') or n\n\n # are we a truncated display\n is_truncated = n > max_seq_items\n\n # adj can optionally handle unicode eastern asian width\n adj = _get_adjustment()\n\n def _extend_line(s, line, value, display_width, next_line_prefix):\n\n if (adj.len(line.rstrip()) + adj.len(value.rstrip()) >=\n display_width):\n s += line.rstrip()\n line = next_line_prefix\n line += value\n return s, line\n\n def best_len(values):\n if values:\n return max(adj.len(x) for x in values)\n else:\n return 0\n\n close = ', '\n\n if n == 0:\n summary = '[]{}'.format(close)\n elif n == 1:\n first = formatter(obj[0])\n summary = '[{}]{}'.format(first, close)\n elif n == 2:\n first = formatter(obj[0])\n last = formatter(obj[-1])\n summary = '[{}, {}]{}'.format(first, last, close)\n else:\n\n if n > max_seq_items:\n n = min(max_seq_items // 2, 10)\n head = [formatter(x) for x in obj[:n]]\n tail = 
[formatter(x) for x in obj[-n:]]\n else:\n head = []\n tail = [formatter(x) for x in obj]\n\n # adjust all values to max length if needed\n if is_justify:\n\n # however, if we are not truncated and we are only a single\n # line, then don't justify\n if (is_truncated or\n not (len(', '.join(head)) < display_width and\n len(', '.join(tail)) < display_width)):\n max_len = max(best_len(head), best_len(tail))\n head = [x.rjust(max_len) for x in head]\n tail = [x.rjust(max_len) for x in tail]\n\n summary = \"\"\n line = space2\n\n for i in range(len(head)):\n word = head[i] + sep + ' '\n summary, line = _extend_line(summary, line, word,\n display_width, space2)\n\n if is_truncated:\n # remove trailing space of last line\n summary += line.rstrip() + space2 + '...'\n line = space2\n\n for i in range(len(tail) - 1):\n word = tail[i] + sep + ' '\n summary, line = _extend_line(summary, line, word,\n display_width, space2)\n\n # last value: no sep added + 1 space of width used for trailing ','\n summary, line = _extend_line(summary, line, tail[-1],\n display_width - 2, space2)\n summary += line\n\n # right now close is either '' or ', '\n # Now we want to include the ']', but not the maybe space.\n close = ']' + close.rstrip(' ')\n summary += close\n\n if len(summary) > (display_width):\n summary += space1\n else: # one row\n summary += ' '\n\n # remove initial space\n summary = '[' + summary[len(space2):]\n\n return summary"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef format_object_attrs(obj):\n attrs = []\n if hasattr(obj, 'dtype'):\n attrs.append(('dtype', \"'{}'\".format(obj.dtype)))\n if getattr(obj, 'name', None) is not None:\n attrs.append(('name', default_pprint(obj.name)))\n max_seq_items = get_option('display.max_seq_items') or len(obj)\n if len(obj) > max_seq_items:\n attrs.append(('length', len(obj)))\n return attrs", "response": "Returns a list of tuples of the attr formatted_value for common attrs including dtype name 
length"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef read_gbq(query, project_id=None, index_col=None, col_order=None,\n reauth=False, auth_local_webserver=False, dialect=None,\n location=None, configuration=None, credentials=None,\n use_bqstorage_api=None, private_key=None, verbose=None):\n \"\"\"\n Load data from Google BigQuery.\n\n This function requires the `pandas-gbq package\n `__.\n\n See the `How to authenticate with Google BigQuery\n `__\n guide for authentication instructions.\n\n Parameters\n ----------\n query : str\n SQL-Like Query to return data values.\n project_id : str, optional\n Google BigQuery Account project ID. Optional when available from\n the environment.\n index_col : str, optional\n Name of result column to use for index in results DataFrame.\n col_order : list(str), optional\n List of BigQuery column names in the desired order for results\n DataFrame.\n reauth : boolean, default False\n Force Google BigQuery to re-authenticate the user. This is useful\n if multiple accounts are used.\n auth_local_webserver : boolean, default False\n Use the `local webserver flow`_ instead of the `console flow`_\n when getting user credentials.\n\n .. _local webserver flow:\n http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server\n .. _console flow:\n http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console\n\n *New in version 0.2.0 of pandas-gbq*.\n dialect : str, default 'legacy'\n Note: The default value is changing to 'standard' in a future verion.\n\n SQL syntax dialect to use. Value can be one of:\n\n ``'legacy'``\n Use BigQuery's legacy SQL dialect. For more information see\n `BigQuery Legacy SQL Reference\n `__.\n ``'standard'``\n Use BigQuery's standard SQL, which is\n compliant with the SQL 2011 standard. 
For more information\n see `BigQuery Standard SQL Reference\n `__.\n\n .. versionchanged:: 0.24.0\n location : str, optional\n Location where the query job should run. See the `BigQuery locations\n documentation\n `__ for a\n list of available locations. The location must match that of any\n datasets used in the query.\n\n *New in version 0.5.0 of pandas-gbq*.\n configuration : dict, optional\n Query config parameters for job processing.\n For example:\n\n configuration = {'query': {'useQueryCache': False}}\n\n For more information see `BigQuery REST API Reference\n `__.\n credentials : google.auth.credentials.Credentials, optional\n Credentials for accessing Google APIs. Use this parameter to override\n default credentials, such as to use Compute Engine\n :class:`google.auth.compute_engine.Credentials` or Service Account\n :class:`google.oauth2.service_account.Credentials` directly.\n\n *New in version 0.8.0 of pandas-gbq*.\n\n .. versionadded:: 0.24.0\n use_bqstorage_api : bool, default False\n Use the `BigQuery Storage API\n `__ to\n download query results quickly, but at an increased cost. To use this\n API, first `enable it in the Cloud Console\n `__.\n You must also have the `bigquery.readsessions.create\n `__\n permission on the project you are billing queries to.\n\n This feature requires version 0.10.0 or later of the ``pandas-gbq``\n package. It also requires the ``google-cloud-bigquery-storage`` and\n ``fastavro`` packages.\n\n .. versionadded:: 0.25.0\n private_key : str, deprecated\n Deprecated in pandas-gbq version 0.8.0. Use the ``credentials``\n parameter and\n :func:`google.oauth2.service_account.Credentials.from_service_account_info`\n or\n :func:`google.oauth2.service_account.Credentials.from_service_account_file`\n instead.\n\n Service account private key in JSON format. Can be file path\n or string contents. This is useful for remote server\n authentication (eg. 
Jupyter/IPython notebook on remote host).\n verbose : None, deprecated\n Deprecated in pandas-gbq version 0.4.0. Use the `logging module to\n adjust verbosity instead\n `__.\n\n Returns\n -------\n df: DataFrame\n DataFrame representing results of query.\n\n See Also\n --------\n pandas_gbq.read_gbq : This function in the pandas-gbq library.\n DataFrame.to_gbq : Write a DataFrame to Google BigQuery.\n \"\"\"\n pandas_gbq = _try_import()\n\n kwargs = {}\n\n # START: new kwargs. Don't populate unless explicitly set.\n if use_bqstorage_api is not None:\n kwargs[\"use_bqstorage_api\"] = use_bqstorage_api\n # END: new kwargs\n\n # START: deprecated kwargs. Don't populate unless explicitly set.\n if verbose is not None:\n kwargs[\"verbose\"] = verbose\n\n if private_key is not None:\n kwargs[\"private_key\"] = private_key\n # END: deprecated kwargs\n\n return pandas_gbq.read_gbq(\n query, project_id=project_id, index_col=index_col,\n col_order=col_order, reauth=reauth,\n auth_local_webserver=auth_local_webserver, dialect=dialect,\n location=location, configuration=configuration,\n credentials=credentials, **kwargs)", "response": "Read data from Google BigQuery."} {"instruction": "Explain what the following Python 3 code does\ndef scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False,\n diagonal='hist', marker='.', density_kwds=None,\n hist_kwds=None, range_padding=0.05, **kwds):\n \"\"\"\n Draw a matrix of scatter plots.\n\n Parameters\n ----------\n frame : DataFrame\n alpha : float, optional\n amount of transparency applied\n figsize : (float,float), optional\n a tuple (width, height) in inches\n ax : Matplotlib axis object, optional\n grid : bool, optional\n setting this to True will show the grid\n diagonal : {'hist', 'kde'}\n pick between 'kde' and 'hist' for\n either Kernel Density Estimation or Histogram\n plot in the diagonal\n marker : str, optional\n Matplotlib marker type, default '.'\n hist_kwds : other plotting keyword arguments\n To be passed 
to hist function\n density_kwds : other plotting keyword arguments\n To be passed to kernel density estimate plot\n range_padding : float, optional\n relative extension of axis range in x and y\n with respect to (x_max - x_min) or (y_max - y_min),\n default 0.05\n kwds : other plotting keyword arguments\n To be passed to scatter function\n\n Examples\n --------\n >>> df = pd.DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D'])\n >>> scatter_matrix(df, alpha=0.2)\n \"\"\"\n\n df = frame._get_numeric_data()\n n = df.columns.size\n naxes = n * n\n fig, axes = _subplots(naxes=naxes, figsize=figsize, ax=ax,\n squeeze=False)\n\n # no gaps between subplots\n fig.subplots_adjust(wspace=0, hspace=0)\n\n mask = notna(df)\n\n marker = _get_marker_compat(marker)\n\n hist_kwds = hist_kwds or {}\n density_kwds = density_kwds or {}\n\n # GH 14855\n kwds.setdefault('edgecolors', 'none')\n\n boundaries_list = []\n for a in df.columns:\n values = df[a].values[mask[a].values]\n rmin_, rmax_ = np.min(values), np.max(values)\n rdelta_ext = (rmax_ - rmin_) * range_padding / 2.\n boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))\n\n for i, a in zip(lrange(n), df.columns):\n for j, b in zip(lrange(n), df.columns):\n ax = axes[i, j]\n\n if i == j:\n values = df[a].values[mask[a].values]\n\n # Deal with the diagonal by drawing a histogram there.\n if diagonal == 'hist':\n ax.hist(values, **hist_kwds)\n\n elif diagonal in ('kde', 'density'):\n from scipy.stats import gaussian_kde\n y = values\n gkde = gaussian_kde(y)\n ind = np.linspace(y.min(), y.max(), 1000)\n ax.plot(ind, gkde.evaluate(ind), **density_kwds)\n\n ax.set_xlim(boundaries_list[i])\n\n else:\n common = (mask[a] & mask[b]).values\n\n ax.scatter(df[b][common], df[a][common],\n marker=marker, alpha=alpha, **kwds)\n\n ax.set_xlim(boundaries_list[j])\n ax.set_ylim(boundaries_list[i])\n\n ax.set_xlabel(b)\n ax.set_ylabel(a)\n\n if j != 0:\n ax.yaxis.set_visible(False)\n if i != n - 1:\n 
ax.xaxis.set_visible(False)\n\n if len(df.columns) > 1:\n lim1 = boundaries_list[0]\n locs = axes[0][1].yaxis.get_majorticklocs()\n locs = locs[(lim1[0] <= locs) & (locs <= lim1[1])]\n adj = (locs - lim1[0]) / (lim1[1] - lim1[0])\n\n lim0 = axes[0][0].get_ylim()\n adj = adj * (lim0[1] - lim0[0]) + lim0[0]\n axes[0][0].yaxis.set_ticks(adj)\n\n if np.all(locs == locs.astype(int)):\n # if all ticks are int\n locs = locs.astype(int)\n axes[0][0].yaxis.set_ticklabels(locs)\n\n _set_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0)\n\n return axes", "response": "Draw a matrix of scatter plots."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds):\n import matplotlib.pyplot as plt\n import matplotlib.patches as patches\n\n def normalize(series):\n a = min(series)\n b = max(series)\n return (series - a) / (b - a)\n\n n = len(frame)\n classes = frame[class_column].drop_duplicates()\n class_col = frame[class_column]\n df = frame.drop(class_column, axis=1).apply(normalize)\n\n if ax is None:\n ax = plt.gca(xlim=[-1, 1], ylim=[-1, 1])\n\n to_plot = {}\n colors = _get_standard_colors(num_colors=len(classes), colormap=colormap,\n color_type='random', color=color)\n\n for kls in classes:\n to_plot[kls] = [[], []]\n\n m = len(frame.columns) - 1\n s = np.array([(np.cos(t), np.sin(t))\n for t in [2.0 * np.pi * (i / float(m))\n for i in range(m)]])\n\n for i in range(n):\n row = df.iloc[i].values\n row_ = np.repeat(np.expand_dims(row, axis=1), 2, axis=1)\n y = (s * row_).sum(axis=0) / row.sum()\n kls = class_col.iat[i]\n to_plot[kls][0].append(y[0])\n to_plot[kls][1].append(y[1])\n\n for i, kls in enumerate(classes):\n ax.scatter(to_plot[kls][0], to_plot[kls][1], color=colors[i],\n label=pprint_thing(kls), **kwds)\n ax.legend()\n\n ax.add_patch(patches.Circle((0.0, 0.0), radius=1.0, facecolor='none'))\n\n for xy, name in zip(s, df.columns):\n\n 
ax.add_patch(patches.Circle(xy, radius=0.025, facecolor='gray'))\n\n if xy[0] < 0.0 and xy[1] < 0.0:\n ax.text(xy[0] - 0.025, xy[1] - 0.025, name,\n ha='right', va='top', size='small')\n elif xy[0] < 0.0 and xy[1] >= 0.0:\n ax.text(xy[0] - 0.025, xy[1] + 0.025, name,\n ha='right', va='bottom', size='small')\n elif xy[0] >= 0.0 and xy[1] < 0.0:\n ax.text(xy[0] + 0.025, xy[1] - 0.025, name,\n ha='left', va='top', size='small')\n elif xy[0] >= 0.0 and xy[1] >= 0.0:\n ax.text(xy[0] + 0.025, xy[1] + 0.025, name,\n ha='left', va='bottom', size='small')\n\n ax.axis('equal')\n return ax", "response": "Plots a multidimensional dataset in 2D space."} {"instruction": "Write a Python 3 script to\ngenerate a matplotlib plot of Andrews curves for visualising a single cluster of a single class.", "response": "def andrews_curves(frame, class_column, ax=None, samples=200, color=None,\n colormap=None, **kwds):\n \"\"\"\n Generate a matplotlib plot of Andrews curves, for visualising clusters of\n multivariate data.\n\n Andrews curves have the functional form:\n\n f(t) = x_1/sqrt(2) + x_2 sin(t) + x_3 cos(t) +\n x_4 sin(2t) + x_5 cos(2t) + ...\n\n Where x coefficients correspond to the values of each dimension and t is\n linearly spaced between -pi and +pi. Each row of frame then corresponds to\n a single curve.\n\n Parameters\n ----------\n frame : DataFrame\n Data to be plotted, preferably normalized to (0.0, 1.0)\n class_column : Name of the column containing class names\n ax : matplotlib axes object, default None\n samples : Number of points to plot in each curve\n color : list or tuple, optional\n Colors to use for the different classes\n colormap : str or matplotlib colormap object, default None\n Colormap to select colors from. 
If string, load colormap with that name\n from matplotlib.\n kwds : keywords\n Options to pass to matplotlib plotting method\n\n Returns\n -------\n class:`matplotlip.axis.Axes`\n \"\"\"\n from math import sqrt, pi\n import matplotlib.pyplot as plt\n\n def function(amplitudes):\n def f(t):\n x1 = amplitudes[0]\n result = x1 / sqrt(2.0)\n\n # Take the rest of the coefficients and resize them\n # appropriately. Take a copy of amplitudes as otherwise numpy\n # deletes the element from amplitudes itself.\n coeffs = np.delete(np.copy(amplitudes), 0)\n coeffs.resize(int((coeffs.size + 1) / 2), 2)\n\n # Generate the harmonics and arguments for the sin and cos\n # functions.\n harmonics = np.arange(0, coeffs.shape[0]) + 1\n trig_args = np.outer(harmonics, t)\n\n result += np.sum(coeffs[:, 0, np.newaxis] * np.sin(trig_args) +\n coeffs[:, 1, np.newaxis] * np.cos(trig_args),\n axis=0)\n return result\n return f\n\n n = len(frame)\n class_col = frame[class_column]\n classes = frame[class_column].drop_duplicates()\n df = frame.drop(class_column, axis=1)\n t = np.linspace(-pi, pi, samples)\n used_legends = set()\n\n color_values = _get_standard_colors(num_colors=len(classes),\n colormap=colormap, color_type='random',\n color=color)\n colors = dict(zip(classes, color_values))\n if ax is None:\n ax = plt.gca(xlim=(-pi, pi))\n for i in range(n):\n row = df.iloc[i].values\n f = function(row)\n y = f(t)\n kls = class_col.iat[i]\n label = pprint_thing(kls)\n if label not in used_legends:\n used_legends.add(label)\n ax.plot(t, y, color=colors[kls], label=label, **kwds)\n else:\n ax.plot(t, y, color=colors[kls], **kwds)\n\n ax.legend(loc='upper right')\n ax.grid()\n return ax"} {"instruction": "Write a Python 3 script to\nbootstrap plot on mean, median and mid-range statistics. The bootstrap plot is used to estimate the uncertainty of a statistic by relaying on random sampling with replacement [1]_. 
This function will generate bootstrapping plots for mean, median and mid-range statistics for the given number of samples of the given size. .. [1] \"Bootstrapping (statistics)\" in \\ https://en.wikipedia.org/wiki/Bootstrapping_%28statistics%29 Parameters ---------- series : pandas.Series Pandas Series from where to get the samplings for the bootstrapping. fig : matplotlib.figure.Figure, default None If given, it will use the `fig` reference for plotting instead of creating a new one with default parameters. size : int, default 50 Number of data points to consider during each sampling. It must be greater or equal than the length of the `series`. samples : int, default 500 Number of times the bootstrap procedure is performed. **kwds : Options to pass to matplotlib plotting method. Returns ------- matplotlib.figure.Figure Matplotlib figure. See Also -------- DataFrame.plot : Basic plotting for DataFrame objects. Series.plot : Basic plotting for Series objects. Examples -------- .. plot:: :context: close-figs >>> s = pd.Series(np.random.uniform(size=100)) >>> fig = pd.plotting.bootstrap_plot(s) # doctest: +SKIP", "response": "def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds):\n \"\"\"\n Bootstrap plot on mean, median and mid-range statistics.\n\n The bootstrap plot is used to estimate the uncertainty of a statistic\n by relaying on random sampling with replacement [1]_. This function will\n generate bootstrapping plots for mean, median and mid-range statistics\n for the given number of samples of the given size.\n\n .. 
[1] \"Bootstrapping (statistics)\" in \\\n https://en.wikipedia.org/wiki/Bootstrapping_%28statistics%29\n\n Parameters\n ----------\n series : pandas.Series\n Pandas Series from where to get the samplings for the bootstrapping.\n fig : matplotlib.figure.Figure, default None\n If given, it will use the `fig` reference for plotting instead of\n creating a new one with default parameters.\n size : int, default 50\n Number of data points to consider during each sampling. It must be\n greater or equal than the length of the `series`.\n samples : int, default 500\n Number of times the bootstrap procedure is performed.\n **kwds :\n Options to pass to matplotlib plotting method.\n\n Returns\n -------\n matplotlib.figure.Figure\n Matplotlib figure.\n\n See Also\n --------\n DataFrame.plot : Basic plotting for DataFrame objects.\n Series.plot : Basic plotting for Series objects.\n\n Examples\n --------\n\n .. plot::\n :context: close-figs\n\n >>> s = pd.Series(np.random.uniform(size=100))\n >>> fig = pd.plotting.bootstrap_plot(s) # doctest: +SKIP\n \"\"\"\n import random\n import matplotlib.pyplot as plt\n\n # random.sample(ndarray, int) fails on python 3.3, sigh\n data = list(series.values)\n samplings = [random.sample(data, size) for _ in range(samples)]\n\n means = np.array([np.mean(sampling) for sampling in samplings])\n medians = np.array([np.median(sampling) for sampling in samplings])\n midranges = np.array([(min(sampling) + max(sampling)) * 0.5\n for sampling in samplings])\n if fig is None:\n fig = plt.figure()\n x = lrange(samples)\n axes = []\n ax1 = fig.add_subplot(2, 3, 1)\n ax1.set_xlabel(\"Sample\")\n axes.append(ax1)\n ax1.plot(x, means, **kwds)\n ax2 = fig.add_subplot(2, 3, 2)\n ax2.set_xlabel(\"Sample\")\n axes.append(ax2)\n ax2.plot(x, medians, **kwds)\n ax3 = fig.add_subplot(2, 3, 3)\n ax3.set_xlabel(\"Sample\")\n axes.append(ax3)\n ax3.plot(x, midranges, **kwds)\n ax4 = fig.add_subplot(2, 3, 4)\n ax4.set_xlabel(\"Mean\")\n axes.append(ax4)\n 
ax4.hist(means, **kwds)\n ax5 = fig.add_subplot(2, 3, 5)\n ax5.set_xlabel(\"Median\")\n axes.append(ax5)\n ax5.hist(medians, **kwds)\n ax6 = fig.add_subplot(2, 3, 6)\n ax6.set_xlabel(\"Midrange\")\n axes.append(ax6)\n ax6.hist(midranges, **kwds)\n for axis in axes:\n plt.setp(axis.get_xticklabels(), fontsize=8)\n plt.setp(axis.get_yticklabels(), fontsize=8)\n return fig"} {"instruction": "Make a summary of the following Python 3 code\ndef parallel_coordinates(frame, class_column, cols=None, ax=None, color=None,\n use_columns=False, xticks=None, colormap=None,\n axvlines=True, axvlines_kwds=None, sort_labels=False,\n **kwds):\n \"\"\"Parallel coordinates plotting.\n\n Parameters\n ----------\n frame : DataFrame\n class_column : str\n Column name containing class names\n cols : list, optional\n A list of column names to use\n ax : matplotlib.axis, optional\n matplotlib axis object\n color : list or tuple, optional\n Colors to use for the different classes\n use_columns : bool, optional\n If true, columns will be used as xticks\n xticks : list or tuple, optional\n A list of values to use for xticks\n colormap : str or matplotlib colormap, default None\n Colormap to use for line colors.\n axvlines : bool, optional\n If true, vertical lines will be added at each xtick\n axvlines_kwds : keywords, optional\n Options to be passed to axvline method for vertical lines\n sort_labels : bool, False\n Sort class_column labels, useful when assigning colors\n\n .. 
versionadded:: 0.20.0\n\n kwds : keywords\n Options to pass to matplotlib plotting method\n\n Returns\n -------\n class:`matplotlib.axis.Axes`\n\n Examples\n --------\n >>> from matplotlib import pyplot as plt\n >>> df = pd.read_csv('https://raw.github.com/pandas-dev/pandas/master'\n '/pandas/tests/data/iris.csv')\n >>> pd.plotting.parallel_coordinates(\n df, 'Name',\n color=('#556270', '#4ECDC4', '#C7F464'))\n >>> plt.show()\n \"\"\"\n if axvlines_kwds is None:\n axvlines_kwds = {'linewidth': 1, 'color': 'black'}\n import matplotlib.pyplot as plt\n\n n = len(frame)\n classes = frame[class_column].drop_duplicates()\n class_col = frame[class_column]\n\n if cols is None:\n df = frame.drop(class_column, axis=1)\n else:\n df = frame[cols]\n\n used_legends = set()\n\n ncols = len(df.columns)\n\n # determine values to use for xticks\n if use_columns is True:\n if not np.all(np.isreal(list(df.columns))):\n raise ValueError('Columns must be numeric to be used as xticks')\n x = df.columns\n elif xticks is not None:\n if not np.all(np.isreal(xticks)):\n raise ValueError('xticks specified must be numeric')\n elif len(xticks) != ncols:\n raise ValueError('Length of xticks must match number of columns')\n x = xticks\n else:\n x = lrange(ncols)\n\n if ax is None:\n ax = plt.gca()\n\n color_values = _get_standard_colors(num_colors=len(classes),\n colormap=colormap, color_type='random',\n color=color)\n\n if sort_labels:\n classes = sorted(classes)\n color_values = sorted(color_values)\n colors = dict(zip(classes, color_values))\n\n for i in range(n):\n y = df.iloc[i].values\n kls = class_col.iat[i]\n label = pprint_thing(kls)\n if label not in used_legends:\n used_legends.add(label)\n ax.plot(x, y, color=colors[kls], label=label, **kwds)\n else:\n ax.plot(x, y, color=colors[kls], **kwds)\n\n if axvlines:\n for i in x:\n ax.axvline(i, **axvlines_kwds)\n\n ax.set_xticks(x)\n ax.set_xticklabels(df.columns)\n ax.set_xlim(x[0], x[-1])\n ax.legend(loc='upper right')\n ax.grid()\n 
return ax", "response": "Plots a series of parallel coordinates for the given class_column."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef lag_plot(series, lag=1, ax=None, **kwds):\n import matplotlib.pyplot as plt\n\n # workaround because `c='b'` is hardcoded in matplotlibs scatter method\n kwds.setdefault('c', plt.rcParams['patch.facecolor'])\n\n data = series.values\n y1 = data[:-lag]\n y2 = data[lag:]\n if ax is None:\n ax = plt.gca()\n ax.set_xlabel(\"y(t)\")\n ax.set_ylabel(\"y(t + {lag})\".format(lag=lag))\n ax.scatter(y1, y2, **kwds)\n return ax", "response": "Lag plot for time series."} {"instruction": "Given the following Python 3 function, write the documentation\ndef autocorrelation_plot(series, ax=None, **kwds):\n import matplotlib.pyplot as plt\n n = len(series)\n data = np.asarray(series)\n if ax is None:\n ax = plt.gca(xlim=(1, n), ylim=(-1.0, 1.0))\n mean = np.mean(data)\n c0 = np.sum((data - mean) ** 2) / float(n)\n\n def r(h):\n return ((data[:n - h] - mean) *\n (data[h:] - mean)).sum() / float(n) / c0\n x = np.arange(n) + 1\n y = lmap(r, x)\n z95 = 1.959963984540054\n z99 = 2.5758293035489004\n ax.axhline(y=z99 / np.sqrt(n), linestyle='--', color='grey')\n ax.axhline(y=z95 / np.sqrt(n), color='grey')\n ax.axhline(y=0.0, color='black')\n ax.axhline(y=-z95 / np.sqrt(n), color='grey')\n ax.axhline(y=-z99 / np.sqrt(n), linestyle='--', color='grey')\n ax.set_xlabel(\"Lag\")\n ax.set_ylabel(\"Autocorrelation\")\n ax.plot(x, y, **kwds)\n if 'label' in kwds:\n ax.legend()\n ax.grid()\n return ax", "response": "Plots the autocorrelation of a time series."} {"instruction": "How would you explain what the following Python 3 function does\ndef _any_pandas_objects(terms):\n return any(isinstance(term.value, pd.core.generic.PandasObject)\n for term in terms)", "response": "Check a sequence of terms for instances of PandasObject."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef 
_align(terms):\n try:\n # flatten the parse tree (a nested list, really)\n terms = list(com.flatten(terms))\n except TypeError:\n # can't iterate so it must just be a constant or single variable\n if isinstance(terms.value, pd.core.generic.NDFrame):\n typ = type(terms.value)\n return typ, _zip_axes_from_type(typ, terms.value.axes)\n return np.result_type(terms.type), None\n\n # if all resolved variables are numeric scalars\n if all(term.is_scalar for term in terms):\n return _result_type_many(*(term.value for term in terms)).type, None\n\n # perform the main alignment\n typ, axes = _align_core(terms)\n return typ, axes", "response": "Align a set of terms into a single object"} {"instruction": "Implement a Python 3 function for\nreconstructing an object given its type raw value and possibly empty tuple.", "response": "def _reconstruct_object(typ, obj, axes, dtype):\n \"\"\"Reconstruct an object given its type, raw value, and possibly empty\n (None) axes.\n\n Parameters\n ----------\n typ : object\n A type\n obj : object\n The value to use in the type constructor\n axes : dict\n The axes to use to construct the resulting pandas object\n\n Returns\n -------\n ret : typ\n An object of type ``typ`` with the value `obj` and possible axes\n `axes`.\n \"\"\"\n try:\n typ = typ.type\n except AttributeError:\n pass\n\n res_t = np.result_type(obj.dtype, dtype)\n\n if (not isinstance(typ, partial) and\n issubclass(typ, pd.core.generic.PandasObject)):\n return typ(obj, dtype=res_t, **axes)\n\n # special case for pathological things like ~True/~False\n if hasattr(res_t, 'type') and typ == np.bool_ and res_t != np.bool_:\n ret_value = res_t.type(obj)\n else:\n ret_value = typ(obj).astype(res_t)\n # The condition is to distinguish 0-dim array (returned in case of\n # scalar) and 1 element array\n # e.g. 
np.array(0) and np.array([0])\n if len(obj.shape) == 1 and len(obj) == 1:\n if not isinstance(ret_value, np.ndarray):\n ret_value = np.array([ret_value]).astype(res_t)\n\n return ret_value"} {"instruction": "Make a summary of the following Python 3 code\ndef tsplot(series, plotf, ax=None, **kwargs):\n import warnings\n \"\"\"\n Plots a Series on the given Matplotlib axes or the current axes\n\n Parameters\n ----------\n axes : Axes\n series : Series\n\n Notes\n _____\n Supports same kwargs as Axes.plot\n\n\n .. deprecated:: 0.23.0\n Use Series.plot() instead\n \"\"\"\n warnings.warn(\"'tsplot' is deprecated and will be removed in a \"\n \"future version. Please use Series.plot() instead.\",\n FutureWarning, stacklevel=2)\n\n # Used inferred freq is possible, need a test case for inferred\n if ax is None:\n import matplotlib.pyplot as plt\n ax = plt.gca()\n\n freq, series = _maybe_resample(series, ax, kwargs)\n\n # Set ax with freq info\n _decorate_axes(ax, freq, kwargs)\n ax._plot_data.append((series, plotf, kwargs))\n lines = plotf(ax, series.index._mpl_repr(), series.values, **kwargs)\n\n # set date formatter, locators and rescale limits\n format_dateaxis(ax, ax.freq, series.index)\n return lines", "response": "Plots a Series on the given Matplotlib axes or the current axes."} {"instruction": "Can you create a Python 3 function that\ndecorates axes for time - series plotting", "response": "def _decorate_axes(ax, freq, kwargs):\n \"\"\"Initialize axes for time-series plotting\"\"\"\n if not hasattr(ax, '_plot_data'):\n ax._plot_data = []\n\n ax.freq = freq\n xaxis = ax.get_xaxis()\n xaxis.freq = freq\n if not hasattr(ax, 'legendlabels'):\n ax.legendlabels = [kwargs.get('label', None)]\n else:\n ax.legendlabels.append(kwargs.get('label', None))\n ax.view_interval = None\n ax.date_axis_info = None"} {"instruction": "How would you explain what the following Python 3 function does\ndef _get_ax_freq(ax):\n ax_freq = getattr(ax, 'freq', None)\n if ax_freq is None:\n # 
check for left/right ax in case of secondary yaxis\n if hasattr(ax, 'left_ax'):\n ax_freq = getattr(ax.left_ax, 'freq', None)\n elif hasattr(ax, 'right_ax'):\n ax_freq = getattr(ax.right_ax, 'freq', None)\n if ax_freq is None:\n # check if a shared ax (sharex/twinx) has already freq set\n shared_axes = ax.get_shared_x_axes().get_siblings(ax)\n if len(shared_axes) > 1:\n for shared_ax in shared_axes:\n ax_freq = getattr(shared_ax, 'freq', None)\n if ax_freq is not None:\n break\n return ax_freq", "response": "Get the freq attribute of the ax object if set."} {"instruction": "Create a Python 3 function to\nconvert seconds to D days HH : MM : SS. F", "response": "def format_timedelta_ticks(x, pos, n_decimals):\n \"\"\"\n Convert seconds to 'D days HH:MM:SS.F'\n \"\"\"\n s, ns = divmod(x, 1e9)\n m, s = divmod(s, 60)\n h, m = divmod(m, 60)\n d, h = divmod(h, 24)\n decimals = int(ns * 10**(n_decimals - 9))\n s = r'{:02d}:{:02d}:{:02d}'.format(int(h), int(m), int(s))\n if n_decimals > 0:\n s += '.{{:0{:0d}d}}'.format(n_decimals).format(decimals)\n if d != 0:\n s = '{:d} days '.format(int(d)) + s\n return s"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef format_dateaxis(subplot, freq, index):\n\n # handle index specific formatting\n # Note: DatetimeIndex does not use this\n # interface. 
DatetimeIndex uses matplotlib.date directly\n if isinstance(index, ABCPeriodIndex):\n\n majlocator = TimeSeries_DateLocator(freq, dynamic_mode=True,\n minor_locator=False,\n plot_obj=subplot)\n minlocator = TimeSeries_DateLocator(freq, dynamic_mode=True,\n minor_locator=True,\n plot_obj=subplot)\n subplot.xaxis.set_major_locator(majlocator)\n subplot.xaxis.set_minor_locator(minlocator)\n\n majformatter = TimeSeries_DateFormatter(freq, dynamic_mode=True,\n minor_locator=False,\n plot_obj=subplot)\n minformatter = TimeSeries_DateFormatter(freq, dynamic_mode=True,\n minor_locator=True,\n plot_obj=subplot)\n subplot.xaxis.set_major_formatter(majformatter)\n subplot.xaxis.set_minor_formatter(minformatter)\n\n # x and y coord info\n subplot.format_coord = functools.partial(_format_coord, freq)\n\n elif isinstance(index, ABCTimedeltaIndex):\n subplot.xaxis.set_major_formatter(\n TimeSeries_TimedeltaFormatter())\n else:\n raise TypeError('index type not supported')\n\n pylab.draw_if_interactive()", "response": "Format the date axis for the current time series."} {"instruction": "Write a Python 3 script to\nreturn True if all the columns in a DataFrame have the same type.", "response": "def _is_homogeneous_type(self):\n \"\"\"\n Whether all the columns in a DataFrame have the same type.\n\n Returns\n -------\n bool\n\n Examples\n --------\n >>> DataFrame({\"A\": [1, 2], \"B\": [3, 4]})._is_homogeneous_type\n True\n >>> DataFrame({\"A\": [1, 2], \"B\": [3.0, 4.0]})._is_homogeneous_type\n False\n\n Items with the same type but different sizes are considered\n different types.\n\n >>> DataFrame({\n ... \"A\": np.array([1, 2], dtype=np.int32),\n ... 
\"B\": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type\n False\n \"\"\"\n if self._data.any_extension_types:\n return len({block.dtype for block in self._data.blocks}) == 1\n else:\n return not self._data.is_mixed_type"} {"instruction": "Make a summary of the following Python 3 code\ndef _repr_html_(self):\n if self._info_repr():\n buf = StringIO(\"\")\n self.info(buf=buf)\n # need to escape the , should be the first line.\n val = buf.getvalue().replace('<', r'<', 1)\n val = val.replace('>', r'>', 1)\n return '
' + val + '
'\n\n if get_option(\"display.notebook_repr_html\"):\n max_rows = get_option(\"display.max_rows\")\n max_cols = get_option(\"display.max_columns\")\n show_dimensions = get_option(\"display.show_dimensions\")\n\n return self.to_html(max_rows=max_rows, max_cols=max_cols,\n show_dimensions=show_dimensions, notebook=True)\n else:\n return None", "response": "Return a html representation for a particular DataFrame."} {"instruction": "Implement a Python 3 function for\nrendering a DataFrame to a console - friendly string.", "response": "def to_string(self, buf=None, columns=None, col_space=None, header=True,\n index=True, na_rep='NaN', formatters=None, float_format=None,\n sparsify=None, index_names=True, justify=None,\n max_rows=None, max_cols=None, show_dimensions=False,\n decimal='.', line_width=None):\n \"\"\"\n Render a DataFrame to a console-friendly tabular output.\n %(shared_params)s\n line_width : int, optional\n Width to wrap a line in characters.\n %(returns)s\n See Also\n --------\n to_html : Convert DataFrame to HTML.\n\n Examples\n --------\n >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}\n >>> df = pd.DataFrame(d)\n >>> print(df.to_string())\n col1 col2\n 0 1 4\n 1 2 5\n 2 3 6\n \"\"\"\n\n formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns,\n col_space=col_space, na_rep=na_rep,\n formatters=formatters,\n float_format=float_format,\n sparsify=sparsify, justify=justify,\n index_names=index_names,\n header=header, index=index,\n max_rows=max_rows,\n max_cols=max_cols,\n show_dimensions=show_dimensions,\n decimal=decimal,\n line_width=line_width)\n formatter.to_string()\n\n if buf is None:\n result = formatter.buf.getvalue()\n return result"} {"instruction": "How would you explain what the following Python 3 function does\ndef iteritems(self):\n if self.columns.is_unique and hasattr(self, '_item_cache'):\n for k in self.columns:\n yield k, self._get_item_cache(k)\n else:\n for i, k in enumerate(self.columns):\n yield k, self._ixs(i, axis=1)", 
"response": "r Returns an iterator over the items in the DataFrame."} {"instruction": "Write a Python 3 script for\niterating over DataFrame rows as namedtuples and Series objects.", "response": "def iterrows(self):\n \"\"\"\n Iterate over DataFrame rows as (index, Series) pairs.\n\n Yields\n ------\n index : label or tuple of label\n The index of the row. A tuple for a `MultiIndex`.\n data : Series\n The data of the row as a Series.\n\n it : generator\n A generator that iterates over the rows of the frame.\n\n See Also\n --------\n itertuples : Iterate over DataFrame rows as namedtuples of the values.\n iteritems : Iterate over (column name, Series) pairs.\n\n Notes\n -----\n\n 1. Because ``iterrows`` returns a Series for each row,\n it does **not** preserve dtypes across the rows (dtypes are\n preserved across columns for DataFrames). For example,\n\n >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])\n >>> row = next(df.iterrows())[1]\n >>> row\n int 1.0\n float 1.5\n Name: 0, dtype: float64\n >>> print(row['int'].dtype)\n float64\n >>> print(df['int'].dtype)\n int64\n\n To preserve dtypes while iterating over the rows, it is better\n to use :meth:`itertuples` which returns namedtuples of the values\n and which is generally faster than ``iterrows``.\n\n 2. You should **never modify** something you are iterating over.\n This is not guaranteed to work in all cases. 
Depending on the\n data types, the iterator returns a copy and not a view, and writing\n to it will have no effect.\n \"\"\"\n columns = self.columns\n klass = self._constructor_sliced\n for k, v in zip(self.index, self.values):\n s = klass(v, index=columns, name=k)\n yield k, s"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef itertuples(self, index=True, name=\"Pandas\"):\n arrays = []\n fields = list(self.columns)\n if index:\n arrays.append(self.index)\n fields.insert(0, \"Index\")\n\n # use integer indexing because of possible duplicate column names\n arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))\n\n # Python 3 supports at most 255 arguments to constructor\n if name is not None and len(self.columns) + index < 256:\n itertuple = collections.namedtuple(name, fields, rename=True)\n return map(itertuple._make, zip(*arrays))\n\n # fallback to regular tuples\n return zip(*arrays)", "response": "Iterate over the DataFrame rows as namedtuples."} {"instruction": "How would you explain what the following Python 3 function does\ndef dot(self, other):\n if isinstance(other, (Series, DataFrame)):\n common = self.columns.union(other.index)\n if (len(common) > len(self.columns) or\n len(common) > len(other.index)):\n raise ValueError('matrices are not aligned')\n\n left = self.reindex(columns=common, copy=False)\n right = other.reindex(index=common, copy=False)\n lvals = left.values\n rvals = right.values\n else:\n left = self\n lvals = self.values\n rvals = np.asarray(other)\n if lvals.shape[1] != rvals.shape[0]:\n raise ValueError('Dot product shape mismatch, '\n '{s} vs {r}'.format(s=lvals.shape,\n r=rvals.shape))\n\n if isinstance(other, DataFrame):\n return self._constructor(np.dot(lvals, rvals), index=left.index,\n columns=other.columns)\n elif isinstance(other, Series):\n return Series(np.dot(lvals, rvals), index=left.index)\n elif isinstance(rvals, (np.ndarray, Index)):\n result = np.dot(lvals, rvals)\n if 
result.ndim == 2:\n return self._constructor(result, index=left.index)\n else:\n return Series(result, index=left.index)\n else: # pragma: no cover\n raise TypeError('unsupported type: {oth}'.format(oth=type(other)))", "response": "This method computes the matrix mutiplication between the DataFrame and other."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef from_dict(cls, data, orient='columns', dtype=None, columns=None):\n index = None\n orient = orient.lower()\n if orient == 'index':\n if len(data) > 0:\n # TODO speed up Series case\n if isinstance(list(data.values())[0], (Series, dict)):\n data = _from_nested_dict(data)\n else:\n data, index = list(data.values()), list(data.keys())\n elif orient == 'columns':\n if columns is not None:\n raise ValueError(\"cannot use columns parameter with \"\n \"orient='columns'\")\n else: # pragma: no cover\n raise ValueError('only recognize index or columns for orient')\n\n return cls(data, index=index, columns=columns, dtype=dtype)", "response": "Construct a DataFrame object from a dictionary of array - like or dicts."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef to_numpy(self, dtype=None, copy=False):\n result = np.array(self.values, dtype=dtype, copy=copy)\n return result", "response": "Convert the DataFrame to a NumPy array."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef to_dict(self, orient='dict', into=dict):\n if not self.columns.is_unique:\n warnings.warn(\"DataFrame columns are not unique, some \"\n \"columns will be omitted.\", UserWarning,\n stacklevel=2)\n # GH16122\n into_c = com.standardize_mapping(into)\n if orient.lower().startswith('d'):\n return into_c(\n (k, v.to_dict(into)) for k, v in self.items())\n elif orient.lower().startswith('l'):\n return into_c((k, v.tolist()) for k, v in self.items())\n elif orient.lower().startswith('sp'):\n return into_c((('index', 
self.index.tolist()),\n ('columns', self.columns.tolist()),\n ('data', [\n list(map(com.maybe_box_datetimelike, t))\n for t in self.itertuples(index=False, name=None)\n ])))\n elif orient.lower().startswith('s'):\n return into_c((k, com.maybe_box_datetimelike(v))\n for k, v in self.items())\n elif orient.lower().startswith('r'):\n columns = self.columns.tolist()\n rows = (dict(zip(columns, row))\n for row in self.itertuples(index=False, name=None))\n return [\n into_c((k, com.maybe_box_datetimelike(v))\n for k, v in row.items())\n for row in rows]\n elif orient.lower().startswith('i'):\n if not self.index.is_unique:\n raise ValueError(\n \"DataFrame index must be unique for orient='index'.\"\n )\n return into_c((t[0], dict(zip(self.columns, t[1:])))\n for t in self.itertuples(name=None))\n else:\n raise ValueError(\"orient '{o}' not understood\".format(o=orient))", "response": "Convert the DataFrame to a dictionary."} {"instruction": "Explain what the following Python 3 code does\ndef to_gbq(self, destination_table, project_id=None, chunksize=None,\n reauth=False, if_exists='fail', auth_local_webserver=False,\n table_schema=None, location=None, progress_bar=True,\n credentials=None, verbose=None, private_key=None):\n \"\"\"\n Write a DataFrame to a Google BigQuery table.\n\n This function requires the `pandas-gbq package\n `__.\n\n See the `How to authenticate with Google BigQuery\n `__\n guide for authentication instructions.\n\n Parameters\n ----------\n destination_table : str\n Name of table to be written, in the form ``dataset.tablename``.\n project_id : str, optional\n Google BigQuery Account project ID. Optional when available from\n the environment.\n chunksize : int, optional\n Number of rows to be inserted in each chunk from the dataframe.\n Set to ``None`` to load the whole dataframe at once.\n reauth : bool, default False\n Force Google BigQuery to re-authenticate the user. 
This is useful\n if multiple accounts are used.\n if_exists : str, default 'fail'\n Behavior when the destination table exists. Value can be one of:\n\n ``'fail'``\n If table exists, do nothing.\n ``'replace'``\n If table exists, drop it, recreate it, and insert data.\n ``'append'``\n If table exists, insert data. Create if does not exist.\n auth_local_webserver : bool, default False\n Use the `local webserver flow`_ instead of the `console flow`_\n when getting user credentials.\n\n .. _local webserver flow:\n http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server\n .. _console flow:\n http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console\n\n *New in version 0.2.0 of pandas-gbq*.\n table_schema : list of dicts, optional\n List of BigQuery table fields to which according DataFrame\n columns conform to, e.g. ``[{'name': 'col1', 'type':\n 'STRING'},...]``. If schema is not provided, it will be\n generated according to dtypes of DataFrame columns. See\n BigQuery API documentation on available names of a field.\n\n *New in version 0.3.1 of pandas-gbq*.\n location : str, optional\n Location where the load job should run. See the `BigQuery locations\n documentation\n `__ for a\n list of available locations. The location must match that of the\n target dataset.\n\n *New in version 0.5.0 of pandas-gbq*.\n progress_bar : bool, default True\n Use the library `tqdm` to show the progress bar for the upload,\n chunk by chunk.\n\n *New in version 0.5.0 of pandas-gbq*.\n credentials : google.auth.credentials.Credentials, optional\n Credentials for accessing Google APIs. 
Use this parameter to\n override default credentials, such as to use Compute Engine\n :class:`google.auth.compute_engine.Credentials` or Service\n Account :class:`google.oauth2.service_account.Credentials`\n directly.\n\n *New in version 0.8.0 of pandas-gbq*.\n\n .. versionadded:: 0.24.0\n verbose : bool, deprecated\n Deprecated in pandas-gbq version 0.4.0. Use the `logging module\n to adjust verbosity instead\n `__.\n private_key : str, deprecated\n Deprecated in pandas-gbq version 0.8.0. Use the ``credentials``\n parameter and\n :func:`google.oauth2.service_account.Credentials.from_service_account_info`\n or\n :func:`google.oauth2.service_account.Credentials.from_service_account_file`\n instead.\n\n Service account private key in JSON format. Can be file path\n or string contents. This is useful for remote server\n authentication (eg. Jupyter/IPython notebook on remote host).\n\n See Also\n --------\n pandas_gbq.to_gbq : This function in the pandas-gbq library.\n read_gbq : Read a DataFrame from Google BigQuery.\n \"\"\"\n from pandas.io import gbq\n return gbq.to_gbq(\n self, destination_table, project_id=project_id,\n chunksize=chunksize, reauth=reauth, if_exists=if_exists,\n auth_local_webserver=auth_local_webserver,\n table_schema=table_schema, location=location,\n progress_bar=progress_bar, credentials=credentials,\n verbose=verbose, private_key=private_key)", "response": "Write a DataFrame to a Google BigQuery table."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef from_records(cls, data, index=None, exclude=None, columns=None,\n coerce_float=False, nrows=None):\n \"\"\"\n Convert structured or record ndarray to DataFrame.\n\n Parameters\n ----------\n data : ndarray (structured dtype), list of tuples, dict, or DataFrame\n index : string, list of fields, array-like\n Field of array to use as the index, alternately a specific set of\n input labels to use\n exclude : sequence, default None\n Columns or fields to exclude\n 
columns : sequence, default None\n Column names to use. If the passed data do not have names\n associated with them, this argument provides names for the\n columns. Otherwise this argument indicates the order of the columns\n in the result (any names not found in the data will become all-NA\n columns)\n coerce_float : boolean, default False\n Attempt to convert values of non-string, non-numeric objects (like\n decimal.Decimal) to floating point, useful for SQL result sets\n nrows : int, default None\n Number of rows to read if data is an iterator\n\n Returns\n -------\n DataFrame\n \"\"\"\n\n # Make a copy of the input columns so we can modify it\n if columns is not None:\n columns = ensure_index(columns)\n\n if is_iterator(data):\n if nrows == 0:\n return cls()\n\n try:\n first_row = next(data)\n except StopIteration:\n return cls(index=index, columns=columns)\n\n dtype = None\n if hasattr(first_row, 'dtype') and first_row.dtype.names:\n dtype = first_row.dtype\n\n values = [first_row]\n\n if nrows is None:\n values += data\n else:\n values.extend(itertools.islice(data, nrows - 1))\n\n if dtype is not None:\n data = np.array(values, dtype=dtype)\n else:\n data = values\n\n if isinstance(data, dict):\n if columns is None:\n columns = arr_columns = ensure_index(sorted(data))\n arrays = [data[k] for k in columns]\n else:\n arrays = []\n arr_columns = []\n for k, v in data.items():\n if k in columns:\n arr_columns.append(k)\n arrays.append(v)\n\n arrays, arr_columns = reorder_arrays(arrays, arr_columns,\n columns)\n\n elif isinstance(data, (np.ndarray, DataFrame)):\n arrays, columns = to_arrays(data, columns)\n if columns is not None:\n columns = ensure_index(columns)\n arr_columns = columns\n else:\n arrays, arr_columns = to_arrays(data, columns,\n coerce_float=coerce_float)\n\n arr_columns = ensure_index(arr_columns)\n if columns is not None:\n columns = ensure_index(columns)\n else:\n columns = arr_columns\n\n if exclude is None:\n exclude = set()\n else:\n exclude 
= set(exclude)\n\n result_index = None\n if index is not None:\n if (isinstance(index, str) or\n not hasattr(index, \"__iter__\")):\n i = columns.get_loc(index)\n exclude.add(index)\n if len(arrays) > 0:\n result_index = Index(arrays[i], name=index)\n else:\n result_index = Index([], name=index)\n else:\n try:\n index_data = [arrays[arr_columns.get_loc(field)]\n for field in index]\n result_index = ensure_index_from_sequences(index_data,\n names=index)\n\n exclude.update(index)\n except Exception:\n result_index = index\n\n if any(exclude):\n arr_exclude = [x for x in exclude if x in arr_columns]\n to_remove = [arr_columns.get_loc(col) for col in arr_exclude]\n arrays = [v for i, v in enumerate(arrays) if i not in to_remove]\n\n arr_columns = arr_columns.drop(arr_exclude)\n columns = columns.drop(exclude)\n\n mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns)\n\n return cls(mgr)", "response": "Convert a structured or record ndarray to DataFrame."} {"instruction": "How would you explain what the following Python 3 function does\ndef to_records(self, index=True, convert_datetime64=None,\n column_dtypes=None, index_dtypes=None):\n \"\"\"\n Convert DataFrame to a NumPy record array.\n\n Index will be included as the first field of the record array if\n requested.\n\n Parameters\n ----------\n index : bool, default True\n Include index in resulting record array, stored in 'index'\n field or using the index label, if set.\n convert_datetime64 : bool, default None\n .. deprecated:: 0.23.0\n\n Whether to convert the index to datetime.datetime if it is a\n DatetimeIndex.\n column_dtypes : str, type, dict, default None\n .. versionadded:: 0.24.0\n\n If a string or type, the data type to store all columns. If\n a dictionary, a mapping of column names and indices (zero-indexed)\n to specific data types.\n index_dtypes : str, type, dict, default None\n .. versionadded:: 0.24.0\n\n If a string or type, the data type to store all index levels. 
If\n a dictionary, a mapping of index level names and indices\n (zero-indexed) to specific data types.\n\n This mapping is applied only if `index=True`.\n\n Returns\n -------\n numpy.recarray\n NumPy ndarray with the DataFrame labels as fields and each row\n of the DataFrame as entries.\n\n See Also\n --------\n DataFrame.from_records: Convert structured or record ndarray\n to DataFrame.\n numpy.recarray: An ndarray that allows field access using\n attributes, analogous to typed columns in a\n spreadsheet.\n\n Examples\n --------\n >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},\n ... index=['a', 'b'])\n >>> df\n A B\n a 1 0.50\n b 2 0.75\n >>> df.to_records()\n rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],\n dtype=[('index', 'O'), ('A', '>> df.index = df.index.rename(\"I\")\n >>> df.to_records()\n rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],\n dtype=[('I', 'O'), ('A', '>> df.to_records(index=False)\n rec.array([(1, 0.5 ), (2, 0.75)],\n dtype=[('A', '>> df.to_records(column_dtypes={\"A\": \"int32\"})\n rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],\n dtype=[('I', 'O'), ('A', '>> df.to_records(index_dtypes=\">> index_dtypes = \">> df.to_records(index_dtypes=index_dtypes)\n rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],\n dtype=[('I', 'S1'), ('A', '`\n instead.\n :meth:`DataFrame.from_dict(OrderedDict(items)) `\n may be used to preserve the key order.\n\n Convert (key, value) pairs to DataFrame. The keys will be the axis\n index (usually the columns, but depends on the specified\n orientation). The values should be arrays or Series.\n\n Parameters\n ----------\n items : sequence of (key, value) pairs\n Values should be arrays or Series.\n columns : sequence of column labels, optional\n Must be passed if orient='index'.\n orient : {'columns', 'index'}, default 'columns'\n The \"orientation\" of the data. If the keys of the\n input correspond to column labels, pass 'columns'\n (default). 
Otherwise if the keys correspond to the index,\n pass 'index'.\n\n Returns\n -------\n DataFrame\n \"\"\"\n\n warnings.warn(\"from_items is deprecated. Please use \"\n \"DataFrame.from_dict(dict(items), ...) instead. \"\n \"DataFrame.from_dict(OrderedDict(items)) may be used to \"\n \"preserve the key order.\",\n FutureWarning, stacklevel=2)\n\n keys, values = lzip(*items)\n\n if orient == 'columns':\n if columns is not None:\n columns = ensure_index(columns)\n\n idict = dict(items)\n if len(idict) < len(items):\n if not columns.equals(ensure_index(keys)):\n raise ValueError('With non-unique item names, passed '\n 'columns must be identical')\n arrays = values\n else:\n arrays = [idict[k] for k in columns if k in idict]\n else:\n columns = ensure_index(keys)\n arrays = values\n\n # GH 17312\n # Provide more informative error msg when scalar values passed\n try:\n return cls._from_arrays(arrays, columns, None)\n\n except ValueError:\n if not is_nested_list_like(values):\n raise ValueError('The value in each (key, value) pair '\n 'must be an array, Series, or dict')\n\n elif orient == 'index':\n if columns is None:\n raise TypeError(\"Must pass columns with orient='index'\")\n\n keys = ensure_index(keys)\n\n # GH 17312\n # Provide more informative error msg when scalar values passed\n try:\n arr = np.array(values, dtype=object).T\n data = [lib.maybe_convert_objects(v) for v in arr]\n return cls._from_arrays(data, columns, keys)\n\n except TypeError:\n if not is_nested_list_like(values):\n raise ValueError('The value in each (key, value) pair '\n 'must be an array, Series, or dict')\n\n else: # pragma: no cover\n raise ValueError(\"'orient' must be either 'columns' or 'index'\")"} {"instruction": "Explain what the following Python 3 code does\ndef from_csv(cls, path, header=0, sep=',', index_col=0, parse_dates=True,\n encoding=None, tupleize_cols=None,\n infer_datetime_format=False):\n \"\"\"\n Read CSV file.\n\n .. 
deprecated:: 0.21.0\n Use :func:`read_csv` instead.\n\n It is preferable to use the more powerful :func:`read_csv`\n for most general purposes, but ``from_csv`` makes for an easy\n roundtrip to and from a file (the exact counterpart of\n ``to_csv``), especially with a DataFrame of time series data.\n\n This method only differs from the preferred :func:`read_csv`\n in some defaults:\n\n - `index_col` is ``0`` instead of ``None`` (take first column as index\n by default)\n - `parse_dates` is ``True`` instead of ``False`` (try parsing the index\n as datetime by default)\n\n So a ``pd.DataFrame.from_csv(path)`` can be replaced by\n ``pd.read_csv(path, index_col=0, parse_dates=True)``.\n\n Parameters\n ----------\n path : string file path or file handle / StringIO\n header : int, default 0\n Row to use as header (skip prior rows)\n sep : string, default ','\n Field delimiter\n index_col : int or sequence, default 0\n Column to use for index. If a sequence is given, a MultiIndex\n is used. Different default from read_table\n parse_dates : boolean, default True\n Parse dates. Different default from read_table\n tupleize_cols : boolean, default False\n write multi_index columns as a list of tuples (if True)\n or new (expanded format) if False)\n infer_datetime_format : boolean, default False\n If True and `parse_dates` is True for a column, try to infer the\n datetime format based on the first datetime string. If the format\n can be inferred, there often will be a large parsing speed-up.\n\n Returns\n -------\n DataFrame\n\n See Also\n --------\n read_csv\n \"\"\"\n\n warnings.warn(\"from_csv is deprecated. Please use read_csv(...) \"\n \"instead. 
Note that some of the default arguments are \"\n \"different, so please refer to the documentation \"\n \"for from_csv when changing your function calls\",\n FutureWarning, stacklevel=2)\n\n from pandas.io.parsers import read_csv\n return read_csv(path, header=header, sep=sep,\n parse_dates=parse_dates, index_col=index_col,\n encoding=encoding, tupleize_cols=tupleize_cols,\n infer_datetime_format=infer_datetime_format)", "response": "Read a new entry from a CSV file."} {"instruction": "Create a Python 3 function for\nconverting the DataFrame to a sparse DataFrame.", "response": "def to_sparse(self, fill_value=None, kind='block'):\n \"\"\"\n Convert to SparseDataFrame.\n\n Implement the sparse version of the DataFrame meaning that any data\n matching a specific value it's omitted in the representation.\n The sparse DataFrame allows for a more efficient storage.\n\n Parameters\n ----------\n fill_value : float, default None\n The specific value that should be omitted in the representation.\n kind : {'block', 'integer'}, default 'block'\n The kind of the SparseIndex tracking where data is not equal to\n the fill value:\n\n - 'block' tracks only the locations and sizes of blocks of data.\n - 'integer' keeps an array with all the locations of the data.\n\n In most cases 'block' is recommended, since it's more memory\n efficient.\n\n Returns\n -------\n SparseDataFrame\n The sparse representation of the DataFrame.\n\n See Also\n --------\n DataFrame.to_dense :\n Converts the DataFrame back to the its dense form.\n\n Examples\n --------\n >>> df = pd.DataFrame([(np.nan, np.nan),\n ... (1., np.nan),\n ... 
(np.nan, 1.)])\n >>> df\n 0 1\n 0 NaN NaN\n 1 1.0 NaN\n 2 NaN 1.0\n >>> type(df)\n \n\n >>> sdf = df.to_sparse()\n >>> sdf\n 0 1\n 0 NaN NaN\n 1 1.0 NaN\n 2 NaN 1.0\n >>> type(sdf)\n \n \"\"\"\n from pandas.core.sparse.api import SparseDataFrame\n return SparseDataFrame(self._series, index=self.index,\n columns=self.columns, default_kind=kind,\n default_fill_value=fill_value)"} {"instruction": "Explain what the following Python 3 code does\ndef to_stata(self, fname, convert_dates=None, write_index=True,\n encoding=\"latin-1\", byteorder=None, time_stamp=None,\n data_label=None, variable_labels=None, version=114,\n convert_strl=None):\n \"\"\"\n Export DataFrame object to Stata dta format.\n\n Writes the DataFrame to a Stata dataset file.\n \"dta\" files contain a Stata dataset.\n\n Parameters\n ----------\n fname : str, buffer or path object\n String, path object (pathlib.Path or py._path.local.LocalPath) or\n object implementing a binary write() function. If using a buffer\n then the buffer will not be automatically closed after the file\n data has been written.\n convert_dates : dict\n Dictionary mapping columns containing datetime types to stata\n internal format to use when writing the dates. Options are 'tc',\n 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer\n or a name. Datetime columns that do not have a conversion type\n specified will be converted to 'tc'. Raises NotImplementedError if\n a datetime column has timezone information.\n write_index : bool\n Write the index to Stata dataset.\n encoding : str\n Default is latin-1. Unicode is not supported.\n byteorder : str\n Can be \">\", \"<\", \"little\", or \"big\". default is `sys.byteorder`.\n time_stamp : datetime\n A datetime to use as file creation date. Default is the current\n time.\n data_label : str, optional\n A label for the data set. Must be 80 characters or smaller.\n variable_labels : dict\n Dictionary containing columns as keys and variable labels as\n values. 
Each label must be 80 characters or smaller.\n\n .. versionadded:: 0.19.0\n\n version : {114, 117}, default 114\n Version to use in the output dta file. Version 114 can be used\n read by Stata 10 and later. Version 117 can be read by Stata 13\n or later. Version 114 limits string variables to 244 characters or\n fewer while 117 allows strings with lengths up to 2,000,000\n characters.\n\n .. versionadded:: 0.23.0\n\n convert_strl : list, optional\n List of column names to convert to string columns to Stata StrL\n format. Only available if version is 117. Storing strings in the\n StrL format can produce smaller dta files if strings have more than\n 8 characters and values are repeated.\n\n .. versionadded:: 0.23.0\n\n Raises\n ------\n NotImplementedError\n * If datetimes contain timezone information\n * Column dtype is not representable in Stata\n ValueError\n * Columns listed in convert_dates are neither datetime64[ns]\n or datetime.datetime\n * Column listed in convert_dates is not in DataFrame\n * Categorical label contains more than 32,000 characters\n\n .. versionadded:: 0.19.0\n\n See Also\n --------\n read_stata : Import Stata data files.\n io.stata.StataWriter : Low-level writer for Stata data files.\n io.stata.StataWriter117 : Low-level writer for version 117 files.\n\n Examples\n --------\n >>> df = pd.DataFrame({'animal': ['falcon', 'parrot', 'falcon',\n ... 'parrot'],\n ... 
'speed': [350, 18, 361, 15]})\n >>> df.to_stata('animals.dta') # doctest: +SKIP\n \"\"\"\n kwargs = {}\n if version not in (114, 117):\n raise ValueError('Only formats 114 and 117 supported.')\n if version == 114:\n if convert_strl is not None:\n raise ValueError('strl support is only available when using '\n 'format 117')\n from pandas.io.stata import StataWriter as statawriter\n else:\n from pandas.io.stata import StataWriter117 as statawriter\n kwargs['convert_strl'] = convert_strl\n\n writer = statawriter(fname, self, convert_dates=convert_dates,\n byteorder=byteorder, time_stamp=time_stamp,\n data_label=data_label, write_index=write_index,\n variable_labels=variable_labels, **kwargs)\n writer.write_file()", "response": "Exports the DataFrame object to a Stata dataset file."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef to_feather(self, fname):\n from pandas.io.feather_format import to_feather\n to_feather(self, fname)", "response": "Write out the binary feather - format for DataFrames."} {"instruction": "How would you implement a function in Python 3 that\nwrites a DataFrame to a parquet file.", "response": "def to_parquet(self, fname, engine='auto', compression='snappy',\n index=None, partition_cols=None, **kwargs):\n \"\"\"\n Write a DataFrame to the binary parquet format.\n\n .. versionadded:: 0.21.0\n\n This function writes the dataframe as a `parquet file\n `_. You can choose different parquet\n backends, and have the option of compression. See\n :ref:`the user guide ` for more details.\n\n Parameters\n ----------\n fname : str\n File path or Root Directory path. Will be used as Root Directory\n path while writing a partitioned dataset.\n\n .. versionchanged:: 0.24.0\n\n engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'\n Parquet library to use. If 'auto', then the option\n ``io.parquet.engine`` is used. 
The default ``io.parquet.engine``\n behavior is to try 'pyarrow', falling back to 'fastparquet' if\n 'pyarrow' is unavailable.\n compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'\n Name of the compression to use. Use ``None`` for no compression.\n index : bool, default None\n If ``True``, include the dataframe's index(es) in the file output.\n If ``False``, they will not be written to the file. If ``None``,\n the behavior depends on the chosen engine.\n\n .. versionadded:: 0.24.0\n\n partition_cols : list, optional, default None\n Column names by which to partition the dataset\n Columns are partitioned in the order they are given\n\n .. versionadded:: 0.24.0\n\n **kwargs\n Additional arguments passed to the parquet library. See\n :ref:`pandas io ` for more details.\n\n See Also\n --------\n read_parquet : Read a parquet file.\n DataFrame.to_csv : Write a csv file.\n DataFrame.to_sql : Write to a sql table.\n DataFrame.to_hdf : Write to hdf.\n\n Notes\n -----\n This function requires either the `fastparquet\n `_ or `pyarrow\n `_ library.\n\n Examples\n --------\n >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})\n >>> df.to_parquet('df.parquet.gzip',\n ... 
compression='gzip') # doctest: +SKIP\n >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP\n col1 col2\n 0 1 3\n 1 2 4\n \"\"\"\n from pandas.io.parquet import to_parquet\n to_parquet(self, fname, engine,\n compression=compression, index=index,\n partition_cols=partition_cols, **kwargs)"} {"instruction": "Create a Python 3 function to\nrender a DataFrame as an HTML table.", "response": "def to_html(self, buf=None, columns=None, col_space=None, header=True,\n index=True, na_rep='NaN', formatters=None, float_format=None,\n sparsify=None, index_names=True, justify=None, max_rows=None,\n max_cols=None, show_dimensions=False, decimal='.',\n bold_rows=True, classes=None, escape=True, notebook=False,\n border=None, table_id=None, render_links=False):\n \"\"\"\n Render a DataFrame as an HTML table.\n %(shared_params)s\n bold_rows : bool, default True\n Make the row labels bold in the output.\n classes : str or list or tuple, default None\n CSS class(es) to apply to the resulting html table.\n escape : bool, default True\n Convert the characters <, >, and & to HTML-safe sequences.\n notebook : {True, False}, default False\n Whether the generated HTML is for IPython Notebook.\n border : int\n A ``border=border`` attribute is included in the opening\n `` tag. Default ``pd.options.html.border``.\n\n .. versionadded:: 0.19.0\n\n table_id : str, optional\n A css id is included in the opening `
` tag if specified.\n\n .. versionadded:: 0.23.0\n\n render_links : bool, default False\n Convert URLs to HTML links.\n\n .. versionadded:: 0.24.0\n %(returns)s\n See Also\n --------\n to_string : Convert DataFrame to a string.\n \"\"\"\n\n if (justify is not None and\n justify not in fmt._VALID_JUSTIFY_PARAMETERS):\n raise ValueError(\"Invalid value for justify parameter\")\n\n formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns,\n col_space=col_space, na_rep=na_rep,\n formatters=formatters,\n float_format=float_format,\n sparsify=sparsify, justify=justify,\n index_names=index_names,\n header=header, index=index,\n bold_rows=bold_rows, escape=escape,\n max_rows=max_rows,\n max_cols=max_cols,\n show_dimensions=show_dimensions,\n decimal=decimal, table_id=table_id,\n render_links=render_links)\n # TODO: a generic formatter wld b in DataFrameFormatter\n formatter.to_html(classes=classes, notebook=notebook, border=border)\n\n if buf is None:\n return formatter.buf.getvalue()"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef info(self, verbose=None, buf=None, max_cols=None, memory_usage=None,\n null_counts=None):\n \"\"\"\n Print a concise summary of a DataFrame.\n\n This method prints information about a DataFrame including\n the index dtype and column dtypes, non-null values and memory usage.\n\n Parameters\n ----------\n verbose : bool, optional\n Whether to print the full summary. By default, the setting in\n ``pandas.options.display.max_info_columns`` is followed.\n buf : writable buffer, defaults to sys.stdout\n Where to send the output. By default, the output is printed to\n sys.stdout. Pass a writable buffer if you need to further process\n the output.\n max_cols : int, optional\n When to switch from the verbose to the truncated output. If the\n DataFrame has more than `max_cols` columns, the truncated output\n is used. 
By default, the setting in\n ``pandas.options.display.max_info_columns`` is used.\n memory_usage : bool, str, optional\n Specifies whether total memory usage of the DataFrame\n elements (including the index) should be displayed. By default,\n this follows the ``pandas.options.display.memory_usage`` setting.\n\n True always show memory usage. False never shows memory usage.\n A value of 'deep' is equivalent to \"True with deep introspection\".\n Memory usage is shown in human-readable units (base-2\n representation). Without deep introspection a memory estimation is\n made based in column dtype and number of rows assuming values\n consume the same memory amount for corresponding dtypes. With deep\n memory introspection, a real memory usage calculation is performed\n at the cost of computational resources.\n null_counts : bool, optional\n Whether to show the non-null counts. By default, this is shown\n only if the frame is smaller than\n ``pandas.options.display.max_info_rows`` and\n ``pandas.options.display.max_info_columns``. A value of True always\n shows the counts, and False never shows the counts.\n\n Returns\n -------\n None\n This method prints a summary of a DataFrame and returns None.\n\n See Also\n --------\n DataFrame.describe: Generate descriptive statistics of DataFrame\n columns.\n DataFrame.memory_usage: Memory usage of DataFrame columns.\n\n Examples\n --------\n >>> int_values = [1, 2, 3, 4, 5]\n >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']\n >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]\n >>> df = pd.DataFrame({\"int_col\": int_values, \"text_col\": text_values,\n ... 
\"float_col\": float_values})\n >>> df\n int_col text_col float_col\n 0 1 alpha 0.00\n 1 2 beta 0.25\n 2 3 gamma 0.50\n 3 4 delta 0.75\n 4 5 epsilon 1.00\n\n Prints information of all columns:\n\n >>> df.info(verbose=True)\n \n RangeIndex: 5 entries, 0 to 4\n Data columns (total 3 columns):\n int_col 5 non-null int64\n text_col 5 non-null object\n float_col 5 non-null float64\n dtypes: float64(1), int64(1), object(1)\n memory usage: 200.0+ bytes\n\n Prints a summary of columns count and its dtypes but not per column\n information:\n\n >>> df.info(verbose=False)\n \n RangeIndex: 5 entries, 0 to 4\n Columns: 3 entries, int_col to float_col\n dtypes: float64(1), int64(1), object(1)\n memory usage: 200.0+ bytes\n\n Pipe output of DataFrame.info to buffer instead of sys.stdout, get\n buffer content and writes to a text file:\n\n >>> import io\n >>> buffer = io.StringIO()\n >>> df.info(buf=buffer)\n >>> s = buffer.getvalue()\n >>> with open(\"df_info.txt\", \"w\",\n ... encoding=\"utf-8\") as f: # doctest: +SKIP\n ... f.write(s)\n 260\n\n The `memory_usage` parameter allows deep introspection mode, specially\n useful for big DataFrames and fine-tune memory optimization:\n\n >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)\n >>> df = pd.DataFrame({\n ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),\n ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),\n ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)\n ... 
})\n >>> df.info()\n \n RangeIndex: 1000000 entries, 0 to 999999\n Data columns (total 3 columns):\n column_1 1000000 non-null object\n column_2 1000000 non-null object\n column_3 1000000 non-null object\n dtypes: object(3)\n memory usage: 22.9+ MB\n\n >>> df.info(memory_usage='deep')\n \n RangeIndex: 1000000 entries, 0 to 999999\n Data columns (total 3 columns):\n column_1 1000000 non-null object\n column_2 1000000 non-null object\n column_3 1000000 non-null object\n dtypes: object(3)\n memory usage: 188.8 MB\n \"\"\"\n\n if buf is None: # pragma: no cover\n buf = sys.stdout\n\n lines = []\n\n lines.append(str(type(self)))\n lines.append(self.index._summary())\n\n if len(self.columns) == 0:\n lines.append('Empty {name}'.format(name=type(self).__name__))\n fmt.buffer_put_lines(buf, lines)\n return\n\n cols = self.columns\n\n # hack\n if max_cols is None:\n max_cols = get_option('display.max_info_columns',\n len(self.columns) + 1)\n\n max_rows = get_option('display.max_info_rows', len(self) + 1)\n\n if null_counts is None:\n show_counts = ((len(self.columns) <= max_cols) and\n (len(self) < max_rows))\n else:\n show_counts = null_counts\n exceeds_info_cols = len(self.columns) > max_cols\n\n def _verbose_repr():\n lines.append('Data columns (total %d columns):' %\n len(self.columns))\n space = max(len(pprint_thing(k)) for k in self.columns) + 4\n counts = None\n\n tmpl = \"{count}{dtype}\"\n if show_counts:\n counts = self.count()\n if len(cols) != len(counts): # pragma: no cover\n raise AssertionError(\n 'Columns must equal counts '\n '({cols:d} != {counts:d})'.format(\n cols=len(cols), counts=len(counts)))\n tmpl = \"{count} non-null {dtype}\"\n\n dtypes = self.dtypes\n for i, col in enumerate(self.columns):\n dtype = dtypes.iloc[i]\n col = pprint_thing(col)\n\n count = \"\"\n if show_counts:\n count = counts.iloc[i]\n\n lines.append(_put_str(col, space) + tmpl.format(count=count,\n dtype=dtype))\n\n def _non_verbose_repr():\n 
lines.append(self.columns._summary(name='Columns'))\n\n def _sizeof_fmt(num, size_qualifier):\n # returns size in human readable format\n for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:\n if num < 1024.0:\n return (\"{num:3.1f}{size_q} \"\n \"{x}\".format(num=num, size_q=size_qualifier, x=x))\n num /= 1024.0\n return \"{num:3.1f}{size_q} {pb}\".format(num=num,\n size_q=size_qualifier,\n pb='PB')\n\n if verbose:\n _verbose_repr()\n elif verbose is False: # specifically set to False, not nesc None\n _non_verbose_repr()\n else:\n if exceeds_info_cols:\n _non_verbose_repr()\n else:\n _verbose_repr()\n\n counts = self.get_dtype_counts()\n dtypes = ['{k}({kk:d})'.format(k=k[0], kk=k[1]) for k\n in sorted(counts.items())]\n lines.append('dtypes: {types}'.format(types=', '.join(dtypes)))\n\n if memory_usage is None:\n memory_usage = get_option('display.memory_usage')\n if memory_usage:\n # append memory usage of df to display\n size_qualifier = ''\n if memory_usage == 'deep':\n deep = True\n else:\n # size_qualifier is just a best effort; not guaranteed to catch\n # all cases (e.g., it misses categorical data even with object\n # categories)\n deep = False\n if ('object' in counts or\n self.index._is_memory_usage_qualified()):\n size_qualifier = '+'\n mem_usage = self.memory_usage(index=True, deep=deep).sum()\n lines.append(\"memory usage: {mem}\\n\".format(\n mem=_sizeof_fmt(mem_usage, size_qualifier)))\n\n fmt.buffer_put_lines(buf, lines)", "response": "Print a concise summary of a DataFrame."} {"instruction": "Can you write a function in Python 3 where it\nreturns the memory usage of each column in bytes.", "response": "def memory_usage(self, index=True, deep=False):\n \"\"\"\n Return the memory usage of each column in bytes.\n\n The memory usage can optionally include the contribution of\n the index and elements of `object` dtype.\n\n This value is displayed in `DataFrame.info` by default. 
This can be\n suppressed by setting ``pandas.options.display.memory_usage`` to False.\n\n Parameters\n ----------\n index : bool, default True\n Specifies whether to include the memory usage of the DataFrame's\n index in returned Series. If ``index=True``, the memory usage of\n the index is the first item in the output.\n deep : bool, default False\n If True, introspect the data deeply by interrogating\n `object` dtypes for system-level memory consumption, and include\n it in the returned values.\n\n Returns\n -------\n Series\n A Series whose index is the original column names and whose values\n is the memory usage of each column in bytes.\n\n See Also\n --------\n numpy.ndarray.nbytes : Total bytes consumed by the elements of an\n ndarray.\n Series.memory_usage : Bytes consumed by a Series.\n Categorical : Memory-efficient array for string values with\n many repeated values.\n DataFrame.info : Concise summary of a DataFrame.\n\n Examples\n --------\n >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']\n >>> data = dict([(t, np.ones(shape=5000).astype(t))\n ... 
for t in dtypes])\n >>> df = pd.DataFrame(data)\n >>> df.head()\n int64 float64 complex128 object bool\n 0 1 1.0 1.0+0.0j 1 True\n 1 1 1.0 1.0+0.0j 1 True\n 2 1 1.0 1.0+0.0j 1 True\n 3 1 1.0 1.0+0.0j 1 True\n 4 1 1.0 1.0+0.0j 1 True\n\n >>> df.memory_usage()\n Index 80\n int64 40000\n float64 40000\n complex128 80000\n object 40000\n bool 5000\n dtype: int64\n\n >>> df.memory_usage(index=False)\n int64 40000\n float64 40000\n complex128 80000\n object 40000\n bool 5000\n dtype: int64\n\n The memory footprint of `object` dtype columns is ignored by default:\n\n >>> df.memory_usage(deep=True)\n Index 80\n int64 40000\n float64 40000\n complex128 80000\n object 160000\n bool 5000\n dtype: int64\n\n Use a Categorical for efficient storage of an object-dtype column with\n many repeated values.\n\n >>> df['object'].astype('category').memory_usage(deep=True)\n 5168\n \"\"\"\n result = Series([c.memory_usage(index=False, deep=deep)\n for col, c in self.iteritems()], index=self.columns)\n if index:\n result = Series(self.index.memory_usage(deep=deep),\n index=['Index']).append(result)\n return result"} {"instruction": "How would you explain what the following Python 3 function does\ndef transpose(self, *args, **kwargs):\n nv.validate_transpose(args, dict())\n return super().transpose(1, 0, **kwargs)", "response": "Transposes the DataFrame over its main diagonal by writing rows as columns and vice - versa."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef get_value(self, index, col, takeable=False):\n\n warnings.warn(\"get_value is deprecated and will be removed \"\n \"in a future release. 
Please use \"\n \".at[] or .iat[] accessors instead\", FutureWarning,\n stacklevel=2)\n return self._get_value(index, col, takeable=takeable)", "response": "Quickly retrieve a value at passed column and index."} {"instruction": "Create a Python 3 function for\nsetting single value at passed column and index.", "response": "def set_value(self, index, col, value, takeable=False):\n \"\"\"\n Put single value at passed column and index.\n\n .. deprecated:: 0.21.0\n Use .at[] or .iat[] accessors instead.\n\n Parameters\n ----------\n index : row label\n col : column label\n value : scalar\n takeable : interpret the index/col as indexers, default False\n\n Returns\n -------\n DataFrame\n If label pair is contained, will be reference to calling DataFrame,\n otherwise a new object.\n \"\"\"\n warnings.warn(\"set_value is deprecated and will be removed \"\n \"in a future release. Please use \"\n \".at[] or .iat[] accessors instead\", FutureWarning,\n stacklevel=2)\n return self._set_value(index, col, value, takeable=takeable)"} {"instruction": "Create a Python 3 function to\nreturn a copy of the array i with the data i. e. 
a copy of the array.", "response": "def _ixs(self, i, axis=0):\n \"\"\"\n Parameters\n ----------\n i : int, slice, or sequence of integers\n axis : int\n\n Notes\n -----\n If slice passed, the resulting data will be a view.\n \"\"\"\n # irow\n if axis == 0:\n if isinstance(i, slice):\n return self[i]\n else:\n label = self.index[i]\n if isinstance(label, Index):\n # a location index by definition\n result = self.take(i, axis=axis)\n copy = True\n else:\n new_values = self._data.fast_xs(i)\n if is_scalar(new_values):\n return new_values\n\n # if we are a copy, mark as such\n copy = (isinstance(new_values, np.ndarray) and\n new_values.base is None)\n result = self._constructor_sliced(new_values,\n index=self.columns,\n name=self.index[i],\n dtype=new_values.dtype)\n result._set_is_copy(self, copy=copy)\n return result\n\n # icol\n else:\n label = self.columns[i]\n if isinstance(i, slice):\n # need to return view\n lab_slice = slice(label[0], label[-1])\n return self.loc[:, lab_slice]\n else:\n if isinstance(label, Index):\n return self._take(i, axis=1)\n\n index_len = len(self.index)\n\n # if the values returned are not the same length\n # as the index (iow a not found value), iget returns\n # a 0-len ndarray. This is effectively catching\n # a numpy error (as numpy should really raise)\n values = self._data.iget(i)\n\n if index_len and not len(values):\n values = np.array([np.nan] * index_len, dtype=object)\n result = self._box_col_values(values, label)\n\n # this is a cached value, mark it so\n result._set_as_cached(label, self)\n\n return result"} {"instruction": "Create a Python 3 function to\nquery the columns of a DataFrame with a boolean expression.", "response": "def query(self, expr, inplace=False, **kwargs):\n \"\"\"\n Query the columns of a DataFrame with a boolean expression.\n\n Parameters\n ----------\n expr : str\n The query string to evaluate. 
You can refer to variables\n in the environment by prefixing them with an '@' character like\n ``@a + b``.\n\n .. versionadded:: 0.25.0\n\n You can refer to column names that contain spaces by surrounding\n them in backticks.\n\n For example, if one of your columns is called ``a a`` and you want\n to sum it with ``b``, your query should be ```a a` + b``.\n\n inplace : bool\n Whether the query should modify the data in place or return\n a modified copy.\n **kwargs\n See the documentation for :func:`eval` for complete details\n on the keyword arguments accepted by :meth:`DataFrame.query`.\n\n .. versionadded:: 0.18.0\n\n Returns\n -------\n DataFrame\n DataFrame resulting from the provided query expression.\n\n See Also\n --------\n eval : Evaluate a string describing operations on\n DataFrame columns.\n DataFrame.eval : Evaluate a string describing operations on\n DataFrame columns.\n\n Notes\n -----\n The result of the evaluation of this expression is first passed to\n :attr:`DataFrame.loc` and if that fails because of a\n multidimensional key (e.g., a DataFrame) then the result will be passed\n to :meth:`DataFrame.__getitem__`.\n\n This method uses the top-level :func:`eval` function to\n evaluate the passed query.\n\n The :meth:`~pandas.DataFrame.query` method uses a slightly\n modified Python syntax by default. For example, the ``&`` and ``|``\n (bitwise) operators have the precedence of their boolean cousins,\n :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python,\n however the semantics are different.\n\n You can change the semantics of the expression by passing the keyword\n argument ``parser='python'``. This enforces the same semantics as\n evaluation in Python space. Likewise, you can pass ``engine='python'``\n to evaluate an expression using Python itself as a backend. 
This is not\n recommended as it is inefficient compared to using ``numexpr`` as the\n engine.\n\n The :attr:`DataFrame.index` and\n :attr:`DataFrame.columns` attributes of the\n :class:`~pandas.DataFrame` instance are placed in the query namespace\n by default, which allows you to treat both the index and columns of the\n frame as a column in the frame.\n The identifier ``index`` is used for the frame index; you can also\n use the name of the index to identify it in a query. Please note that\n Python keywords may not be used as identifiers.\n\n For further details and examples see the ``query`` documentation in\n :ref:`indexing `.\n\n Examples\n --------\n >>> df = pd.DataFrame({'A': range(1, 6),\n ... 'B': range(10, 0, -2),\n ... 'C C': range(10, 5, -1)})\n >>> df\n A B C C\n 0 1 10 10\n 1 2 8 9\n 2 3 6 8\n 3 4 4 7\n 4 5 2 6\n >>> df.query('A > B')\n A B C C\n 4 5 2 6\n\n The previous expression is equivalent to\n\n >>> df[df.A > df.B]\n A B C C\n 4 5 2 6\n\n For columns with spaces in their name, you can use backtick quoting.\n\n >>> df.query('B == `C C`')\n A B C C\n 0 1 10 10\n\n The previous expression is equivalent to\n\n >>> df[df.B == df['C C']]\n A B C C\n 0 1 10 10\n \"\"\"\n inplace = validate_bool_kwarg(inplace, 'inplace')\n if not isinstance(expr, str):\n msg = \"expr must be a string to be evaluated, {0} given\"\n raise ValueError(msg.format(type(expr)))\n kwargs['level'] = kwargs.pop('level', 0) + 1\n kwargs['target'] = None\n res = self.eval(expr, **kwargs)\n\n try:\n new_data = self.loc[res]\n except ValueError:\n # when res is multi-dimensional loc raises, but this is sometimes a\n # valid query\n new_data = self[res]\n\n if inplace:\n self._update_inplace(new_data)\n else:\n return new_data"} {"instruction": "Write a Python 3 function that can\nevaluate a Python expression string and return a DataFrame with the result.", "response": "def eval(self, expr, inplace=False, **kwargs):\n \"\"\"\n Evaluate a string describing operations on DataFrame 
columns.\n\n Operates on columns only, not specific rows or elements. This allows\n `eval` to run arbitrary code, which can make you vulnerable to code\n injection if you pass user input to this function.\n\n Parameters\n ----------\n expr : str\n The expression string to evaluate.\n inplace : bool, default False\n If the expression contains an assignment, whether to perform the\n operation inplace and mutate the existing DataFrame. Otherwise,\n a new DataFrame is returned.\n\n .. versionadded:: 0.18.0.\n kwargs : dict\n See the documentation for :func:`eval` for complete details\n on the keyword arguments accepted by\n :meth:`~pandas.DataFrame.query`.\n\n Returns\n -------\n ndarray, scalar, or pandas object\n The result of the evaluation.\n\n See Also\n --------\n DataFrame.query : Evaluates a boolean expression to query the columns\n of a frame.\n DataFrame.assign : Can evaluate an expression or function to create new\n values for a column.\n eval : Evaluate a Python expression as a string using various\n backends.\n\n Notes\n -----\n For more details see the API documentation for :func:`~eval`.\n For detailed examples see :ref:`enhancing performance with eval\n `.\n\n Examples\n --------\n >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})\n >>> df\n A B\n 0 1 10\n 1 2 8\n 2 3 6\n 3 4 4\n 4 5 2\n >>> df.eval('A + B')\n 0 11\n 1 10\n 2 9\n 3 8\n 4 7\n dtype: int64\n\n Assignment is allowed though by default the original DataFrame is not\n modified.\n\n >>> df.eval('C = A + B')\n A B C\n 0 1 10 11\n 1 2 8 10\n 2 3 6 9\n 3 4 4 8\n 4 5 2 7\n >>> df\n A B\n 0 1 10\n 1 2 8\n 2 3 6\n 3 4 4\n 4 5 2\n\n Use ``inplace=True`` to modify the original DataFrame.\n\n >>> df.eval('C = A + B', inplace=True)\n >>> df\n A B C\n 0 1 10 11\n 1 2 8 10\n 2 3 6 9\n 3 4 4 8\n 4 5 2 7\n \"\"\"\n from pandas.core.computation.eval import eval as _eval\n\n inplace = validate_bool_kwarg(inplace, 'inplace')\n resolvers = kwargs.pop('resolvers', None)\n kwargs['level'] = 
kwargs.pop('level', 0) + 1\n if resolvers is None:\n index_resolvers = self._get_index_resolvers()\n column_resolvers = \\\n self._get_space_character_free_column_resolvers()\n resolvers = column_resolvers, index_resolvers\n if 'target' not in kwargs:\n kwargs['target'] = self\n kwargs['resolvers'] = kwargs.get('resolvers', ()) + tuple(resolvers)\n return _eval(expr, inplace=inplace, **kwargs)"} {"instruction": "Write a Python 3 function that can\nreturn a subset of the DataFrame s columns based on the column dtypes in include and exclude.", "response": "def select_dtypes(self, include=None, exclude=None):\n \"\"\"\n Return a subset of the DataFrame's columns based on the column dtypes.\n\n Parameters\n ----------\n include, exclude : scalar or list-like\n A selection of dtypes or strings to be included/excluded. At least\n one of these parameters must be supplied.\n\n Returns\n -------\n DataFrame\n The subset of the frame including the dtypes in ``include`` and\n excluding the dtypes in ``exclude``.\n\n Raises\n ------\n ValueError\n * If both of ``include`` and ``exclude`` are empty\n * If ``include`` and ``exclude`` have overlapping elements\n * If any kind of string dtype is passed in.\n\n Notes\n -----\n * To select all *numeric* types, use ``np.number`` or ``'number'``\n * To select strings you must use the ``object`` dtype, but note that\n this will return *all* object dtype columns\n * See the `numpy dtype hierarchy\n `__\n * To select datetimes, use ``np.datetime64``, ``'datetime'`` or\n ``'datetime64'``\n * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or\n ``'timedelta64'``\n * To select Pandas categorical dtypes, use ``'category'``\n * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in\n 0.20.0) or ``'datetime64[ns, tz]'``\n\n Examples\n --------\n >>> df = pd.DataFrame({'a': [1, 2] * 3,\n ... 'b': [True, False] * 3,\n ... 
'c': [1.0, 2.0] * 3})\n >>> df\n a b c\n 0 1 True 1.0\n 1 2 False 2.0\n 2 1 True 1.0\n 3 2 False 2.0\n 4 1 True 1.0\n 5 2 False 2.0\n\n >>> df.select_dtypes(include='bool')\n b\n 0 True\n 1 False\n 2 True\n 3 False\n 4 True\n 5 False\n\n >>> df.select_dtypes(include=['float64'])\n c\n 0 1.0\n 1 2.0\n 2 1.0\n 3 2.0\n 4 1.0\n 5 2.0\n\n >>> df.select_dtypes(exclude=['int'])\n b c\n 0 True 1.0\n 1 False 2.0\n 2 True 1.0\n 3 False 2.0\n 4 True 1.0\n 5 False 2.0\n \"\"\"\n def _get_info_slice(obj, indexer):\n \"\"\"Slice the info axis of `obj` with `indexer`.\"\"\"\n if not hasattr(obj, '_info_axis_number'):\n msg = 'object of type {typ!r} has no info axis'\n raise TypeError(msg.format(typ=type(obj).__name__))\n slices = [slice(None)] * obj.ndim\n slices[obj._info_axis_number] = indexer\n return tuple(slices)\n\n if not is_list_like(include):\n include = (include,) if include is not None else ()\n if not is_list_like(exclude):\n exclude = (exclude,) if exclude is not None else ()\n\n selection = tuple(map(frozenset, (include, exclude)))\n\n if not any(selection):\n raise ValueError('at least one of include or exclude must be '\n 'nonempty')\n\n # convert the myriad valid dtypes object to a single representation\n include, exclude = map(\n lambda x: frozenset(map(infer_dtype_from_object, x)), selection)\n for dtypes in (include, exclude):\n invalidate_string_dtypes(dtypes)\n\n # can't both include AND exclude!\n if not include.isdisjoint(exclude):\n raise ValueError('include and exclude overlap on {inc_ex}'.format(\n inc_ex=(include & exclude)))\n\n # empty include/exclude -> defaults to True\n # three cases (we've already raised if both are empty)\n # case 1: empty include, nonempty exclude\n # we have True, True, ... 
True for include, same for exclude\n # in the loop below we get the excluded\n # and when we call '&' below we get only the excluded\n # case 2: nonempty include, empty exclude\n # same as case 1, but with include\n # case 3: both nonempty\n # the \"union\" of the logic of case 1 and case 2:\n # we get the included and excluded, and return their logical and\n include_these = Series(not bool(include), index=self.columns)\n exclude_these = Series(not bool(exclude), index=self.columns)\n\n def is_dtype_instance_mapper(idx, dtype):\n return idx, functools.partial(issubclass, dtype.type)\n\n for idx, f in itertools.starmap(is_dtype_instance_mapper,\n enumerate(self.dtypes)):\n if include: # checks for the case of empty include or exclude\n include_these.iloc[idx] = any(map(f, include))\n if exclude:\n exclude_these.iloc[idx] = not any(map(f, exclude))\n\n dtype_indexer = include_these & exclude_these\n return self.loc[_get_info_slice(self, dtype_indexer)]"} {"instruction": "Can you write a function in Python 3 where it\nprovides boxed values for a column.", "response": "def _box_col_values(self, values, items):\n \"\"\"\n Provide boxed values for a column.\n \"\"\"\n klass = self._constructor_sliced\n return klass(values, index=self.index, name=items, fastpath=True)"} {"instruction": "Explain what the following Python 3 code does\ndef _ensure_valid_index(self, value):\n # GH5632, make sure that we are a Series convertible\n if not len(self.index) and is_list_like(value):\n try:\n value = Series(value)\n except (ValueError, NotImplementedError, TypeError):\n raise ValueError('Cannot set a frame with no defined index '\n 'and a value that cannot be converted to a '\n 'Series')\n\n self._data = self._data.reindex_axis(value.index.copy(), axis=1,\n fill_value=np.nan)", "response": "Ensure that we have an index and that we can create one from the passed value."} {"instruction": "Can you create a Python 3 function that\nadds series to DataFrame in specified column.", 
"response": "def _set_item(self, key, value):\n \"\"\"\n Add series to DataFrame in specified column.\n\n If series is a numpy-array (not a Series/TimeSeries), it must be the\n same length as the DataFrames index or an error will be thrown.\n\n Series/TimeSeries will be conformed to the DataFrames index to\n ensure homogeneity.\n \"\"\"\n\n self._ensure_valid_index(value)\n value = self._sanitize_column(key, value)\n NDFrame._set_item(self, key, value)\n\n # check if we are modifying a copy\n # try to set first as we want an invalid\n # value exception to occur first\n if len(self):\n self._check_setitem_copy()"} {"instruction": "Given the following Python 3 function, write the documentation\ndef insert(self, loc, column, value, allow_duplicates=False):\n self._ensure_valid_index(value)\n value = self._sanitize_column(column, value, broadcast=False)\n self._data.insert(loc, column, value,\n allow_duplicates=allow_duplicates)", "response": "Inserts a column into the DataFrame at the specified location."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef assign(self, **kwargs):\n data = self.copy()\n\n # >= 3.6 preserve order of kwargs\n if PY36:\n for k, v in kwargs.items():\n data[k] = com.apply_if_callable(v, data)\n else:\n # <= 3.5: do all calculations first...\n results = OrderedDict()\n for k, v in kwargs.items():\n results[k] = com.apply_if_callable(v, data)\n\n # <= 3.5 and earlier\n results = sorted(results.items())\n # ... 
and then assign\n for k, v in results:\n data[k] = v\n return data", "response": "r Assigns new columns to a DataFrame."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _sanitize_column(self, key, value, broadcast=True):\n\n def reindexer(value):\n # reindex if necessary\n\n if value.index.equals(self.index) or not len(self.index):\n value = value._values.copy()\n else:\n\n # GH 4107\n try:\n value = value.reindex(self.index)._values\n except Exception as e:\n\n # duplicate axis\n if not value.index.is_unique:\n raise e\n\n # other\n raise TypeError('incompatible index of inserted column '\n 'with frame index')\n return value\n\n if isinstance(value, Series):\n value = reindexer(value)\n\n elif isinstance(value, DataFrame):\n # align right-hand-side columns if self.columns\n # is multi-index and self[key] is a sub-frame\n if isinstance(self.columns, MultiIndex) and key in self.columns:\n loc = self.columns.get_loc(key)\n if isinstance(loc, (slice, Series, np.ndarray, Index)):\n cols = maybe_droplevels(self.columns[loc], key)\n if len(cols) and not cols.equals(value.columns):\n value = value.reindex(cols, axis=1)\n # now align rows\n value = reindexer(value).T\n\n elif isinstance(value, ExtensionArray):\n # Explicitly copy here, instead of in sanitize_index,\n # as sanitize_index won't copy an EA, even with copy=True\n value = value.copy()\n value = sanitize_index(value, self.index, copy=False)\n\n elif isinstance(value, Index) or is_sequence(value):\n\n # turn me into an ndarray\n value = sanitize_index(value, self.index, copy=False)\n if not isinstance(value, (np.ndarray, Index)):\n if isinstance(value, list) and len(value) > 0:\n value = maybe_convert_platform(value)\n else:\n value = com.asarray_tuplesafe(value)\n elif value.ndim == 2:\n value = value.copy().T\n elif isinstance(value, Index):\n value = value.copy(deep=True)\n else:\n value = value.copy()\n\n # possibly infer to datetimelike\n if 
is_object_dtype(value.dtype):\n value = maybe_infer_to_datetimelike(value)\n\n else:\n # cast ignores pandas dtypes. so save the dtype first\n infer_dtype, _ = infer_dtype_from_scalar(\n value, pandas_dtype=True)\n\n # upcast\n value = cast_scalar_to_array(len(self.index), value)\n value = maybe_cast_to_datetime(value, infer_dtype)\n\n # return internal types directly\n if is_extension_type(value) or is_extension_array_dtype(value):\n return value\n\n # broadcast across multiple columns if necessary\n if broadcast and key in self.columns and value.ndim == 1:\n if (not self.columns.is_unique or\n isinstance(self.columns, MultiIndex)):\n existing_piece = self[key]\n if isinstance(existing_piece, DataFrame):\n value = np.tile(value, (len(existing_piece.columns), 1))\n\n return np.atleast_2d(np.asarray(value))", "response": "Sanitize a column of the log file."} {"instruction": "How would you implement a function in Python 3 that\ngives equal - length arrays of row and column labels return an array of the values corresponding to each pair of row and column labels.", "response": "def lookup(self, row_labels, col_labels):\n \"\"\"\n Label-based \"fancy indexing\" function for DataFrame.\n\n Given equal-length arrays of row and column labels, return an\n array of the values corresponding to each (row, col) pair.\n\n Parameters\n ----------\n row_labels : sequence\n The row labels to use for lookup\n col_labels : sequence\n The column labels to use for lookup\n\n Notes\n -----\n Akin to::\n\n result = [df.get_value(row, col)\n for row, col in zip(row_labels, col_labels)]\n\n Examples\n --------\n values : ndarray\n The found values\n \"\"\"\n n = len(row_labels)\n if n != len(col_labels):\n raise ValueError('Row labels must have same size as column labels')\n\n thresh = 1000\n if not self._is_mixed_type or n > thresh:\n values = self.values\n ridx = self.index.get_indexer(row_labels)\n cidx = self.columns.get_indexer(col_labels)\n if (ridx == -1).any():\n raise 
KeyError('One or more row labels was not found')\n if (cidx == -1).any():\n raise KeyError('One or more column labels was not found')\n flat_index = ridx * len(self.columns) + cidx\n result = values.flat[flat_index]\n else:\n result = np.empty(n, dtype='O')\n for i, (r, c) in enumerate(zip(row_labels, col_labels)):\n result[i] = self._get_value(r, c)\n\n if is_object_dtype(result):\n result = lib.maybe_convert_objects(result)\n\n return result"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _reindex_multi(self, axes, copy, fill_value):\n\n new_index, row_indexer = self.index.reindex(axes['index'])\n new_columns, col_indexer = self.columns.reindex(axes['columns'])\n\n if row_indexer is not None and col_indexer is not None:\n indexer = row_indexer, col_indexer\n new_values = algorithms.take_2d_multi(self.values, indexer,\n fill_value=fill_value)\n return self._constructor(new_values, index=new_index,\n columns=new_columns)\n else:\n return self._reindex_with_indexers({0: [new_index, row_indexer],\n 1: [new_columns, col_indexer]},\n copy=copy,\n fill_value=fill_value)", "response": "Reindex the data for the specified set of axes."} {"instruction": "Write a Python 3 function for\ndropping specified labels from rows or columns.", "response": "def drop(self, labels=None, axis=0, index=None, columns=None,\n level=None, inplace=False, errors='raise'):\n \"\"\"\n Drop specified labels from rows or columns.\n\n Remove rows or columns by specifying label names and corresponding\n axis, or by specifying directly index or column names. 
When using a\n multi-index, labels on different levels can be removed by specifying\n the level.\n\n Parameters\n ----------\n labels : single label or list-like\n Index or column labels to drop.\n axis : {0 or 'index', 1 or 'columns'}, default 0\n Whether to drop labels from the index (0 or 'index') or\n columns (1 or 'columns').\n index : single label or list-like\n Alternative to specifying axis (``labels, axis=0``\n is equivalent to ``index=labels``).\n\n .. versionadded:: 0.21.0\n columns : single label or list-like\n Alternative to specifying axis (``labels, axis=1``\n is equivalent to ``columns=labels``).\n\n .. versionadded:: 0.21.0\n level : int or level name, optional\n For MultiIndex, level from which the labels will be removed.\n inplace : bool, default False\n If True, do operation inplace and return None.\n errors : {'ignore', 'raise'}, default 'raise'\n If 'ignore', suppress error and only existing labels are\n dropped.\n\n Returns\n -------\n DataFrame\n DataFrame without the removed index or column labels.\n\n Raises\n ------\n KeyError\n If any of the labels is not found in the selected axis.\n\n See Also\n --------\n DataFrame.loc : Label-location based indexer for selection by label.\n DataFrame.dropna : Return DataFrame with labels on given axis omitted\n where (all or any) data are missing.\n DataFrame.drop_duplicates : Return DataFrame with duplicate rows\n removed, optionally only considering certain columns.\n Series.drop : Return Series with specified index labels removed.\n\n Examples\n --------\n >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),\n ... 
columns=['A', 'B', 'C', 'D'])\n >>> df\n A B C D\n 0 0 1 2 3\n 1 4 5 6 7\n 2 8 9 10 11\n\n Drop columns\n\n >>> df.drop(['B', 'C'], axis=1)\n A D\n 0 0 3\n 1 4 7\n 2 8 11\n\n >>> df.drop(columns=['B', 'C'])\n A D\n 0 0 3\n 1 4 7\n 2 8 11\n\n Drop a row by index\n\n >>> df.drop([0, 1])\n A B C D\n 2 8 9 10 11\n\n Drop columns and/or rows of MultiIndex DataFrame\n\n >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],\n ... ['speed', 'weight', 'length']],\n ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],\n ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])\n >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],\n ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20],\n ... [250, 150], [1.5, 0.8], [320, 250],\n ... [1, 0.8], [0.3, 0.2]])\n >>> df\n big small\n lama speed 45.0 30.0\n weight 200.0 100.0\n length 1.5 1.0\n cow speed 30.0 20.0\n weight 250.0 150.0\n length 1.5 0.8\n falcon speed 320.0 250.0\n weight 1.0 0.8\n length 0.3 0.2\n\n >>> df.drop(index='cow', columns='small')\n big\n lama speed 45.0\n weight 200.0\n length 1.5\n falcon speed 320.0\n weight 1.0\n length 0.3\n\n >>> df.drop(index='length', level=1)\n big small\n lama speed 45.0 30.0\n weight 200.0 100.0\n cow speed 30.0 20.0\n weight 250.0 150.0\n falcon speed 320.0 250.0\n weight 1.0 0.8\n \"\"\"\n return super().drop(labels=labels, axis=axis, index=index,\n columns=columns, level=level, inplace=inplace,\n errors=errors)"} {"instruction": "Create a Python 3 function for\nreturning a new DataFrame with the renamed labels.", "response": "def rename(self, *args, **kwargs):\n \"\"\"\n Alter axes labels.\n\n Function / dict values must be unique (1-to-1). Labels not contained in\n a dict / Series will be left as-is. Extra labels listed don't throw an\n error.\n\n See the :ref:`user guide ` for more.\n\n Parameters\n ----------\n mapper : dict-like or function\n Dict-like or functions transformations to apply to\n that axis' values. 
Use either ``mapper`` and ``axis`` to\n specify the axis to target with ``mapper``, or ``index`` and\n ``columns``.\n index : dict-like or function\n Alternative to specifying axis (``mapper, axis=0``\n is equivalent to ``index=mapper``).\n columns : dict-like or function\n Alternative to specifying axis (``mapper, axis=1``\n is equivalent to ``columns=mapper``).\n axis : int or str\n Axis to target with ``mapper``. Can be either the axis name\n ('index', 'columns') or number (0, 1). The default is 'index'.\n copy : bool, default True\n Also copy underlying data.\n inplace : bool, default False\n Whether to return a new DataFrame. If True then value of copy is\n ignored.\n level : int or level name, default None\n In case of a MultiIndex, only rename labels in the specified\n level.\n errors : {'ignore', 'raise'}, default 'ignore'\n If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`,\n or `columns` contains labels that are not present in the Index\n being transformed.\n If 'ignore', existing keys will be renamed and extra keys will be\n ignored.\n\n Returns\n -------\n DataFrame\n DataFrame with the renamed axis labels.\n\n Raises\n ------\n KeyError\n If any of the labels is not found in the selected axis and\n \"errors='raise'\".\n\n See Also\n --------\n DataFrame.rename_axis : Set the name of the axis.\n\n Examples\n --------\n\n ``DataFrame.rename`` supports two calling conventions\n\n * ``(index=index_mapper, columns=columns_mapper, ...)``\n * ``(mapper, axis={'index', 'columns'}, ...)``\n\n We *highly* recommend using keyword arguments to clarify your\n intent.\n\n >>> df = pd.DataFrame({\"A\": [1, 2, 3], \"B\": [4, 5, 6]})\n >>> df.rename(index=str, columns={\"A\": \"a\", \"B\": \"c\"})\n a c\n 0 1 4\n 1 2 5\n 2 3 6\n\n >>> df.rename(index=str, columns={\"A\": \"a\", \"C\": \"c\"})\n a B\n 0 1 4\n 1 2 5\n 2 3 6\n\n >>> df.rename(index=str, columns={\"A\": \"a\", \"C\": \"c\"}, errors=\"raise\")\n Traceback (most recent call last):\n KeyError: 
['C'] not found in axis\n\n Using axis-style parameters\n\n >>> df.rename(str.lower, axis='columns')\n a b\n 0 1 4\n 1 2 5\n 2 3 6\n\n >>> df.rename({1: 2, 2: 4}, axis='index')\n A B\n 0 1 4\n 2 2 5\n 4 3 6\n \"\"\"\n axes = validate_axis_style_args(self, args, kwargs, 'mapper', 'rename')\n kwargs.update(axes)\n # Pop these, since the values are in `kwargs` under different names\n kwargs.pop('axis', None)\n kwargs.pop('mapper', None)\n return super().rename(**kwargs)"} {"instruction": "Write a Python 3 function that can\nset the index of the DataFrame.", "response": "def set_index(self, keys, drop=True, append=False, inplace=False,\n verify_integrity=False):\n \"\"\"\n Set the DataFrame index using existing columns.\n\n Set the DataFrame index (row labels) using one or more existing\n columns or arrays (of the correct length). The index can replace the\n existing index or expand on it.\n\n Parameters\n ----------\n keys : label or array-like or list of labels/arrays\n This parameter can be either a single column key, a single array of\n the same length as the calling DataFrame, or a list containing an\n arbitrary combination of column keys and arrays. Here, \"array\"\n encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and\n instances of :class:`~collections.abc.Iterator`.\n drop : bool, default True\n Delete columns to be used as the new index.\n append : bool, default False\n Whether to append columns to existing index.\n inplace : bool, default False\n Modify the DataFrame in place (do not create a new object).\n verify_integrity : bool, default False\n Check the new index for duplicates. Otherwise defer the check until\n necessary. 
Setting to False will improve the performance of this\n method.\n\n Returns\n -------\n DataFrame\n Changed row labels.\n\n See Also\n --------\n DataFrame.reset_index : Opposite of set_index.\n DataFrame.reindex : Change to new indices or expand indices.\n DataFrame.reindex_like : Change to same indices as other DataFrame.\n\n Examples\n --------\n >>> df = pd.DataFrame({'month': [1, 4, 7, 10],\n ... 'year': [2012, 2014, 2013, 2014],\n ... 'sale': [55, 40, 84, 31]})\n >>> df\n month year sale\n 0 1 2012 55\n 1 4 2014 40\n 2 7 2013 84\n 3 10 2014 31\n\n Set the index to become the 'month' column:\n\n >>> df.set_index('month')\n year sale\n month\n 1 2012 55\n 4 2014 40\n 7 2013 84\n 10 2014 31\n\n Create a MultiIndex using columns 'year' and 'month':\n\n >>> df.set_index(['year', 'month'])\n sale\n year month\n 2012 1 55\n 2014 4 40\n 2013 7 84\n 2014 10 31\n\n Create a MultiIndex using an Index and a column:\n\n >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year'])\n month sale\n year\n 1 2012 1 55\n 2 2014 4 40\n 3 2013 7 84\n 4 2014 10 31\n\n Create a MultiIndex using two Series:\n\n >>> s = pd.Series([1, 2, 3, 4])\n >>> df.set_index([s, s**2])\n month year sale\n 1 1 1 2012 55\n 2 4 4 2014 40\n 3 9 7 2013 84\n 4 16 10 2014 31\n \"\"\"\n inplace = validate_bool_kwarg(inplace, 'inplace')\n if not isinstance(keys, list):\n keys = [keys]\n\n err_msg = ('The parameter \"keys\" may be a column key, one-dimensional '\n 'array, or a list containing only valid column keys and '\n 'one-dimensional arrays.')\n\n missing = []\n for col in keys:\n if isinstance(col, (ABCIndexClass, ABCSeries, np.ndarray,\n list, abc.Iterator)):\n # arrays are fine as long as they are one-dimensional\n # iterators get converted to list below\n if getattr(col, 'ndim', 1) != 1:\n raise ValueError(err_msg)\n else:\n # everything else gets tried as a key; see GH 24969\n try:\n found = col in self.columns\n except TypeError:\n raise TypeError(err_msg + ' Received column of '\n 'type 
{}'.format(type(col)))\n else:\n if not found:\n missing.append(col)\n\n if missing:\n raise KeyError('None of {} are in the columns'.format(missing))\n\n if inplace:\n frame = self\n else:\n frame = self.copy()\n\n arrays = []\n names = []\n if append:\n names = [x for x in self.index.names]\n if isinstance(self.index, ABCMultiIndex):\n for i in range(self.index.nlevels):\n arrays.append(self.index._get_level_values(i))\n else:\n arrays.append(self.index)\n\n to_remove = []\n for col in keys:\n if isinstance(col, ABCMultiIndex):\n for n in range(col.nlevels):\n arrays.append(col._get_level_values(n))\n names.extend(col.names)\n elif isinstance(col, (ABCIndexClass, ABCSeries)):\n # if Index then not MultiIndex (treated above)\n arrays.append(col)\n names.append(col.name)\n elif isinstance(col, (list, np.ndarray)):\n arrays.append(col)\n names.append(None)\n elif isinstance(col, abc.Iterator):\n arrays.append(list(col))\n names.append(None)\n # from here, col can only be a column label\n else:\n arrays.append(frame[col]._values)\n names.append(col)\n if drop:\n to_remove.append(col)\n\n if len(arrays[-1]) != len(self):\n # check newest element against length of calling frame, since\n # ensure_index_from_sequences would not raise for append=False.\n raise ValueError('Length mismatch: Expected {len_self} rows, '\n 'received array of length {len_col}'.format(\n len_self=len(self),\n len_col=len(arrays[-1])\n ))\n\n index = ensure_index_from_sequences(arrays, names)\n\n if verify_integrity and not index.is_unique:\n duplicates = index[index.duplicated()].unique()\n raise ValueError('Index has duplicate keys: {dup}'.format(\n dup=duplicates))\n\n # use set to handle duplicate column names gracefully in case of drop\n for c in set(to_remove):\n del frame[c]\n\n # clear up memory usage\n index._cleanup()\n\n frame.index = index\n\n if not inplace:\n return frame"} {"instruction": "Implement a function in Python 3 to\nreset the index of the DataFrame or a level of it.", 
"response": "def reset_index(self, level=None, drop=False, inplace=False, col_level=0,\n col_fill=''):\n \"\"\"\n Reset the index, or a level of it.\n\n Reset the index of the DataFrame, and use the default one instead.\n If the DataFrame has a MultiIndex, this method can remove one or more\n levels.\n\n Parameters\n ----------\n level : int, str, tuple, or list, default None\n Only remove the given levels from the index. Removes all levels by\n default.\n drop : bool, default False\n Do not try to insert index into dataframe columns. This resets\n the index to the default integer index.\n inplace : bool, default False\n Modify the DataFrame in place (do not create a new object).\n col_level : int or str, default 0\n If the columns have multiple levels, determines which level the\n labels are inserted into. By default it is inserted into the first\n level.\n col_fill : object, default ''\n If the columns have multiple levels, determines how the other\n levels are named. If None then the index name is repeated.\n\n Returns\n -------\n DataFrame\n DataFrame with the new index.\n\n See Also\n --------\n DataFrame.set_index : Opposite of reset_index.\n DataFrame.reindex : Change to new indices or expand indices.\n DataFrame.reindex_like : Change to same indices as other DataFrame.\n\n Examples\n --------\n >>> df = pd.DataFrame([('bird', 389.0),\n ... ('bird', 24.0),\n ... ('mammal', 80.5),\n ... ('mammal', np.nan)],\n ... index=['falcon', 'parrot', 'lion', 'monkey'],\n ... 
columns=('class', 'max_speed'))\n >>> df\n class max_speed\n falcon bird 389.0\n parrot bird 24.0\n lion mammal 80.5\n monkey mammal NaN\n\n When we reset the index, the old index is added as a column, and a\n new sequential index is used:\n\n >>> df.reset_index()\n index class max_speed\n 0 falcon bird 389.0\n 1 parrot bird 24.0\n 2 lion mammal 80.5\n 3 monkey mammal NaN\n\n We can use the `drop` parameter to avoid the old index being added as\n a column:\n\n >>> df.reset_index(drop=True)\n class max_speed\n 0 bird 389.0\n 1 bird 24.0\n 2 mammal 80.5\n 3 mammal NaN\n\n You can also use `reset_index` with `MultiIndex`.\n\n >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),\n ... ('bird', 'parrot'),\n ... ('mammal', 'lion'),\n ... ('mammal', 'monkey')],\n ... names=['class', 'name'])\n >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'),\n ... ('species', 'type')])\n >>> df = pd.DataFrame([(389.0, 'fly'),\n ... ( 24.0, 'fly'),\n ... ( 80.5, 'run'),\n ... (np.nan, 'jump')],\n ... index=index,\n ... columns=columns)\n >>> df\n speed species\n max type\n class name\n bird falcon 389.0 fly\n parrot 24.0 fly\n mammal lion 80.5 run\n monkey NaN jump\n\n If the index has multiple levels, we can reset a subset of them:\n\n >>> df.reset_index(level='class')\n class speed species\n max type\n name\n falcon bird 389.0 fly\n parrot bird 24.0 fly\n lion mammal 80.5 run\n monkey mammal NaN jump\n\n If we are not dropping the index, by default, it is placed in the top\n level. 
We can place it in another level:\n\n >>> df.reset_index(level='class', col_level=1)\n speed species\n class max type\n name\n falcon bird 389.0 fly\n parrot bird 24.0 fly\n lion mammal 80.5 run\n monkey mammal NaN jump\n\n When the index is inserted under another level, we can specify under\n which one with the parameter `col_fill`:\n\n >>> df.reset_index(level='class', col_level=1, col_fill='species')\n species speed species\n class max type\n name\n falcon bird 389.0 fly\n parrot bird 24.0 fly\n lion mammal 80.5 run\n monkey mammal NaN jump\n\n If we specify a nonexistent level for `col_fill`, it is created:\n\n >>> df.reset_index(level='class', col_level=1, col_fill='genus')\n genus speed species\n class max type\n name\n falcon bird 389.0 fly\n parrot bird 24.0 fly\n lion mammal 80.5 run\n monkey mammal NaN jump\n \"\"\"\n inplace = validate_bool_kwarg(inplace, 'inplace')\n if inplace:\n new_obj = self\n else:\n new_obj = self.copy()\n\n def _maybe_casted_values(index, labels=None):\n values = index._values\n if not isinstance(index, (PeriodIndex, DatetimeIndex)):\n if values.dtype == np.object_:\n values = lib.maybe_convert_objects(values)\n\n # if we have the labels, extract the values with a mask\n if labels is not None:\n mask = labels == -1\n\n # we can have situations where the whole mask is -1,\n # meaning there is nothing found in labels, so make all nan's\n if mask.all():\n values = np.empty(len(mask))\n values.fill(np.nan)\n else:\n values = values.take(labels)\n\n # TODO(https://github.com/pandas-dev/pandas/issues/24206)\n # Push this into maybe_upcast_putmask?\n # We can't pass EAs there right now. 
Looks a bit\n # complicated.\n # So we unbox the ndarray_values, op, re-box.\n values_type = type(values)\n values_dtype = values.dtype\n\n if issubclass(values_type, DatetimeLikeArray):\n values = values._data\n\n if mask.any():\n values, changed = maybe_upcast_putmask(\n values, mask, np.nan)\n\n if issubclass(values_type, DatetimeLikeArray):\n values = values_type(values, dtype=values_dtype)\n\n return values\n\n new_index = ibase.default_index(len(new_obj))\n if level is not None:\n if not isinstance(level, (tuple, list)):\n level = [level]\n level = [self.index._get_level_number(lev) for lev in level]\n if len(level) < self.index.nlevels:\n new_index = self.index.droplevel(level)\n\n if not drop:\n if isinstance(self.index, MultiIndex):\n names = [n if n is not None else ('level_%d' % i)\n for (i, n) in enumerate(self.index.names)]\n to_insert = lzip(self.index.levels, self.index.codes)\n else:\n default = 'index' if 'index' not in self else 'level_0'\n names = ([default] if self.index.name is None\n else [self.index.name])\n to_insert = ((self.index, None),)\n\n multi_col = isinstance(self.columns, MultiIndex)\n for i, (lev, lab) in reversed(list(enumerate(to_insert))):\n if not (level is None or i in level):\n continue\n name = names[i]\n if multi_col:\n col_name = (list(name) if isinstance(name, tuple)\n else [name])\n if col_fill is None:\n if len(col_name) not in (1, self.columns.nlevels):\n raise ValueError(\"col_fill=None is incompatible \"\n \"with incomplete column name \"\n \"{}\".format(name))\n col_fill = col_name[0]\n\n lev_num = self.columns._get_level_number(col_level)\n name_lst = [col_fill] * lev_num + col_name\n missing = self.columns.nlevels - len(name_lst)\n name_lst += [col_fill] * missing\n name = tuple(name_lst)\n # to ndarray and maybe infer different dtype\n level_values = _maybe_casted_values(lev, lab)\n new_obj.insert(0, name, level_values)\n\n new_obj.index = new_index\n if not inplace:\n return new_obj"} {"instruction": "Given the 
following Python 3 function, write the documentation\ndef dropna(self, axis=0, how='any', thresh=None, subset=None,\n inplace=False):\n \"\"\"\n Remove missing values.\n\n See the :ref:`User Guide ` for more on which values are\n considered missing, and how to work with missing data.\n\n Parameters\n ----------\n axis : {0 or 'index', 1 or 'columns'}, default 0\n Determine if rows or columns which contain missing values are\n removed.\n\n * 0, or 'index' : Drop rows which contain missing values.\n * 1, or 'columns' : Drop columns which contain missing value.\n\n .. deprecated:: 0.23.0\n\n Pass tuple or list to drop on multiple axes.\n Only a single axis is allowed.\n\n how : {'any', 'all'}, default 'any'\n Determine if row or column is removed from DataFrame, when we have\n at least one NA or all NA.\n\n * 'any' : If any NA values are present, drop that row or column.\n * 'all' : If all values are NA, drop that row or column.\n\n thresh : int, optional\n Require that many non-NA values.\n subset : array-like, optional\n Labels along other axis to consider, e.g. if you are dropping rows\n these would be a list of columns to include.\n inplace : bool, default False\n If True, do operation inplace and return None.\n\n Returns\n -------\n DataFrame\n DataFrame with NA entries dropped from it.\n\n See Also\n --------\n DataFrame.isna: Indicate missing values.\n DataFrame.notna : Indicate existing (non-missing) values.\n DataFrame.fillna : Replace missing values.\n Series.dropna : Drop missing values.\n Index.dropna : Drop missing indices.\n\n Examples\n --------\n >>> df = pd.DataFrame({\"name\": ['Alfred', 'Batman', 'Catwoman'],\n ... \"toy\": [np.nan, 'Batmobile', 'Bullwhip'],\n ... \"born\": [pd.NaT, pd.Timestamp(\"1940-04-25\"),\n ... 
pd.NaT]})\n >>> df\n name toy born\n 0 Alfred NaN NaT\n 1 Batman Batmobile 1940-04-25\n 2 Catwoman Bullwhip NaT\n\n Drop the rows where at least one element is missing.\n\n >>> df.dropna()\n name toy born\n 1 Batman Batmobile 1940-04-25\n\n Drop the columns where at least one element is missing.\n\n >>> df.dropna(axis='columns')\n name\n 0 Alfred\n 1 Batman\n 2 Catwoman\n\n Drop the rows where all elements are missing.\n\n >>> df.dropna(how='all')\n name toy born\n 0 Alfred NaN NaT\n 1 Batman Batmobile 1940-04-25\n 2 Catwoman Bullwhip NaT\n\n Keep only the rows with at least 2 non-NA values.\n\n >>> df.dropna(thresh=2)\n name toy born\n 1 Batman Batmobile 1940-04-25\n 2 Catwoman Bullwhip NaT\n\n Define in which columns to look for missing values.\n\n >>> df.dropna(subset=['name', 'born'])\n name toy born\n 1 Batman Batmobile 1940-04-25\n\n Keep the DataFrame with valid entries in the same variable.\n\n >>> df.dropna(inplace=True)\n >>> df\n name toy born\n 1 Batman Batmobile 1940-04-25\n \"\"\"\n inplace = validate_bool_kwarg(inplace, 'inplace')\n if isinstance(axis, (tuple, list)):\n # GH20987\n msg = (\"supplying multiple axes to axis is deprecated and \"\n \"will be removed in a future version.\")\n warnings.warn(msg, FutureWarning, stacklevel=2)\n\n result = self\n for ax in axis:\n result = result.dropna(how=how, thresh=thresh, subset=subset,\n axis=ax)\n else:\n axis = self._get_axis_number(axis)\n agg_axis = 1 - axis\n\n agg_obj = self\n if subset is not None:\n ax = self._get_axis(agg_axis)\n indices = ax.get_indexer_for(subset)\n check = indices == -1\n if check.any():\n raise KeyError(list(np.compress(check, subset)))\n agg_obj = self.take(indices, axis=agg_axis)\n\n count = agg_obj.count(axis=agg_axis)\n\n if thresh is not None:\n mask = count >= thresh\n elif how == 'any':\n mask = count == len(agg_obj._get_axis(agg_axis))\n elif how == 'all':\n mask = count > 0\n else:\n if how is not None:\n raise ValueError('invalid how option: {h}'.format(h=how))\n 
else:\n raise TypeError('must specify how or thresh')\n\n result = self.loc(axis=axis)[mask]\n\n if inplace:\n self._update_inplace(result)\n else:\n return result", "response": "Drop missing values from a DataFrame."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef drop_duplicates(self, subset=None, keep='first', inplace=False):\n if self.empty:\n return self.copy()\n\n inplace = validate_bool_kwarg(inplace, 'inplace')\n duplicated = self.duplicated(subset, keep=keep)\n\n if inplace:\n inds, = (-duplicated)._ndarray_values.nonzero()\n new_data = self._data.take(inds)\n self._update_inplace(new_data)\n else:\n return self[-duplicated]", "response": "Return a new DataFrame with duplicate rows removed optionally only\n considering certain columns."} {"instruction": "Given the following Python 3 function, write the documentation\ndef duplicated(self, subset=None, keep='first'):\n from pandas.core.sorting import get_group_index\n from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT\n\n if self.empty:\n return Series(dtype=bool)\n\n def f(vals):\n labels, shape = algorithms.factorize(\n vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))\n return labels.astype('i8', copy=False), len(shape)\n\n if subset is None:\n subset = self.columns\n elif (not np.iterable(subset) or\n isinstance(subset, str) or\n isinstance(subset, tuple) and subset in self.columns):\n subset = subset,\n\n # Verify all columns in subset exist in the queried dataframe\n # Otherwise, raise a KeyError, same as if you try to __getitem__ with a\n # key that doesn't exist.\n diff = Index(subset).difference(self.columns)\n if not diff.empty:\n raise KeyError(diff)\n\n vals = (col.values for name, col in self.iteritems()\n if name in subset)\n labels, shape = map(list, zip(*map(f, vals)))\n\n ids = get_group_index(labels, shape, sort=False, xnull=False)\n return Series(duplicated_int64(ids, keep), index=self.index)", "response": "Return boolean 
Series denoting duplicate rows optionally only considering certain columns."} {"instruction": "Write a Python 3 function that can\nreturn the first n rows with the largest values in columns in descending order.", "response": "def nlargest(self, n, columns, keep='first'):\n \"\"\"\n Return the first `n` rows ordered by `columns` in descending order.\n\n Return the first `n` rows with the largest values in `columns`, in\n descending order. The columns that are not specified are returned as\n well, but not used for ordering.\n\n This method is equivalent to\n ``df.sort_values(columns, ascending=False).head(n)``, but more\n performant.\n\n Parameters\n ----------\n n : int\n Number of rows to return.\n columns : label or list of labels\n Column label(s) to order by.\n keep : {'first', 'last', 'all'}, default 'first'\n Where there are duplicate values:\n\n - `first` : prioritize the first occurrence(s)\n - `last` : prioritize the last occurrence(s)\n - ``all`` : do not drop any duplicates, even it means\n selecting more than `n` items.\n\n .. versionadded:: 0.24.0\n\n Returns\n -------\n DataFrame\n The first `n` rows ordered by the given columns in descending\n order.\n\n See Also\n --------\n DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in\n ascending order.\n DataFrame.sort_values : Sort DataFrame by the values.\n DataFrame.head : Return the first `n` rows without re-ordering.\n\n Notes\n -----\n This function cannot be used with all column types. For example, when\n specifying columns with `object` or `category` dtypes, ``TypeError`` is\n raised.\n\n Examples\n --------\n >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,\n ... 434000, 434000, 337000, 11300,\n ... 11300, 11300],\n ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,\n ... 17036, 182, 38, 311],\n ... 'alpha-2': [\"IT\", \"FR\", \"MT\", \"MV\", \"BN\",\n ... \"IS\", \"NR\", \"TV\", \"AI\"]},\n ... index=[\"Italy\", \"France\", \"Malta\",\n ... 
\"Maldives\", \"Brunei\", \"Iceland\",\n ... \"Nauru\", \"Tuvalu\", \"Anguilla\"])\n >>> df\n population GDP alpha-2\n Italy 59000000 1937894 IT\n France 65000000 2583560 FR\n Malta 434000 12011 MT\n Maldives 434000 4520 MV\n Brunei 434000 12128 BN\n Iceland 337000 17036 IS\n Nauru 11300 182 NR\n Tuvalu 11300 38 TV\n Anguilla 11300 311 AI\n\n In the following example, we will use ``nlargest`` to select the three\n rows having the largest values in column \"population\".\n\n >>> df.nlargest(3, 'population')\n population GDP alpha-2\n France 65000000 2583560 FR\n Italy 59000000 1937894 IT\n Malta 434000 12011 MT\n\n When using ``keep='last'``, ties are resolved in reverse order:\n\n >>> df.nlargest(3, 'population', keep='last')\n population GDP alpha-2\n France 65000000 2583560 FR\n Italy 59000000 1937894 IT\n Brunei 434000 12128 BN\n\n When using ``keep='all'``, all duplicate items are maintained:\n\n >>> df.nlargest(3, 'population', keep='all')\n population GDP alpha-2\n France 65000000 2583560 FR\n Italy 59000000 1937894 IT\n Malta 434000 12011 MT\n Maldives 434000 4520 MV\n Brunei 434000 12128 BN\n\n To order by the largest values in column \"population\" and then \"GDP\",\n we can specify multiple columns like in the next example.\n\n >>> df.nlargest(3, ['population', 'GDP'])\n population GDP alpha-2\n France 65000000 2583560 FR\n Italy 59000000 1937894 IT\n Brunei 434000 12128 BN\n \"\"\"\n return algorithms.SelectNFrame(self,\n n=n,\n keep=keep,\n columns=columns).nlargest()"} {"instruction": "Implement a Python 3 function for\nreturning the first n rows ordered by columns in ascending order.", "response": "def nsmallest(self, n, columns, keep='first'):\n \"\"\"\n Return the first `n` rows ordered by `columns` in ascending order.\n\n Return the first `n` rows with the smallest values in `columns`, in\n ascending order. 
The columns that are not specified are returned as\n well, but not used for ordering.\n\n This method is equivalent to\n ``df.sort_values(columns, ascending=True).head(n)``, but more\n performant.\n\n Parameters\n ----------\n n : int\n Number of items to retrieve.\n columns : list or str\n Column name or names to order by.\n keep : {'first', 'last', 'all'}, default 'first'\n Where there are duplicate values:\n\n - ``first`` : take the first occurrence.\n - ``last`` : take the last occurrence.\n - ``all`` : do not drop any duplicates, even it means\n selecting more than `n` items.\n\n .. versionadded:: 0.24.0\n\n Returns\n -------\n DataFrame\n\n See Also\n --------\n DataFrame.nlargest : Return the first `n` rows ordered by `columns` in\n descending order.\n DataFrame.sort_values : Sort DataFrame by the values.\n DataFrame.head : Return the first `n` rows without re-ordering.\n\n Examples\n --------\n >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,\n ... 434000, 434000, 337000, 11300,\n ... 11300, 11300],\n ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,\n ... 17036, 182, 38, 311],\n ... 'alpha-2': [\"IT\", \"FR\", \"MT\", \"MV\", \"BN\",\n ... \"IS\", \"NR\", \"TV\", \"AI\"]},\n ... index=[\"Italy\", \"France\", \"Malta\",\n ... \"Maldives\", \"Brunei\", \"Iceland\",\n ... 
\"Nauru\", \"Tuvalu\", \"Anguilla\"])\n >>> df\n population GDP alpha-2\n Italy 59000000 1937894 IT\n France 65000000 2583560 FR\n Malta 434000 12011 MT\n Maldives 434000 4520 MV\n Brunei 434000 12128 BN\n Iceland 337000 17036 IS\n Nauru 11300 182 NR\n Tuvalu 11300 38 TV\n Anguilla 11300 311 AI\n\n In the following example, we will use ``nsmallest`` to select the\n three rows having the smallest values in column \"a\".\n\n >>> df.nsmallest(3, 'population')\n population GDP alpha-2\n Nauru 11300 182 NR\n Tuvalu 11300 38 TV\n Anguilla 11300 311 AI\n\n When using ``keep='last'``, ties are resolved in reverse order:\n\n >>> df.nsmallest(3, 'population', keep='last')\n population GDP alpha-2\n Anguilla 11300 311 AI\n Tuvalu 11300 38 TV\n Nauru 11300 182 NR\n\n When using ``keep='all'``, all duplicate items are maintained:\n\n >>> df.nsmallest(3, 'population', keep='all')\n population GDP alpha-2\n Nauru 11300 182 NR\n Tuvalu 11300 38 TV\n Anguilla 11300 311 AI\n\n To order by the largest values in column \"a\" and then \"c\", we can\n specify multiple columns like in the next example.\n\n >>> df.nsmallest(3, ['population', 'GDP'])\n population GDP alpha-2\n Tuvalu 11300 38 TV\n Nauru 11300 182 NR\n Anguilla 11300 311 AI\n \"\"\"\n return algorithms.SelectNFrame(self,\n n=n,\n keep=keep,\n columns=columns).nsmallest()"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef swaplevel(self, i=-2, j=-1, axis=0):\n result = self.copy()\n\n axis = self._get_axis_number(axis)\n if axis == 0:\n result.index = result.index.swaplevel(i, j)\n else:\n result.columns = result.columns.swaplevel(i, j)\n return result", "response": "Returns a new MultiIndex with the swap levels i and j in a particular axis."} {"instruction": "Explain what the following Python 3 code does\ndef reorder_levels(self, order, axis=0):\n axis = self._get_axis_number(axis)\n if not isinstance(self._get_axis(axis),\n MultiIndex): # pragma: no cover\n raise TypeError('Can 
only reorder levels on a hierarchical axis.')\n\n result = self.copy()\n\n if axis == 0:\n result.index = result.index.reorder_levels(order)\n else:\n result.columns = result.columns.reorder_levels(order)\n return result", "response": "Rearrange index levels using input order."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef combine(self, other, func, fill_value=None, overwrite=True):\n other_idxlen = len(other.index) # save for compare\n\n this, other = self.align(other, copy=False)\n new_index = this.index\n\n if other.empty and len(new_index) == len(self.index):\n return self.copy()\n\n if self.empty and len(other) == other_idxlen:\n return other.copy()\n\n # sorts if possible\n new_columns = this.columns.union(other.columns)\n do_fill = fill_value is not None\n result = {}\n for col in new_columns:\n series = this[col]\n otherSeries = other[col]\n\n this_dtype = series.dtype\n other_dtype = otherSeries.dtype\n\n this_mask = isna(series)\n other_mask = isna(otherSeries)\n\n # don't overwrite columns unecessarily\n # DO propagate if this column is not in the intersection\n if not overwrite and other_mask.all():\n result[col] = this[col].copy()\n continue\n\n if do_fill:\n series = series.copy()\n otherSeries = otherSeries.copy()\n series[this_mask] = fill_value\n otherSeries[other_mask] = fill_value\n\n if col not in self.columns:\n # If self DataFrame does not have col in other DataFrame,\n # try to promote series, which is all NaN, as other_dtype.\n new_dtype = other_dtype\n try:\n series = series.astype(new_dtype, copy=False)\n except ValueError:\n # e.g. 
new_dtype is integer types\n pass\n else:\n # if we have different dtypes, possibly promote\n new_dtype = find_common_type([this_dtype, other_dtype])\n if not is_dtype_equal(this_dtype, new_dtype):\n series = series.astype(new_dtype)\n if not is_dtype_equal(other_dtype, new_dtype):\n otherSeries = otherSeries.astype(new_dtype)\n\n arr = func(series, otherSeries)\n arr = maybe_downcast_to_dtype(arr, this_dtype)\n\n result[col] = arr\n\n # convert_objects just in case\n return self._constructor(result, index=new_index,\n columns=new_columns)", "response": "Perform column - wise combine with another DataFrame."} {"instruction": "Write a Python 3 function for\ncombining two DataFrames in one DataFrame with non - null values in the same location in other.", "response": "def combine_first(self, other):\n \"\"\"\n Update null elements with value in the same location in `other`.\n\n Combine two DataFrame objects by filling null values in one DataFrame\n with non-null values from other DataFrame. The row and column indexes\n of the resulting DataFrame will be the union of the two.\n\n Parameters\n ----------\n other : DataFrame\n Provided DataFrame to use to fill null values.\n\n Returns\n -------\n DataFrame\n\n See Also\n --------\n DataFrame.combine : Perform series-wise operation on two DataFrames\n using a given function.\n\n Examples\n --------\n\n >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})\n >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})\n >>> df1.combine_first(df2)\n A B\n 0 1.0 3.0\n 1 0.0 4.0\n\n Null values still persist if the location of that null value\n does not exist in `other`\n\n >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]})\n >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2])\n >>> df1.combine_first(df2)\n A B C\n 0 NaN 4.0 NaN\n 1 0.0 3.0 1.0\n 2 NaN 3.0 1.0\n \"\"\"\n import pandas.core.computation.expressions as expressions\n\n def extract_values(arr):\n # Does two things:\n # 1. 
maybe gets the values from the Series / Index\n # 2. convert datelike to i8\n if isinstance(arr, (ABCIndexClass, ABCSeries)):\n arr = arr._values\n\n if needs_i8_conversion(arr):\n if is_extension_array_dtype(arr.dtype):\n arr = arr.asi8\n else:\n arr = arr.view('i8')\n return arr\n\n def combiner(x, y):\n mask = isna(x)\n if isinstance(mask, (ABCIndexClass, ABCSeries)):\n mask = mask._values\n\n x_values = extract_values(x)\n y_values = extract_values(y)\n\n # If the column y in other DataFrame is not in first DataFrame,\n # just return y_values.\n if y.name not in self.columns:\n return y_values\n\n return expressions.where(mask, y_values, x_values)\n\n return self.combine(other, combiner, overwrite=False)"} {"instruction": "Write a Python 3 function that can\nmodify in place using non - NA values from another DataFrame.", "response": "def update(self, other, join='left', overwrite=True, filter_func=None,\n errors='ignore'):\n \"\"\"\n Modify in place using non-NA values from another DataFrame.\n\n Aligns on indices. There is no return value.\n\n Parameters\n ----------\n other : DataFrame, or object coercible into a DataFrame\n Should have at least one matching index/column label\n with the original DataFrame. If a Series is passed,\n its name attribute must be set, and that will be\n used as the column name to align with the original DataFrame.\n join : {'left'}, default 'left'\n Only left join is implemented, keeping the index and columns of the\n original object.\n overwrite : bool, default True\n How to handle non-NA values for overlapping keys:\n\n * True: overwrite original DataFrame's values\n with values from `other`.\n * False: only update values that are NA in\n the original DataFrame.\n\n filter_func : callable(1d-array) -> bool 1d-array, optional\n Can choose to replace values other than NA. 
Return True for values\n that should be updated.\n errors : {'raise', 'ignore'}, default 'ignore'\n If 'raise', will raise a ValueError if the DataFrame and `other`\n both contain non-NA data in the same place.\n\n .. versionchanged :: 0.24.0\n Changed from `raise_conflict=False|True`\n to `errors='ignore'|'raise'`.\n\n Returns\n -------\n None : method directly changes calling object\n\n Raises\n ------\n ValueError\n * When `errors='raise'` and there's overlapping non-NA data.\n * When `errors` is not either `'ignore'` or `'raise'`\n NotImplementedError\n * If `join != 'left'`\n\n See Also\n --------\n dict.update : Similar method for dictionaries.\n DataFrame.merge : For column(s)-on-columns(s) operations.\n\n Examples\n --------\n >>> df = pd.DataFrame({'A': [1, 2, 3],\n ... 'B': [400, 500, 600]})\n >>> new_df = pd.DataFrame({'B': [4, 5, 6],\n ... 'C': [7, 8, 9]})\n >>> df.update(new_df)\n >>> df\n A B\n 0 1 4\n 1 2 5\n 2 3 6\n\n The DataFrame's length does not increase as a result of the update,\n only values at matching index/column labels are updated.\n\n >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],\n ... 'B': ['x', 'y', 'z']})\n >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})\n >>> df.update(new_df)\n >>> df\n A B\n 0 a d\n 1 b e\n 2 c f\n\n For Series, it's name attribute must be set.\n\n >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],\n ... 'B': ['x', 'y', 'z']})\n >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2])\n >>> df.update(new_column)\n >>> df\n A B\n 0 a d\n 1 b y\n 2 c e\n >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],\n ... 'B': ['x', 'y', 'z']})\n >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2])\n >>> df.update(new_df)\n >>> df\n A B\n 0 a x\n 1 b d\n 2 c e\n\n If `other` contains NaNs the corresponding values are not updated\n in the original dataframe.\n\n >>> df = pd.DataFrame({'A': [1, 2, 3],\n ... 
'B': [400, 500, 600]})\n >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})\n >>> df.update(new_df)\n >>> df\n A B\n 0 1 4.0\n 1 2 500.0\n 2 3 6.0\n \"\"\"\n import pandas.core.computation.expressions as expressions\n # TODO: Support other joins\n if join != 'left': # pragma: no cover\n raise NotImplementedError(\"Only left join is supported\")\n if errors not in ['ignore', 'raise']:\n raise ValueError(\"The parameter errors must be either \"\n \"'ignore' or 'raise'\")\n\n if not isinstance(other, DataFrame):\n other = DataFrame(other)\n\n other = other.reindex_like(self)\n\n for col in self.columns:\n this = self[col]._values\n that = other[col]._values\n if filter_func is not None:\n with np.errstate(all='ignore'):\n mask = ~filter_func(this) | isna(that)\n else:\n if errors == 'raise':\n mask_this = notna(that)\n mask_that = notna(this)\n if any(mask_this & mask_that):\n raise ValueError(\"Data overlaps.\")\n\n if overwrite:\n mask = isna(that)\n else:\n mask = notna(this)\n\n # don't overwrite columns unecessarily\n if mask.all():\n continue\n\n self[col] = expressions.where(mask, this, that)"} {"instruction": "Write a Python 3 function for\nstacking the prescribed level of the current DataFrame onto the index axis.", "response": "def stack(self, level=-1, dropna=True):\n \"\"\"\n Stack the prescribed level(s) from columns to index.\n\n Return a reshaped DataFrame or Series having a multi-level\n index with one or more new inner-most levels compared to the current\n DataFrame. 
The new inner-most levels are created by pivoting the\n columns of the current dataframe:\n\n - if the columns have a single level, the output is a Series;\n - if the columns have multiple levels, the new index\n level(s) is (are) taken from the prescribed level(s) and\n the output is a DataFrame.\n\n The new index levels are sorted.\n\n Parameters\n ----------\n level : int, str, list, default -1\n Level(s) to stack from the column axis onto the index\n axis, defined as one index or label, or a list of indices\n or labels.\n dropna : bool, default True\n Whether to drop rows in the resulting Frame/Series with\n missing values. Stacking a column level onto the index\n axis can create combinations of index and column values\n that are missing from the original dataframe. See Examples\n section.\n\n Returns\n -------\n DataFrame or Series\n Stacked dataframe or series.\n\n See Also\n --------\n DataFrame.unstack : Unstack prescribed level(s) from index axis\n onto column axis.\n DataFrame.pivot : Reshape dataframe from long format to wide\n format.\n DataFrame.pivot_table : Create a spreadsheet-style pivot table\n as a DataFrame.\n\n Notes\n -----\n The function is named by analogy with a collection of books\n being reorganized from being side by side on a horizontal\n position (the columns of the dataframe) to being stacked\n vertically on top of each other (in the index of the\n dataframe).\n\n Examples\n --------\n **Single level columns**\n\n >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]],\n ... index=['cat', 'dog'],\n ... columns=['weight', 'height'])\n\n Stacking a dataframe with a single level column axis returns a Series:\n\n >>> df_single_level_cols\n weight height\n cat 0 1\n dog 2 3\n >>> df_single_level_cols.stack()\n cat weight 0\n height 1\n dog weight 2\n height 3\n dtype: int64\n\n **Multi level columns: simple case**\n\n >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'),\n ... 
('weight', 'pounds')])\n >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]],\n ... index=['cat', 'dog'],\n ... columns=multicol1)\n\n Stacking a dataframe with a multi-level column axis:\n\n >>> df_multi_level_cols1\n weight\n kg pounds\n cat 1 2\n dog 2 4\n >>> df_multi_level_cols1.stack()\n weight\n cat kg 1\n pounds 2\n dog kg 2\n pounds 4\n\n **Missing values**\n\n >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),\n ... ('height', 'm')])\n >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],\n ... index=['cat', 'dog'],\n ... columns=multicol2)\n\n It is common to have missing values when stacking a dataframe\n with multi-level columns, as the stacked dataframe typically\n has more values than the original dataframe. Missing values\n are filled with NaNs:\n\n >>> df_multi_level_cols2\n weight height\n kg m\n cat 1.0 2.0\n dog 3.0 4.0\n >>> df_multi_level_cols2.stack()\n height weight\n cat kg NaN 1.0\n m 2.0 NaN\n dog kg NaN 3.0\n m 4.0 NaN\n\n **Prescribing the level(s) to be stacked**\n\n The first parameter controls which level or levels are stacked:\n\n >>> df_multi_level_cols2.stack(0)\n kg m\n cat height NaN 2.0\n weight 1.0 NaN\n dog height NaN 4.0\n weight 3.0 NaN\n >>> df_multi_level_cols2.stack([0, 1])\n cat height m 2.0\n weight kg 1.0\n dog height m 4.0\n weight kg 3.0\n dtype: float64\n\n **Dropping missing values**\n\n >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]],\n ... index=['cat', 'dog'],\n ... 
columns=multicol2)\n\n Note that rows where all values are missing are dropped by\n default but this behaviour can be controlled via the dropna\n keyword parameter:\n\n >>> df_multi_level_cols3\n weight height\n kg m\n cat NaN 1.0\n dog 2.0 3.0\n >>> df_multi_level_cols3.stack(dropna=False)\n height weight\n cat kg NaN NaN\n m 1.0 NaN\n dog kg NaN 2.0\n m 3.0 NaN\n >>> df_multi_level_cols3.stack(dropna=True)\n height weight\n cat m 1.0 NaN\n dog kg NaN 2.0\n m 3.0 NaN\n \"\"\"\n from pandas.core.reshape.reshape import stack, stack_multiple\n\n if isinstance(level, (tuple, list)):\n return stack_multiple(self, level, dropna=dropna)\n else:\n return stack(self, level, dropna=dropna)"} {"instruction": "How would you explain what the following Python 3 function does\ndef unstack(self, level=-1, fill_value=None):\n from pandas.core.reshape.reshape import unstack\n return unstack(self, level, fill_value)", "response": "Unstacks the inner - most index labels into a new level of column labels."} {"instruction": "Implement a Python 3 function for\nreturning DataFrame with the first discrete difference of element compared with another DataFrame element.", "response": "def diff(self, periods=1, axis=0):\n \"\"\"\n First discrete difference of element.\n\n Calculates the difference of a DataFrame element compared with another\n element in the DataFrame (default is the element in the same column\n of the previous row).\n\n Parameters\n ----------\n periods : int, default 1\n Periods to shift for calculating difference, accepts negative\n values.\n axis : {0 or 'index', 1 or 'columns'}, default 0\n Take difference over rows (0) or columns (1).\n\n .. 
versionadded:: 0.16.1.\n\n Returns\n -------\n DataFrame\n\n See Also\n --------\n Series.diff: First discrete difference for a Series.\n DataFrame.pct_change: Percent change over given number of periods.\n DataFrame.shift: Shift index by desired number of periods with an\n optional time freq.\n\n Examples\n --------\n Difference with previous row\n\n >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],\n ... 'b': [1, 1, 2, 3, 5, 8],\n ... 'c': [1, 4, 9, 16, 25, 36]})\n >>> df\n a b c\n 0 1 1 1\n 1 2 1 4\n 2 3 2 9\n 3 4 3 16\n 4 5 5 25\n 5 6 8 36\n\n >>> df.diff()\n a b c\n 0 NaN NaN NaN\n 1 1.0 0.0 3.0\n 2 1.0 1.0 5.0\n 3 1.0 1.0 7.0\n 4 1.0 2.0 9.0\n 5 1.0 3.0 11.0\n\n Difference with previous column\n\n >>> df.diff(axis=1)\n a b c\n 0 NaN 0.0 0.0\n 1 NaN -1.0 3.0\n 2 NaN -1.0 7.0\n 3 NaN -1.0 13.0\n 4 NaN 0.0 20.0\n 5 NaN 2.0 28.0\n\n Difference with 3rd previous row\n\n >>> df.diff(periods=3)\n a b c\n 0 NaN NaN NaN\n 1 NaN NaN NaN\n 2 NaN NaN NaN\n 3 3.0 2.0 15.0\n 4 3.0 4.0 21.0\n 5 3.0 6.0 27.0\n\n Difference with following row\n\n >>> df.diff(periods=-1)\n a b c\n 0 -1.0 0.0 -3.0\n 1 -1.0 -1.0 -5.0\n 2 -1.0 -1.0 -7.0\n 3 -1.0 -2.0 -9.0\n 4 -1.0 -3.0 -11.0\n 5 NaN NaN NaN\n \"\"\"\n bm_axis = self._get_block_manager_axis(axis)\n new_data = self._data.diff(n=periods, axis=bm_axis)\n return self._constructor(new_data)"} {"instruction": "How would you explain what the following Python 3 function does\ndef apply(self, func, axis=0, broadcast=None, raw=False, reduce=None,\n result_type=None, args=(), **kwds):\n \"\"\"\n Apply a function along an axis of the DataFrame.\n\n Objects passed to the function are Series objects whose index is\n either the DataFrame's index (``axis=0``) or the DataFrame's columns\n (``axis=1``). By default (``result_type=None``), the final return type\n is inferred from the return type of the applied function. 
Otherwise,\n it depends on the `result_type` argument.\n\n Parameters\n ----------\n func : function\n Function to apply to each column or row.\n axis : {0 or 'index', 1 or 'columns'}, default 0\n Axis along which the function is applied:\n\n * 0 or 'index': apply function to each column.\n * 1 or 'columns': apply function to each row.\n broadcast : bool, optional\n Only relevant for aggregation functions:\n\n * ``False`` or ``None`` : returns a Series whose length is the\n length of the index or the number of columns (based on the\n `axis` parameter)\n * ``True`` : results will be broadcast to the original shape\n of the frame, the original index and columns will be retained.\n\n .. deprecated:: 0.23.0\n This argument will be removed in a future version, replaced\n by result_type='broadcast'.\n\n raw : bool, default False\n * ``False`` : passes each row or column as a Series to the\n function.\n * ``True`` : the passed function will receive ndarray objects\n instead.\n If you are just applying a NumPy reduction function this will\n achieve much better performance.\n reduce : bool or None, default None\n Try to apply reduction procedures. If the DataFrame is empty,\n `apply` will use `reduce` to determine whether the result\n should be a Series or a DataFrame. If ``reduce=None`` (the\n default), `apply`'s return value will be guessed by calling\n `func` on an empty Series\n (note: while guessing, exceptions raised by `func` will be\n ignored).\n If ``reduce=True`` a Series will always be returned, and if\n ``reduce=False`` a DataFrame will always be returned.\n\n .. deprecated:: 0.23.0\n This argument will be removed in a future version, replaced\n by ``result_type='reduce'``.\n\n result_type : {'expand', 'reduce', 'broadcast', None}, default None\n These only act when ``axis=1`` (columns):\n\n * 'expand' : list-like results will be turned into columns.\n * 'reduce' : returns a Series if possible rather than expanding\n list-like results. 
This is the opposite of 'expand'.\n * 'broadcast' : results will be broadcast to the original shape\n of the DataFrame, the original index and columns will be\n retained.\n\n The default behaviour (None) depends on the return value of the\n applied function: list-like results will be returned as a Series\n of those. However if the apply function returns a Series these\n are expanded to columns.\n\n .. versionadded:: 0.23.0\n\n args : tuple\n Positional arguments to pass to `func` in addition to the\n array/series.\n **kwds\n Additional keyword arguments to pass as keywords arguments to\n `func`.\n\n Returns\n -------\n Series or DataFrame\n Result of applying ``func`` along the given axis of the\n DataFrame.\n\n See Also\n --------\n DataFrame.applymap: For elementwise operations.\n DataFrame.aggregate: Only perform aggregating type operations.\n DataFrame.transform: Only perform transforming type operations.\n\n Notes\n -----\n In the current implementation apply calls `func` twice on the\n first column/row to decide whether it can take a fast or slow\n code path. 
This can lead to unexpected behavior if `func` has\n side-effects, as they will take effect twice for the first\n column/row.\n\n Examples\n --------\n\n >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])\n >>> df\n A B\n 0 4 9\n 1 4 9\n 2 4 9\n\n Using a numpy universal function (in this case the same as\n ``np.sqrt(df)``):\n\n >>> df.apply(np.sqrt)\n A B\n 0 2.0 3.0\n 1 2.0 3.0\n 2 2.0 3.0\n\n Using a reducing function on either axis\n\n >>> df.apply(np.sum, axis=0)\n A 12\n B 27\n dtype: int64\n\n >>> df.apply(np.sum, axis=1)\n 0 13\n 1 13\n 2 13\n dtype: int64\n\n Retuning a list-like will result in a Series\n\n >>> df.apply(lambda x: [1, 2], axis=1)\n 0 [1, 2]\n 1 [1, 2]\n 2 [1, 2]\n dtype: object\n\n Passing result_type='expand' will expand list-like results\n to columns of a Dataframe\n\n >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand')\n 0 1\n 0 1 2\n 1 1 2\n 2 1 2\n\n Returning a Series inside the function is similar to passing\n ``result_type='expand'``. The resulting column names\n will be the Series index.\n\n >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)\n foo bar\n 0 1 2\n 1 1 2\n 2 1 2\n\n Passing ``result_type='broadcast'`` will ensure the same shape\n result, whether list-like or scalar is returned by the function,\n and broadcast it along the axis. 
The resulting column names will\n be the originals.\n\n >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')\n A B\n 0 1 2\n 1 1 2\n 2 1 2\n \"\"\"\n from pandas.core.apply import frame_apply\n op = frame_apply(self,\n func=func,\n axis=axis,\n broadcast=broadcast,\n raw=raw,\n reduce=reduce,\n result_type=result_type,\n args=args,\n kwds=kwds)\n return op.get_result()", "response": "Apply a function to each column or row of the DataFrame."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef applymap(self, func):\n\n # if we have a dtype == 'M8[ns]', provide boxed values\n def infer(x):\n if x.empty:\n return lib.map_infer(x, func)\n return lib.map_infer(x.astype(object).values, func)\n\n return self.apply(infer)", "response": "Applies a function to a DataFrame elementwise."} {"instruction": "Explain what the following Python 3 code does\ndef append(self, other, ignore_index=False,\n verify_integrity=False, sort=None):\n \"\"\"\n Append rows of `other` to the end of caller, returning a new object.\n\n Columns in `other` that are not in the caller are added as new columns.\n\n Parameters\n ----------\n other : DataFrame or Series/dict-like object, or list of these\n The data to append.\n ignore_index : boolean, default False\n If True, do not use the index labels.\n verify_integrity : boolean, default False\n If True, raise ValueError on creating index with duplicates.\n sort : boolean, default None\n Sort columns if the columns of `self` and `other` are not aligned.\n The default sorting is deprecated and will change to not-sorting\n in a future version of pandas. Explicitly pass ``sort=True`` to\n silence the warning and sort. Explicitly pass ``sort=False`` to\n silence the warning and not sort.\n\n .. 
versionadded:: 0.23.0\n\n Returns\n -------\n DataFrame\n\n See Also\n --------\n concat : General function to concatenate DataFrame, Series\n or Panel objects.\n\n Notes\n -----\n If a list of dict/series is passed and the keys are all contained in\n the DataFrame's index, the order of the columns in the resulting\n DataFrame will be unchanged.\n\n Iteratively appending rows to a DataFrame can be more computationally\n intensive than a single concatenate. A better solution is to append\n those rows to a list and then concatenate the list with the original\n DataFrame all at once.\n\n Examples\n --------\n\n >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))\n >>> df\n A B\n 0 1 2\n 1 3 4\n >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))\n >>> df.append(df2)\n A B\n 0 1 2\n 1 3 4\n 0 5 6\n 1 7 8\n\n With `ignore_index` set to True:\n\n >>> df.append(df2, ignore_index=True)\n A B\n 0 1 2\n 1 3 4\n 2 5 6\n 3 7 8\n\n The following, while not recommended methods for generating DataFrames,\n show two ways to generate a DataFrame from multiple data sources.\n\n Less efficient:\n\n >>> df = pd.DataFrame(columns=['A'])\n >>> for i in range(5):\n ... df = df.append({'A': i}, ignore_index=True)\n >>> df\n A\n 0 0\n 1 1\n 2 2\n 3 3\n 4 4\n\n More efficient:\n\n >>> pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)],\n ... 
ignore_index=True)\n A\n 0 0\n 1 1\n 2 2\n 3 3\n 4 4\n \"\"\"\n if isinstance(other, (Series, dict)):\n if isinstance(other, dict):\n other = Series(other)\n if other.name is None and not ignore_index:\n raise TypeError('Can only append a Series if ignore_index=True'\n ' or if the Series has a name')\n\n if other.name is None:\n index = None\n else:\n # other must have the same index name as self, otherwise\n # index name will be reset\n index = Index([other.name], name=self.index.name)\n\n idx_diff = other.index.difference(self.columns)\n try:\n combined_columns = self.columns.append(idx_diff)\n except TypeError:\n combined_columns = self.columns.astype(object).append(idx_diff)\n other = other.reindex(combined_columns, copy=False)\n other = DataFrame(other.values.reshape((1, len(other))),\n index=index,\n columns=combined_columns)\n other = other._convert(datetime=True, timedelta=True)\n if not self.columns.equals(combined_columns):\n self = self.reindex(columns=combined_columns)\n elif isinstance(other, list) and not isinstance(other[0], DataFrame):\n other = DataFrame(other)\n if (self.columns.get_indexer(other.columns) >= 0).all():\n other = other.reindex(columns=self.columns)\n\n from pandas.core.reshape.concat import concat\n if isinstance(other, (list, tuple)):\n to_concat = [self] + other\n else:\n to_concat = [self, other]\n return concat(to_concat, ignore_index=ignore_index,\n verify_integrity=verify_integrity,\n sort=sort)", "response": "Append rows of other to the end of self."} {"instruction": "How would you code a function in Python 3 to\njoin columns of another DataFrame. Join columns with `other` DataFrame either on index or on a key column. Efficiently join multiple DataFrame objects by index at once by passing a list. Parameters ---------- other : DataFrame, Series, or list of DataFrame Index should be similar to one of the columns in this one. 
If a Series is passed, its name attribute must be set, and that will be used as the column name in the resulting joined DataFrame. on : str, list of str, or array-like, optional Column or index level name(s) in the caller to join on the index in `other`, otherwise joins index-on-index. If multiple values given, the `other` DataFrame must have a MultiIndex. Can pass an array as the join key if it is not already contained in the calling DataFrame. Like an Excel VLOOKUP operation. how : {'left', 'right', 'outer', 'inner'}, default 'left' How to handle the operation of the two objects. * left: use calling frame's index (or column if on is specified) * right: use `other`'s index. * outer: form union of calling frame's index (or column if on is specified) with `other`'s index, and sort it. lexicographically. * inner: form intersection of calling frame's index (or column if on is specified) with `other`'s index, preserving the order of the calling's one. lsuffix : str, default '' Suffix to use from left frame's overlapping columns. rsuffix : str, default '' Suffix to use from right frame's overlapping columns. sort : bool, default False Order result DataFrame lexicographically by the join key. If False, the order of the join key depends on the join type (how keyword). Returns ------- DataFrame A dataframe containing columns from both the caller and `other`. See Also -------- DataFrame.merge : For column(s)-on-columns(s) operations. Notes ----- Parameters `on`, `lsuffix`, and `rsuffix` are not supported when passing a list of `DataFrame` objects. Support for specifying index levels as the `on` parameter was added in version 0.23.0. Examples -------- >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) >>> df key A 0 K0 A0 1 K1 A1 2 K2 A2 3 K3 A3 4 K4 A4 5 K5 A5 >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'], ... 
'B': ['B0', 'B1', 'B2']}) >>> other key B 0 K0 B0 1 K1 B1 2 K2 B2 Join DataFrames using their indexes. >>> df.join(other, lsuffix='_caller', rsuffix='_other') key_caller A key_other B 0 K0 A0 K0 B0 1 K1 A1 K1 B1 2 K2 A2 K2 B2 3 K3 A3 NaN NaN 4 K4 A4 NaN NaN 5 K5 A5 NaN NaN If we want to join using the key columns, we need to set key to be the index in both `df` and `other`. The joined DataFrame will have key as its index. >>> df.set_index('key').join(other.set_index('key')) A B key K0 A0 B0 K1 A1 B1 K2 A2 B2 K3 A3 NaN K4 A4 NaN K5 A5 NaN Another option to join using the key columns is to use the `on` parameter. DataFrame.join always uses `other`'s index but we can use any column in `df`. This method preserves the original DataFrame's index in the result. >>> df.join(other.set_index('key'), on='key') key A B 0 K0 A0 B0 1 K1 A1 B1 2 K2 A2 B2 3 K3 A3 NaN 4 K4 A4 NaN 5 K5 A5 NaN", "response": "def join(self, other, on=None, how='left', lsuffix='', rsuffix='',\n sort=False):\n \"\"\"\n Join columns of another DataFrame.\n\n Join columns with `other` DataFrame either on index or on a key\n column. Efficiently join multiple DataFrame objects by index at once by\n passing a list.\n\n Parameters\n ----------\n other : DataFrame, Series, or list of DataFrame\n Index should be similar to one of the columns in this one. If a\n Series is passed, its name attribute must be set, and that will be\n used as the column name in the resulting joined DataFrame.\n on : str, list of str, or array-like, optional\n Column or index level name(s) in the caller to join on the index\n in `other`, otherwise joins index-on-index. If multiple\n values given, the `other` DataFrame must have a MultiIndex. Can\n pass an array as the join key if it is not already contained in\n the calling DataFrame. 
Like an Excel VLOOKUP operation.\n how : {'left', 'right', 'outer', 'inner'}, default 'left'\n How to handle the operation of the two objects.\n\n * left: use calling frame's index (or column if on is specified)\n * right: use `other`'s index.\n * outer: form union of calling frame's index (or column if on is\n specified) with `other`'s index, and sort it.\n lexicographically.\n * inner: form intersection of calling frame's index (or column if\n on is specified) with `other`'s index, preserving the order\n of the calling's one.\n lsuffix : str, default ''\n Suffix to use from left frame's overlapping columns.\n rsuffix : str, default ''\n Suffix to use from right frame's overlapping columns.\n sort : bool, default False\n Order result DataFrame lexicographically by the join key. If False,\n the order of the join key depends on the join type (how keyword).\n\n Returns\n -------\n DataFrame\n A dataframe containing columns from both the caller and `other`.\n\n See Also\n --------\n DataFrame.merge : For column(s)-on-columns(s) operations.\n\n Notes\n -----\n Parameters `on`, `lsuffix`, and `rsuffix` are not supported when\n passing a list of `DataFrame` objects.\n\n Support for specifying index levels as the `on` parameter was added\n in version 0.23.0.\n\n Examples\n --------\n >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],\n ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})\n\n >>> df\n key A\n 0 K0 A0\n 1 K1 A1\n 2 K2 A2\n 3 K3 A3\n 4 K4 A4\n 5 K5 A5\n\n >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],\n ... 'B': ['B0', 'B1', 'B2']})\n\n >>> other\n key B\n 0 K0 B0\n 1 K1 B1\n 2 K2 B2\n\n Join DataFrames using their indexes.\n\n >>> df.join(other, lsuffix='_caller', rsuffix='_other')\n key_caller A key_other B\n 0 K0 A0 K0 B0\n 1 K1 A1 K1 B1\n 2 K2 A2 K2 B2\n 3 K3 A3 NaN NaN\n 4 K4 A4 NaN NaN\n 5 K5 A5 NaN NaN\n\n If we want to join using the key columns, we need to set key to be\n the index in both `df` and `other`. 
The joined DataFrame will have\n key as its index.\n\n >>> df.set_index('key').join(other.set_index('key'))\n A B\n key\n K0 A0 B0\n K1 A1 B1\n K2 A2 B2\n K3 A3 NaN\n K4 A4 NaN\n K5 A5 NaN\n\n Another option to join using the key columns is to use the `on`\n parameter. DataFrame.join always uses `other`'s index but we can use\n any column in `df`. This method preserves the original DataFrame's\n index in the result.\n\n >>> df.join(other.set_index('key'), on='key')\n key A B\n 0 K0 A0 B0\n 1 K1 A1 B1\n 2 K2 A2 B2\n 3 K3 A3 NaN\n 4 K4 A4 NaN\n 5 K5 A5 NaN\n \"\"\"\n # For SparseDataFrame's benefit\n return self._join_compat(other, on=on, how=how, lsuffix=lsuffix,\n rsuffix=rsuffix, sort=sort)"} {"instruction": "Can you create a Python 3 function that\nreturns a DataFrame with the affected columns rounded to the specified number of decimal places.", "response": "def round(self, decimals=0, *args, **kwargs):\n \"\"\"\n Round a DataFrame to a variable number of decimal places.\n\n Parameters\n ----------\n decimals : int, dict, Series\n Number of decimal places to round each column to. If an int is\n given, round each column to the same number of places.\n Otherwise dict and Series round to variable numbers of places.\n Column names should be in the keys if `decimals` is a\n dict-like, or in the index if `decimals` is a Series. Any\n columns not included in `decimals` will be left as is. 
Elements\n of `decimals` which are not columns of the input will be\n ignored.\n *args\n Additional keywords have no effect but might be accepted for\n compatibility with numpy.\n **kwargs\n Additional keywords have no effect but might be accepted for\n compatibility with numpy.\n\n Returns\n -------\n DataFrame\n A DataFrame with the affected columns rounded to the specified\n number of decimal places.\n\n See Also\n --------\n numpy.around : Round a numpy array to the given number of decimals.\n Series.round : Round a Series to the given number of decimals.\n\n Examples\n --------\n >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)],\n ... columns=['dogs', 'cats'])\n >>> df\n dogs cats\n 0 0.21 0.32\n 1 0.01 0.67\n 2 0.66 0.03\n 3 0.21 0.18\n\n By providing an integer each column is rounded to the same number\n of decimal places\n\n >>> df.round(1)\n dogs cats\n 0 0.2 0.3\n 1 0.0 0.7\n 2 0.7 0.0\n 3 0.2 0.2\n\n With a dict, the number of places for specific columns can be\n specfified with the column names as key and the number of decimal\n places as value\n\n >>> df.round({'dogs': 1, 'cats': 0})\n dogs cats\n 0 0.2 0.0\n 1 0.0 1.0\n 2 0.7 0.0\n 3 0.2 0.0\n\n Using a Series, the number of places for specific columns can be\n specfified with the column names as index and the number of\n decimal places as value\n\n >>> decimals = pd.Series([0, 1], index=['cats', 'dogs'])\n >>> df.round(decimals)\n dogs cats\n 0 0.2 0.0\n 1 0.0 1.0\n 2 0.7 0.0\n 3 0.2 0.0\n \"\"\"\n from pandas.core.reshape.concat import concat\n\n def _dict_round(df, decimals):\n for col, vals in df.iteritems():\n try:\n yield _series_round(vals, decimals[col])\n except KeyError:\n yield vals\n\n def _series_round(s, decimals):\n if is_integer_dtype(s) or is_float_dtype(s):\n return s.round(decimals)\n return s\n\n nv.validate_round(args, kwargs)\n\n if isinstance(decimals, (dict, Series)):\n if isinstance(decimals, Series):\n if not decimals.index.is_unique:\n raise 
ValueError(\"Index of decimals must be unique\")\n new_cols = [col for col in _dict_round(self, decimals)]\n elif is_integer(decimals):\n # Dispatch to Series.round\n new_cols = [_series_round(v, decimals)\n for _, v in self.iteritems()]\n else:\n raise TypeError(\"decimals must be an integer, a dict-like or a \"\n \"Series\")\n\n if len(new_cols) > 0:\n return self._constructor(concat(new_cols, axis=1),\n index=self.index,\n columns=self.columns)\n else:\n return self"} {"instruction": "Given the following Python 3 function, write the documentation\ndef corr(self, method='pearson', min_periods=1):\n numeric_df = self._get_numeric_data()\n cols = numeric_df.columns\n idx = cols.copy()\n mat = numeric_df.values\n\n if method == 'pearson':\n correl = libalgos.nancorr(ensure_float64(mat), minp=min_periods)\n elif method == 'spearman':\n correl = libalgos.nancorr_spearman(ensure_float64(mat),\n minp=min_periods)\n elif method == 'kendall' or callable(method):\n if min_periods is None:\n min_periods = 1\n mat = ensure_float64(mat).T\n corrf = nanops.get_corr_func(method)\n K = len(cols)\n correl = np.empty((K, K), dtype=float)\n mask = np.isfinite(mat)\n for i, ac in enumerate(mat):\n for j, bc in enumerate(mat):\n if i > j:\n continue\n\n valid = mask[i] & mask[j]\n if valid.sum() < min_periods:\n c = np.nan\n elif i == j:\n c = 1.\n elif not valid.all():\n c = corrf(ac[valid], bc[valid])\n else:\n c = corrf(ac, bc)\n correl[i, j] = c\n correl[j, i] = c\n else:\n raise ValueError(\"method must be either 'pearson', \"\n \"'spearman', 'kendall', or a callable, \"\n \"'{method}' was supplied\".format(method=method))\n\n return self._constructor(correl, index=idx, columns=cols)", "response": "Compute pairwise correlation of columns excluding NA values."} {"instruction": "Given the following Python 3 function, write the documentation\ndef corrwith(self, other, axis=0, drop=False, method='pearson'):\n axis = self._get_axis_number(axis)\n this = self._get_numeric_data()\n\n 
if isinstance(other, Series):\n return this.apply(lambda x: other.corr(x, method=method),\n axis=axis)\n\n other = other._get_numeric_data()\n left, right = this.align(other, join='inner', copy=False)\n\n if axis == 1:\n left = left.T\n right = right.T\n\n if method == 'pearson':\n # mask missing values\n left = left + right * 0\n right = right + left * 0\n\n # demeaned data\n ldem = left - left.mean()\n rdem = right - right.mean()\n\n num = (ldem * rdem).sum()\n dom = (left.count() - 1) * left.std() * right.std()\n\n correl = num / dom\n\n elif method in ['kendall', 'spearman'] or callable(method):\n def c(x):\n return nanops.nancorr(x[0], x[1], method=method)\n\n correl = Series(map(c,\n zip(left.values.T, right.values.T)),\n index=left.columns)\n\n else:\n raise ValueError(\"Invalid method {method} was passed, \"\n \"valid methods are: 'pearson', 'kendall', \"\n \"'spearman', or callable\".\n format(method=method))\n\n if not drop:\n # Find non-matching labels along the given axis\n # and append missing correlations (GH 22375)\n raxis = 1 if axis == 0 else 0\n result_index = (this._get_axis(raxis).\n union(other._get_axis(raxis)))\n idx_diff = result_index.difference(correl.index)\n\n if len(idx_diff) > 0:\n correl = correl.append(Series([np.nan] * len(idx_diff),\n index=idx_diff))\n\n return correl", "response": "Compute pairwise correlation between rows or columns of DataFrame and Series or DataFrame."} {"instruction": "Write a Python 3 script for\ncounting the number of non - NA cells for each column or row.", "response": "def count(self, axis=0, level=None, numeric_only=False):\n \"\"\"\n Count non-NA cells for each column or row.\n\n The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending\n on `pandas.options.mode.use_inf_as_na`) are considered NA.\n\n Parameters\n ----------\n axis : {0 or 'index', 1 or 'columns'}, default 0\n If 0 or 'index' counts are generated for each column.\n If 1 or 'columns' counts are generated for each **row**.\n 
level : int or str, optional\n If the axis is a `MultiIndex` (hierarchical), count along a\n particular `level`, collapsing into a `DataFrame`.\n A `str` specifies the level name.\n numeric_only : bool, default False\n Include only `float`, `int` or `boolean` data.\n\n Returns\n -------\n Series or DataFrame\n For each column/row the number of non-NA/null entries.\n If `level` is specified returns a `DataFrame`.\n\n See Also\n --------\n Series.count: Number of non-NA elements in a Series.\n DataFrame.shape: Number of DataFrame rows and columns (including NA\n elements).\n DataFrame.isna: Boolean same-sized DataFrame showing places of NA\n elements.\n\n Examples\n --------\n Constructing DataFrame from a dictionary:\n\n >>> df = pd.DataFrame({\"Person\":\n ... [\"John\", \"Myla\", \"Lewis\", \"John\", \"Myla\"],\n ... \"Age\": [24., np.nan, 21., 33, 26],\n ... \"Single\": [False, True, True, True, False]})\n >>> df\n Person Age Single\n 0 John 24.0 False\n 1 Myla NaN True\n 2 Lewis 21.0 True\n 3 John 33.0 True\n 4 Myla 26.0 False\n\n Notice the uncounted NA values:\n\n >>> df.count()\n Person 5\n Age 4\n Single 5\n dtype: int64\n\n Counts for each **row**:\n\n >>> df.count(axis='columns')\n 0 3\n 1 2\n 2 3\n 3 3\n 4 3\n dtype: int64\n\n Counts for one level of a `MultiIndex`:\n\n >>> df.set_index([\"Person\", \"Single\"]).count(level=\"Person\")\n Age\n Person\n John 2\n Lewis 1\n Myla 1\n \"\"\"\n axis = self._get_axis_number(axis)\n if level is not None:\n return self._count_level(level, axis=axis,\n numeric_only=numeric_only)\n\n if numeric_only:\n frame = self._get_numeric_data()\n else:\n frame = self\n\n # GH #423\n if len(frame._get_axis(axis)) == 0:\n result = Series(0, index=frame._get_agg_axis(axis))\n else:\n if frame._is_mixed_type or frame._data.any_extension_types:\n # the or any_extension_types is really only hit for single-\n # column frames with an extension array\n result = notna(frame).sum(axis=axis)\n else:\n # GH13407\n series_counts = 
notna(frame).sum(axis=axis)\n counts = series_counts.values\n result = Series(counts, index=frame._get_agg_axis(axis))\n\n return result.astype('int64')"} {"instruction": "Write a Python 3 function that can\nreturn Series with number of unique observations over requested axis.", "response": "def nunique(self, axis=0, dropna=True):\n \"\"\"\n Count distinct observations over requested axis.\n\n Return Series with number of distinct observations. Can ignore NaN\n values.\n\n .. versionadded:: 0.20.0\n\n Parameters\n ----------\n axis : {0 or 'index', 1 or 'columns'}, default 0\n The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for\n column-wise.\n dropna : bool, default True\n Don't include NaN in the counts.\n\n Returns\n -------\n Series\n\n See Also\n --------\n Series.nunique: Method nunique for Series.\n DataFrame.count: Count non-NA cells for each column or row.\n\n Examples\n --------\n >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]})\n >>> df.nunique()\n A 3\n B 1\n dtype: int64\n\n >>> df.nunique(axis=1)\n 0 1\n 1 2\n 2 2\n dtype: int64\n \"\"\"\n return self.apply(Series.nunique, axis=axis, dropna=dropna)"} {"instruction": "Implement a function in Python 3 to\nreturn index of first occurrence of minimum over requested axis.", "response": "def idxmin(self, axis=0, skipna=True):\n \"\"\"\n Return index of first occurrence of minimum over requested axis.\n NA/null values are excluded.\n\n Parameters\n ----------\n axis : {0 or 'index', 1 or 'columns'}, default 0\n 0 or 'index' for row-wise, 1 or 'columns' for column-wise\n skipna : boolean, default True\n Exclude NA/null values. 
If an entire row/column is NA, the result\n will be NA.\n\n Returns\n -------\n Series\n Indexes of minima along the specified axis.\n\n Raises\n ------\n ValueError\n * If the row/column is empty\n\n See Also\n --------\n Series.idxmin\n\n Notes\n -----\n This method is the DataFrame version of ``ndarray.argmin``.\n \"\"\"\n axis = self._get_axis_number(axis)\n indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna)\n index = self._get_axis(axis)\n result = [index[i] if i >= 0 else np.nan for i in indices]\n return Series(result, index=self._get_agg_axis(axis))"} {"instruction": "Write a Python 3 script to\nreturn the DataFrame columns or index of the DataFrame columns or index of the DataFrame.", "response": "def _get_agg_axis(self, axis_num):\n \"\"\"\n Let's be explicit about this.\n \"\"\"\n if axis_num == 0:\n return self.columns\n elif axis_num == 1:\n return self.index\n else:\n raise ValueError('Axis must be 0 or 1 (got %r)' % axis_num)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef mode(self, axis=0, numeric_only=False, dropna=True):\n data = self if not numeric_only else self._get_numeric_data()\n\n def f(s):\n return s.mode(dropna=dropna)\n\n return data.apply(f, axis=axis)", "response": "Return a DataFrame containing the mode of each element along the specified axis."} {"instruction": "How would you explain what the following Python 3 function does\ndef quantile(self, q=0.5, axis=0, numeric_only=True,\n interpolation='linear'):\n \"\"\"\n Return values at the given quantile over requested axis.\n\n Parameters\n ----------\n q : float or array-like, default 0.5 (50% quantile)\n Value between 0 <= q <= 1, the quantile(s) to compute.\n axis : {0, 1, 'index', 'columns'} (default 0)\n Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise.\n numeric_only : bool, default True\n If False, the quantile of datetime and timedelta data will be\n computed as well.\n interpolation : {'linear', 'lower', 
'higher', 'midpoint', 'nearest'}\n This optional parameter specifies the interpolation method to use,\n when the desired quantile lies between two data points `i` and `j`:\n\n * linear: `i + (j - i) * fraction`, where `fraction` is the\n fractional part of the index surrounded by `i` and `j`.\n * lower: `i`.\n * higher: `j`.\n * nearest: `i` or `j` whichever is nearest.\n * midpoint: (`i` + `j`) / 2.\n\n .. versionadded:: 0.18.0\n\n Returns\n -------\n Series or DataFrame\n\n If ``q`` is an array, a DataFrame will be returned where the\n index is ``q``, the columns are the columns of self, and the\n values are the quantiles.\n If ``q`` is a float, a Series will be returned where the\n index is the columns of self and the values are the quantiles.\n\n See Also\n --------\n core.window.Rolling.quantile: Rolling quantile.\n numpy.percentile: Numpy function to compute the percentile.\n\n Examples\n --------\n >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),\n ... columns=['a', 'b'])\n >>> df.quantile(.1)\n a 1.3\n b 3.7\n Name: 0.1, dtype: float64\n >>> df.quantile([.1, .5])\n a b\n 0.1 1.3 3.7\n 0.5 2.5 55.0\n\n Specifying `numeric_only=False` will also compute the quantile of\n datetime and timedelta data.\n\n >>> df = pd.DataFrame({'A': [1, 2],\n ... 'B': [pd.Timestamp('2010'),\n ... pd.Timestamp('2011')],\n ... 'C': [pd.Timedelta('1 days'),\n ... 
pd.Timedelta('2 days')]})\n >>> df.quantile(0.5, numeric_only=False)\n A 1.5\n B 2010-07-02 12:00:00\n C 1 days 12:00:00\n Name: 0.5, dtype: object\n \"\"\"\n self._check_percentile(q)\n\n data = self._get_numeric_data() if numeric_only else self\n axis = self._get_axis_number(axis)\n is_transposed = axis == 1\n\n if is_transposed:\n data = data.T\n\n result = data._data.quantile(qs=q,\n axis=1,\n interpolation=interpolation,\n transposed=is_transposed)\n\n if result.ndim == 2:\n result = self._constructor(result)\n else:\n result = self._constructor_sliced(result, name=q)\n\n if is_transposed:\n result = result.T\n\n return result", "response": "Return values at the given quantile over the given axis."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef to_timestamp(self, freq=None, how='start', axis=0, copy=True):\n new_data = self._data\n if copy:\n new_data = new_data.copy()\n\n axis = self._get_axis_number(axis)\n if axis == 0:\n new_data.set_axis(1, self.index.to_timestamp(freq=freq, how=how))\n elif axis == 1:\n new_data.set_axis(0, self.columns.to_timestamp(freq=freq, how=how))\n else: # pragma: no cover\n raise AssertionError('Axis must be 0 or 1. 
Got {ax!s}'.format(\n ax=axis))\n\n return self._constructor(new_data)", "response": "Cast to DatetimeIndex of timestamps at the beginning of period."} {"instruction": "How would you explain what the following Python 3 function does\ndef isin(self, values):\n if isinstance(values, dict):\n from pandas.core.reshape.concat import concat\n values = collections.defaultdict(list, values)\n return concat((self.iloc[:, [i]].isin(values[col])\n for i, col in enumerate(self.columns)), axis=1)\n elif isinstance(values, Series):\n if not values.index.is_unique:\n raise ValueError(\"cannot compute isin with \"\n \"a duplicate axis.\")\n return self.eq(values.reindex_like(self), axis='index')\n elif isinstance(values, DataFrame):\n if not (values.columns.is_unique and values.index.is_unique):\n raise ValueError(\"cannot compute isin with \"\n \"a duplicate axis.\")\n return self.eq(values.reindex_like(self))\n else:\n if not is_list_like(values):\n raise TypeError(\"only list-like or dict-like objects are \"\n \"allowed to be passed to DataFrame.isin(), \"\n \"you passed a \"\n \"{0!r}\".format(type(values).__name__))\n return DataFrame(\n algorithms.isin(self.values.ravel(),\n values).reshape(self.shape), self.index,\n self.columns)", "response": "Test whether each element in the DataFrame is contained in values."} {"instruction": "Given the following Python 3 function, write the documentation\ndef integer_array(values, dtype=None, copy=False):\n values, mask = coerce_to_array(values, dtype=dtype, copy=copy)\n return IntegerArray(values, mask)", "response": "Infer and return an integer array of the values."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef safe_cast(values, dtype, copy):\n\n try:\n return values.astype(dtype, casting='safe', copy=copy)\n except TypeError:\n\n casted = values.astype(dtype, copy=copy)\n if (casted == values).all():\n return casted\n\n raise TypeError(\"cannot safely cast non-equivalent {} to 
{}\".format(\n values.dtype, np.dtype(dtype)))", "response": "Safely cast the values to the dtype."} {"instruction": "Given the following Python 3 function, write the documentation\ndef coerce_to_array(values, dtype, mask=None, copy=False):\n # if values is integer numpy array, preserve it's dtype\n if dtype is None and hasattr(values, 'dtype'):\n if is_integer_dtype(values.dtype):\n dtype = values.dtype\n\n if dtype is not None:\n if (isinstance(dtype, str) and\n (dtype.startswith(\"Int\") or dtype.startswith(\"UInt\"))):\n # Avoid DeprecationWarning from NumPy about np.dtype(\"Int64\")\n # https://github.com/numpy/numpy/pull/7476\n dtype = dtype.lower()\n\n if not issubclass(type(dtype), _IntegerDtype):\n try:\n dtype = _dtypes[str(np.dtype(dtype))]\n except KeyError:\n raise ValueError(\"invalid dtype specified {}\".format(dtype))\n\n if isinstance(values, IntegerArray):\n values, mask = values._data, values._mask\n if dtype is not None:\n values = values.astype(dtype.numpy_dtype, copy=False)\n\n if copy:\n values = values.copy()\n mask = mask.copy()\n return values, mask\n\n values = np.array(values, copy=copy)\n if is_object_dtype(values):\n inferred_type = lib.infer_dtype(values, skipna=True)\n if inferred_type == 'empty':\n values = np.empty(len(values))\n values.fill(np.nan)\n elif inferred_type not in ['floating', 'integer',\n 'mixed-integer', 'mixed-integer-float']:\n raise TypeError(\"{} cannot be converted to an IntegerDtype\".format(\n values.dtype))\n\n elif not (is_integer_dtype(values) or is_float_dtype(values)):\n raise TypeError(\"{} cannot be converted to an IntegerDtype\".format(\n values.dtype))\n\n if mask is None:\n mask = isna(values)\n else:\n assert len(mask) == len(values)\n\n if not values.ndim == 1:\n raise TypeError(\"values must be a 1D list-like\")\n if not mask.ndim == 1:\n raise TypeError(\"mask must be a 1D list-like\")\n\n # infer dtype if needed\n if dtype is None:\n dtype = np.dtype('int64')\n else:\n dtype = dtype.type\n\n # 
if we are float, let's make sure that we can\n # safely cast\n\n # we copy as need to coerce here\n if mask.any():\n values = values.copy()\n values[mask] = 1\n values = safe_cast(values, dtype, copy=False)\n else:\n values = safe_cast(values, dtype, copy=False)\n\n return values, mask", "response": "Coerce the input values array to numpy arrays with a mask"} {"instruction": "Implement a Python 3 function for\nconstructing a new object from a string.", "response": "def construct_from_string(cls, string):\n \"\"\"\n Construction from a string, raise a TypeError if not\n possible\n \"\"\"\n if string == cls.name:\n return cls()\n raise TypeError(\"Cannot construct a '{}' from \"\n \"'{}'\".format(cls, string))"} {"instruction": "Create a Python 3 function for\ncoercing to an ndarray of the correct dtype.", "response": "def _coerce_to_ndarray(self):\n \"\"\"\n coerce to an ndarary of object dtype\n \"\"\"\n\n # TODO(jreback) make this better\n data = self._data.astype(object)\n data[self._mask] = self._na_value\n return data"} {"instruction": "Given the following Python 3 function, write the documentation\ndef astype(self, dtype, copy=True):\n\n # if we are astyping to an existing IntegerDtype we can fastpath\n if isinstance(dtype, _IntegerDtype):\n result = self._data.astype(dtype.numpy_dtype, copy=False)\n return type(self)(result, mask=self._mask, copy=False)\n\n # coerce\n data = self._coerce_to_ndarray()\n return astype_nansafe(data, dtype, copy=None)", "response": "Cast to a NumPy array or IntegerArray with the specified type."} {"instruction": "Create a Python 3 function for\nreturning a Series containing counts of each category.", "response": "def value_counts(self, dropna=True):\n \"\"\"\n Returns a Series containing counts of each category.\n\n Every category will have an entry, even those with a count of 0.\n\n Parameters\n ----------\n dropna : boolean, default True\n Don't include counts of NaN.\n\n Returns\n -------\n counts : Series\n\n See Also\n 
--------\n Series.value_counts\n\n \"\"\"\n\n from pandas import Index, Series\n\n # compute counts on the data with no nans\n data = self._data[~self._mask]\n value_counts = Index(data).value_counts()\n array = value_counts.values\n\n # TODO(extension)\n # if we have allow Index to hold an ExtensionArray\n # this is easier\n index = value_counts.index.astype(object)\n\n # if we want nans, count the mask\n if not dropna:\n\n # TODO(extension)\n # appending to an Index *always* infers\n # w/o passing the dtype\n array = np.append(array, [self._mask.sum()])\n index = Index(np.concatenate(\n [index.values,\n np.array([np.nan], dtype=object)]), dtype=object)\n\n return Series(array, index=index)"} {"instruction": "Make a summary of the following Python 3 code\ndef _values_for_argsort(self) -> np.ndarray:\n data = self._data.copy()\n data[self._mask] = data.min() - 1\n return data", "response": "Return values for sorting."} {"instruction": "Can you implement a function in Python 3 that\nmasks the result if necessary.", "response": "def _maybe_mask_result(self, result, mask, other, op_name):\n \"\"\"\n Parameters\n ----------\n result : array-like\n mask : array-like bool\n other : scalar or array-like\n op_name : str\n \"\"\"\n\n # may need to fill infs\n # and mask wraparound\n if is_float_dtype(result):\n mask |= (result == np.inf) | (result == -np.inf)\n\n # if we have a float operand we are by-definition\n # a float result\n # or our op is a divide\n if ((is_float_dtype(other) or is_float(other)) or\n (op_name in ['rtruediv', 'truediv', 'rdiv', 'div'])):\n result[mask] = np.nan\n return result\n\n return type(self)(result, mask, copy=False)"} {"instruction": "How would you code a function in Python 3 to\nreturn the length of a single non - tuple indexer which could be a slice", "response": "def length_of_indexer(indexer, target=None):\n \"\"\"\n return the length of a single non-tuple indexer which could be a slice\n \"\"\"\n if target is not None and 
isinstance(indexer, slice):\n target_len = len(target)\n start = indexer.start\n stop = indexer.stop\n step = indexer.step\n if start is None:\n start = 0\n elif start < 0:\n start += target_len\n if stop is None or stop > target_len:\n stop = target_len\n elif stop < 0:\n stop += target_len\n if step is None:\n step = 1\n elif step < 0:\n step = -step\n return (stop - start + step - 1) // step\n elif isinstance(indexer, (ABCSeries, Index, np.ndarray, list)):\n return len(indexer)\n elif not is_list_like_indexer(indexer):\n return 1\n raise AssertionError(\"cannot find the length of the indexer\")"} {"instruction": "Explain what the following Python 3 code does\ndef convert_to_index_sliceable(obj, key):\n idx = obj.index\n if isinstance(key, slice):\n return idx._convert_slice_indexer(key, kind='getitem')\n\n elif isinstance(key, str):\n\n # we are an actual column\n if obj._data.items.contains(key):\n return None\n\n # We might have a datetimelike string that we can translate to a\n # slice here via partial string indexing\n if idx.is_all_dates:\n try:\n return idx._get_string_slice(key)\n except (KeyError, ValueError, NotImplementedError):\n return None\n\n return None", "response": "Convert a key to an index sliceable object."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef check_setitem_lengths(indexer, value, values):\n # boolean with truth values == len of the value is ok too\n if isinstance(indexer, (np.ndarray, list)):\n if is_list_like(value) and len(indexer) != len(value):\n if not (isinstance(indexer, np.ndarray) and\n indexer.dtype == np.bool_ and\n len(indexer[indexer]) == len(value)):\n raise ValueError(\"cannot set using a list-like indexer \"\n \"with a different length than the value\")\n # slice\n elif isinstance(indexer, slice):\n\n if is_list_like(value) and len(values):\n if len(value) != length_of_indexer(indexer, values):\n raise ValueError(\"cannot set using a slice indexer with a \"\n \"different length 
than the value\")", "response": "Validate that the values and indexer are the same length."} {"instruction": "Explain what the following Python 3 code does\ndef convert_missing_indexer(indexer):\n\n if isinstance(indexer, dict):\n\n # a missing key (but not a tuple indexer)\n indexer = indexer['key']\n\n if isinstance(indexer, bool):\n raise KeyError(\"cannot use a single bool to index into setitem\")\n return indexer, True\n\n return indexer, False", "response": "reverse convert a missing indexer which is a dict\n return the scalar indexer and a boolean indicating if we converted\n "} {"instruction": "Can you tell what is the following Python 3 function doing\ndef convert_from_missing_indexer_tuple(indexer, axes):\n\n def get_indexer(_i, _idx):\n return (axes[_i].get_loc(_idx['key']) if isinstance(_idx, dict) else\n _idx)\n\n return tuple(get_indexer(_i, _idx) for _i, _idx in enumerate(indexer))", "response": "converts a tuple of indexers that doesn t have any indexers\n "} {"instruction": "Can you implement a function in Python 3 that\nattempts to convert indices into valid positive indices.", "response": "def maybe_convert_indices(indices, n):\n \"\"\"\n Attempt to convert indices into valid, positive indices.\n\n If we have negative indices, translate to positive here.\n If we have indices that are out-of-bounds, raise an IndexError.\n\n Parameters\n ----------\n indices : array-like\n The array of indices that we are to convert.\n n : int\n The number of elements in the array that we are indexing.\n\n Returns\n -------\n valid_indices : array-like\n An array-like of positive indices that correspond to the ones\n that were passed in initially to this function.\n\n Raises\n ------\n IndexError : one of the converted indices either exceeded the number\n of elements (specified by `n`) OR was still negative.\n \"\"\"\n\n if isinstance(indices, list):\n indices = np.array(indices)\n if len(indices) == 0:\n # If list is empty, np.array will return float and cause 
indexing\n # errors.\n return np.empty(0, dtype=np.intp)\n\n mask = indices < 0\n if mask.any():\n indices = indices.copy()\n indices[mask] += n\n\n mask = (indices >= n) | (indices < 0)\n if mask.any():\n raise IndexError(\"indices are out-of-bounds\")\n return indices"} {"instruction": "Write a Python 3 script to\nvalidate that the indices are within bounds.", "response": "def validate_indices(indices, n):\n \"\"\"\n Perform bounds-checking for an indexer.\n\n -1 is allowed for indicating missing values.\n\n Parameters\n ----------\n indices : ndarray\n n : int\n length of the array being indexed\n\n Raises\n ------\n ValueError\n\n Examples\n --------\n >>> validate_indices([1, 2], 3)\n # OK\n >>> validate_indices([1, -2], 3)\n ValueError\n >>> validate_indices([1, 2, 3], 3)\n IndexError\n >>> validate_indices([-1, -1], 0)\n # OK\n >>> validate_indices([0, 1], 0)\n IndexError\n \"\"\"\n if len(indices):\n min_idx = indices.min()\n if min_idx < -1:\n msg = (\"'indices' contains values less than allowed ({} < {})\"\n .format(min_idx, -1))\n raise ValueError(msg)\n\n max_idx = indices.max()\n if max_idx >= n:\n raise IndexError(\"indices are out-of-bounds\")"} {"instruction": "How would you explain what the following Python 3 function does\ndef _non_reducing_slice(slice_):\n # default to column slice, like DataFrame\n # ['A', 'B'] -> IndexSlices[:, ['A', 'B']]\n kinds = (ABCSeries, np.ndarray, Index, list, str)\n if isinstance(slice_, kinds):\n slice_ = IndexSlice[:, slice_]\n\n def pred(part):\n # true when slice does *not* reduce, False when part is a tuple,\n # i.e. 
MultiIndex slice\n return ((isinstance(part, slice) or is_list_like(part))\n and not isinstance(part, tuple))\n\n if not is_list_like(slice_):\n if not isinstance(slice_, slice):\n # a 1-d slice, like df.loc[1]\n slice_ = [[slice_]]\n else:\n # slice(a, b, c)\n slice_ = [slice_] # to tuplize later\n else:\n slice_ = [part if pred(part) else [part] for part in slice_]\n return tuple(slice_)", "response": "Returns a non - reduced version of a slice."} {"instruction": "How would you explain what the following Python 3 function does\ndef _maybe_numeric_slice(df, slice_, include_bool=False):\n if slice_ is None:\n dtypes = [np.number]\n if include_bool:\n dtypes.append(bool)\n slice_ = IndexSlice[:, df.select_dtypes(include=dtypes).columns]\n return slice_", "response": "If slice_ is None then we need to use that."} {"instruction": "Given the following Python 3 function, write the documentation\ndef _has_valid_tuple(self, key):\n for i, k in enumerate(key):\n if i >= self.obj.ndim:\n raise IndexingError('Too many indexers')\n try:\n self._validate_key(k, i)\n except ValueError:\n raise ValueError(\"Location based indexing can only have \"\n \"[{types}] types\"\n .format(types=self._valid_types))", "response": "check the key for valid keys across my indexer"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _has_valid_positional_setitem_indexer(self, indexer):\n if isinstance(indexer, dict):\n raise IndexError(\"{0} cannot enlarge its target object\"\n .format(self.name))\n else:\n if not isinstance(indexer, tuple):\n indexer = self._tuplify(indexer)\n for ax, i in zip(self.obj.axes, indexer):\n if isinstance(i, slice):\n # should check the stop slice?\n pass\n elif is_list_like_indexer(i):\n # should check the elements?\n pass\n elif is_integer(i):\n if i >= len(ax):\n raise IndexError(\"{name} cannot enlarge its target \"\n \"object\".format(name=self.name))\n elif isinstance(i, dict):\n raise IndexError(\"{name} cannot enlarge its 
target object\"\n .format(name=self.name))\n\n return True", "response": "validate that an indexer cannot enlarge its target object"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _align_series(self, indexer, ser, multiindex_indexer=False):\n if isinstance(indexer, (slice, np.ndarray, list, Index)):\n indexer = tuple([indexer])\n\n if isinstance(indexer, tuple):\n\n # flatten np.ndarray indexers\n def ravel(i):\n return i.ravel() if isinstance(i, np.ndarray) else i\n indexer = tuple(map(ravel, indexer))\n\n aligners = [not com.is_null_slice(idx) for idx in indexer]\n sum_aligners = sum(aligners)\n single_aligner = sum_aligners == 1\n is_frame = self.obj.ndim == 2\n is_panel = self.obj.ndim >= 3\n obj = self.obj\n\n # are we a single alignable value on a non-primary\n # dim (e.g. panel: 1,2, or frame: 0) ?\n # hence need to align to a single axis dimension\n # rather that find all valid dims\n\n # frame\n if is_frame:\n single_aligner = single_aligner and aligners[0]\n\n # panel\n elif is_panel:\n single_aligner = (single_aligner and\n (aligners[1] or aligners[2]))\n\n # we have a frame, with multiple indexers on both axes; and a\n # series, so need to broadcast (see GH5206)\n if (sum_aligners == self.ndim and\n all(is_sequence(_) for _ in indexer)):\n ser = ser.reindex(obj.axes[0][indexer[0]], copy=True)._values\n\n # single indexer\n if len(indexer) > 1 and not multiindex_indexer:\n len_indexer = len(indexer[1])\n ser = np.tile(ser, len_indexer).reshape(len_indexer, -1).T\n\n return ser\n\n for i, idx in enumerate(indexer):\n ax = obj.axes[i]\n\n # multiple aligners (or null slices)\n if is_sequence(idx) or isinstance(idx, slice):\n if single_aligner and com.is_null_slice(idx):\n continue\n new_ix = ax[idx]\n if not is_list_like_indexer(new_ix):\n new_ix = Index([new_ix])\n else:\n new_ix = Index(new_ix)\n if ser.index.equals(new_ix) or not len(new_ix):\n return ser._values.copy()\n\n return 
ser.reindex(new_ix)._values\n\n # 2 dims\n elif single_aligner and is_frame:\n\n # reindex along index\n ax = self.obj.axes[1]\n if ser.index.equals(ax) or not len(ax):\n return ser._values.copy()\n return ser.reindex(ax)._values\n\n # >2 dims\n elif single_aligner:\n\n broadcast = []\n for n, labels in enumerate(self.obj._get_plane_axes(i)):\n\n # reindex along the matching dimensions\n if len(labels & ser.index):\n ser = ser.reindex(labels)\n else:\n broadcast.append((n, len(labels)))\n\n # broadcast along other dims\n ser = ser._values.copy()\n for (axis, l) in broadcast:\n shape = [-1] * (len(broadcast) + 1)\n shape[axis] = l\n ser = np.tile(ser, l).reshape(shape)\n\n if self.obj.ndim == 3:\n ser = ser.T\n\n return ser\n\n elif is_scalar(indexer):\n ax = self.obj._get_axis(1)\n\n if ser.index.equals(ax):\n return ser._values.copy()\n\n return ser.reindex(ax)._values\n\n raise ValueError('Incompatible indexer with Series')", "response": "Aligns the series with the specified indexer."} {"instruction": "How would you code a function in Python 3 to\ncheck whether the current indexing can be used through _multi_take.", "response": "def _multi_take_opportunity(self, tup):\n \"\"\"\n Check whether there is the possibility to use ``_multi_take``.\n Currently the limit is that all axes being indexed must be indexed with\n list-likes.\n\n Parameters\n ----------\n tup : tuple\n Tuple of indexers, one per axis\n\n Returns\n -------\n boolean: Whether the current indexing can be passed through _multi_take\n \"\"\"\n if not all(is_list_like_indexer(x) for x in tup):\n return False\n\n # just too complicated\n if any(com.is_bool_indexer(x) for x in tup):\n return False\n\n return True"} {"instruction": "Write a Python 3 function that can\ncreate the indexers for the passed tuple of keys, and execute the take operation. This allows the take operation to be executed all at once - rather than once for each dimension - improving efficiency. 
Parameters ---------- tup : tuple Tuple of indexers, one per axis Returns ------- values: same type as the object being indexed", "response": "def _multi_take(self, tup):\n \"\"\"\n Create the indexers for the passed tuple of keys, and execute the take\n operation. This allows the take operation to be executed all at once -\n rather than once for each dimension - improving efficiency.\n\n Parameters\n ----------\n tup : tuple\n Tuple of indexers, one per axis\n\n Returns\n -------\n values: same type as the object being indexed\n \"\"\"\n # GH 836\n o = self.obj\n d = {axis: self._get_listlike_indexer(key, axis)\n for (key, axis) in zip(tup, o._AXIS_ORDERS)}\n return o._reindex_with_indexers(d, copy=True, allow_dups=True)"} {"instruction": "Create a Python 3 function to\ntransform a list - like of keys into a new index and an indexer.", "response": "def _get_listlike_indexer(self, key, axis, raise_missing=False):\n \"\"\"\n Transform a list-like of keys into a new index and an indexer.\n\n Parameters\n ----------\n key : list-like\n Target labels\n axis: int\n Dimension on which the indexing is being made\n raise_missing: bool\n Whether to raise a KeyError if some labels are not found. 
Will be\n removed in the future, and then this method will always behave as\n if raise_missing=True.\n\n Raises\n ------\n KeyError\n If at least one key was requested but none was found, and\n raise_missing=True.\n\n Returns\n -------\n keyarr: Index\n New index (coinciding with 'key' if the axis is unique)\n values : array-like\n An indexer for the return object; -1 denotes keys not found\n \"\"\"\n o = self.obj\n ax = o._get_axis(axis)\n\n # Have the index compute an indexer or return None\n # if it cannot handle:\n indexer, keyarr = ax._convert_listlike_indexer(key,\n kind=self.name)\n # We only act on all found values:\n if indexer is not None and (indexer != -1).all():\n self._validate_read_indexer(key, indexer, axis,\n raise_missing=raise_missing)\n return ax[indexer], indexer\n\n if ax.is_unique:\n # If we are trying to get actual keys from empty Series, we\n # patiently wait for a KeyError later on - otherwise, convert\n if len(ax) or not len(key):\n key = self._convert_for_reindex(key, axis)\n indexer = ax.get_indexer_for(key)\n keyarr = ax.reindex(keyarr)[0]\n else:\n keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr)\n\n self._validate_read_indexer(keyarr, indexer,\n o._get_axis_number(axis),\n raise_missing=raise_missing)\n return keyarr, indexer"} {"instruction": "How would you explain what the following Python 3 function does\ndef _getitem_iterable(self, key, axis=None):\n\n if axis is None:\n axis = self.axis or 0\n\n self._validate_key(key, axis)\n\n labels = self.obj._get_axis(axis)\n\n if com.is_bool_indexer(key):\n # A boolean indexer\n key = check_bool_indexer(labels, key)\n inds, = key.nonzero()\n return self.obj._take(inds, axis=axis)\n else:\n # A collection of keys\n keyarr, indexer = self._get_listlike_indexer(key, axis,\n raise_missing=False)\n return self.obj._reindex_with_indexers({axis: [keyarr, indexer]},\n copy=True, allow_dups=True)", "response": "Return a Series or scalar with the values of the specified key in the 
specified axis."} {"instruction": "How would you implement a function in Python 3 that\nvalidates that the indexer can be used to return a result.", "response": "def _validate_read_indexer(self, key, indexer, axis, raise_missing=False):\n \"\"\"\n Check that indexer can be used to return a result (e.g. at least one\n element was found, unless the list of keys was actually empty).\n\n Parameters\n ----------\n key : list-like\n Target labels (only used to show correct error message)\n indexer: array-like of booleans\n Indices corresponding to the key (with -1 indicating not found)\n axis: int\n Dimension on which the indexing is being made\n raise_missing: bool\n Whether to raise a KeyError if some labels are not found. Will be\n removed in the future, and then this method will always behave as\n if raise_missing=True.\n\n Raises\n ------\n KeyError\n If at least one key was requested but none was found, and\n raise_missing=True.\n \"\"\"\n\n ax = self.obj._get_axis(axis)\n\n if len(key) == 0:\n return\n\n # Count missing values:\n missing = (indexer < 0).sum()\n\n if missing:\n if missing == len(indexer):\n raise KeyError(\n \"None of [{key}] are in the [{axis}]\".format(\n key=key, axis=self.obj._get_axis_name(axis)))\n\n # We (temporarily) allow for some missing keys with .loc, except in\n # some cases (e.g. 
setting) in which \"raise_missing\" will be False\n if not(self.name == 'loc' and not raise_missing):\n not_found = list(set(key) - set(ax))\n raise KeyError(\"{} not in index\".format(not_found))\n\n # we skip the warning on Categorical/Interval\n # as this check is actually done (check for\n # non-missing values), but a bit later in the\n # code, so we want to avoid warning & then\n # just raising\n\n _missing_key_warning = textwrap.dedent(\"\"\"\n Passing list-likes to .loc or [] with any missing label will raise\n KeyError in the future, you can use .reindex() as an alternative.\n\n See the documentation here:\n https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike\"\"\") # noqa\n\n if not (ax.is_categorical() or ax.is_interval()):\n warnings.warn(_missing_key_warning,\n FutureWarning, stacklevel=6)"} {"instruction": "Explain what the following Python 3 code does\ndef _convert_to_indexer(self, obj, axis=None, is_setter=False,\n raise_missing=False):\n \"\"\"\n Convert indexing key into something we can use to do actual fancy\n indexing on an ndarray\n\n Examples\n ix[:5] -> slice(0, 5)\n ix[[1,2,3]] -> [1,2,3]\n ix[['foo', 'bar', 'baz']] -> [i, j, k] (indices of foo, bar, baz)\n\n Going by Zen of Python?\n 'In the face of ambiguity, refuse the temptation to guess.'\n raise AmbiguousIndexError with integer labels?\n - No, prefer label-based indexing\n \"\"\"\n if axis is None:\n axis = self.axis or 0\n\n labels = self.obj._get_axis(axis)\n\n if isinstance(obj, slice):\n return self._convert_slice_indexer(obj, axis)\n\n # try to find out correct indexer, if not type correct raise\n try:\n obj = self._convert_scalar_indexer(obj, axis)\n except TypeError:\n\n # but we will allow setting\n if is_setter:\n pass\n\n # see if we are positional in nature\n is_int_index = labels.is_integer()\n is_int_positional = is_integer(obj) and not is_int_index\n\n # if we are a label return me\n try:\n return labels.get_loc(obj)\n except 
LookupError:\n if isinstance(obj, tuple) and isinstance(labels, MultiIndex):\n if is_setter and len(obj) == labels.nlevels:\n return {'key': obj}\n raise\n except TypeError:\n pass\n except (ValueError):\n if not is_int_positional:\n raise\n\n # a positional\n if is_int_positional:\n\n # if we are setting and its not a valid location\n # its an insert which fails by definition\n if is_setter:\n\n # always valid\n if self.name == 'loc':\n return {'key': obj}\n\n # a positional\n if (obj >= self.obj.shape[axis] and\n not isinstance(labels, MultiIndex)):\n raise ValueError(\"cannot set by positional indexing with \"\n \"enlargement\")\n\n return obj\n\n if is_nested_tuple(obj, labels):\n return labels.get_locs(obj)\n\n elif is_list_like_indexer(obj):\n\n if com.is_bool_indexer(obj):\n obj = check_bool_indexer(labels, obj)\n inds, = obj.nonzero()\n return inds\n else:\n # When setting, missing keys are not allowed, even with .loc:\n kwargs = {'raise_missing': True if is_setter else\n raise_missing}\n return self._get_listlike_indexer(obj, axis, **kwargs)[1]\n else:\n try:\n return labels.get_loc(obj)\n except LookupError:\n # allow a not found key only if we are a setter\n if not is_list_like_indexer(obj) and is_setter:\n return {'key': obj}\n raise", "response": "Convert an object to an indexable object."} {"instruction": "Can you create a Python 3 function that\ntransforms a list of keys into a new array ready to be used as axis of .", "response": "def _convert_for_reindex(self, key, axis=None):\n \"\"\"\n Transform a list of keys into a new array ready to be used as axis of\n the object we return (e.g. 
including NaNs).\n\n Parameters\n ----------\n key : list-like\n Target labels\n axis: int\n Where the indexing is being made\n\n Returns\n -------\n list-like of labels\n \"\"\"\n\n if axis is None:\n axis = self.axis or 0\n labels = self.obj._get_axis(axis)\n\n if com.is_bool_indexer(key):\n key = check_bool_indexer(labels, key)\n return labels[key]\n\n if isinstance(key, Index):\n keyarr = labels._convert_index_indexer(key)\n else:\n # asarray can be unsafe, NumPy strings are weird\n keyarr = com.asarray_tuplesafe(key)\n\n if is_integer_dtype(keyarr):\n # Cast the indexer to uint64 if possible so\n # that the values returned from indexing are\n # also uint64.\n keyarr = labels._convert_arr_indexer(keyarr)\n\n if not labels.is_integer():\n keyarr = ensure_platform_int(keyarr)\n return labels.take(keyarr)\n\n return keyarr"} {"instruction": "How would you implement a function in Python 3 that\ntranslates any partial string timestamp matches in key returning the new key ( GH 10331", "response": "def _get_partial_string_timestamp_match_key(self, key, labels):\n \"\"\"Translate any partial string timestamp matches in key, returning the\n new key (GH 10331)\"\"\"\n if isinstance(labels, MultiIndex):\n if (isinstance(key, str) and labels.levels[0].is_all_dates):\n # Convert key '2016-01-01' to\n # ('2016-01-01'[, slice(None, None, None)]+)\n key = tuple([key] + [slice(None)] * (len(labels.levels) - 1))\n\n if isinstance(key, tuple):\n # Convert (..., '2016-01-01', ...) 
in tuple to\n # (..., slice('2016-01-01', '2016-01-01', None), ...)\n new_key = []\n for i, component in enumerate(key):\n if (isinstance(component, str) and\n labels.levels[i].is_all_dates):\n new_key.append(slice(component, component, None))\n else:\n new_key.append(component)\n key = tuple(new_key)\n\n return key"} {"instruction": "Write a Python 3 script to\nvalidate that the key is a valid position in the desired axis.", "response": "def _validate_integer(self, key, axis):\n \"\"\"\n Check that 'key' is a valid position in the desired axis.\n\n Parameters\n ----------\n key : int\n Requested position\n axis : int\n Desired axis\n\n Returns\n -------\n None\n\n Raises\n ------\n IndexError\n If 'key' is not a valid position in axis 'axis'\n \"\"\"\n\n len_axis = len(self.obj._get_axis(axis))\n if key >= len_axis or key < -len_axis:\n raise IndexError(\"single positional indexer is out-of-bounds\")"} {"instruction": "Write a Python 3 script for\nreturning Series values by list or array of integers", "response": "def _get_list_axis(self, key, axis=None):\n \"\"\"\n Return Series values by list or array of integers\n\n Parameters\n ----------\n key : list-like positional indexer\n axis : int (can only be zero)\n\n Returns\n -------\n Series object\n \"\"\"\n if axis is None:\n axis = self.axis or 0\n try:\n return self.obj._take(key, axis=axis)\n except IndexError:\n # re-raise with different error message\n raise IndexError(\"positional indexers are out-of-bounds\")"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _convert_to_indexer(self, obj, axis=None, is_setter=False):\n if axis is None:\n axis = self.axis or 0\n\n # make need to convert a float key\n if isinstance(obj, slice):\n return self._convert_slice_indexer(obj, axis)\n\n elif is_float(obj):\n return self._convert_scalar_indexer(obj, axis)\n\n try:\n self._validate_key(obj, axis)\n return obj\n except ValueError:\n raise ValueError(\"Can only index by location with 
\"\n \"a [{types}]\".format(types=self._valid_types))", "response": "converts an object to a key - value index"} {"instruction": "How would you explain what the following Python 3 function does\ndef _convert_key(self, key, is_setter=False):\n\n # allow arbitrary setting\n if is_setter:\n return list(key)\n\n for ax, i in zip(self.obj.axes, key):\n if ax.is_integer():\n if not is_integer(i):\n raise ValueError(\"At based indexing on an integer index \"\n \"can only have integer indexers\")\n else:\n if is_integer(i) and not ax.holds_integer():\n raise ValueError(\"At based indexing on an non-integer \"\n \"index can only have non-integer \"\n \"indexers\")\n return key", "response": "Convert the key to the correct type for the index."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _convert_key(self, key, is_setter=False):\n for a, i in zip(self.obj.axes, key):\n if not is_integer(i):\n raise ValueError(\"iAt based indexing can only have integer \"\n \"indexers\")\n return key", "response": "converts a key to a label"} {"instruction": "Can you create a Python 3 function that\ncreates and return a block manager from a dataframe of series columns index", "response": "def to_manager(sdf, columns, index):\n \"\"\" create and return the block manager from a dataframe of series,\n columns, index\n \"\"\"\n\n # from BlockManager perspective\n axes = [ensure_index(columns), ensure_index(index)]\n\n return create_block_manager_from_arrays(\n [sdf[c] for c in columns], columns, axes)"} {"instruction": "How would you code a function in Python 3 to\nconform a set of SparseSeries to a common SparseIndex corresponding to the locations where they all have data.", "response": "def homogenize(series_dict):\n \"\"\"\n Conform a set of SparseSeries (with NaN fill_value) to a common SparseIndex\n corresponding to the locations where they all have data\n\n Parameters\n ----------\n series_dict : dict or DataFrame\n\n Notes\n -----\n Using 
the dumbest algorithm I could think of. Should put some more thought\n into this\n\n Returns\n -------\n homogenized : dict of SparseSeries\n \"\"\"\n index = None\n\n need_reindex = False\n\n for _, series in series_dict.items():\n if not np.isnan(series.fill_value):\n raise TypeError('this method is only valid with NaN fill values')\n\n if index is None:\n index = series.sp_index\n elif not series.sp_index.equals(index):\n need_reindex = True\n index = index.intersect(series.sp_index)\n\n if need_reindex:\n output = {}\n for name, series in series_dict.items():\n if not series.sp_index.equals(index):\n series = series.sparse_reindex(index)\n\n output[name] = series\n else:\n output = series_dict\n\n return output"} {"instruction": "Create a Python 3 function for\nreturning the contents of the frame as a sparse SciPy COO matrix.", "response": "def to_coo(self):\n \"\"\"\n Return the contents of the frame as a sparse SciPy COO matrix.\n\n .. versionadded:: 0.20.0\n\n Returns\n -------\n coo_matrix : scipy.sparse.spmatrix\n If the caller is heterogeneous and contains booleans or objects,\n the result will be of dtype=object. See Notes.\n\n Notes\n -----\n The dtype will be the lowest-common-denominator type (implicit\n upcasting); that is to say if the dtypes (even of numeric types)\n are mixed, the one that accommodates all will be chosen.\n\n e.g. If the dtypes are float16 and float32, dtype will be upcast to\n float32. 
By numpy.find_common_type convention, mixing int64 and\n and uint64 will result in a float64 dtype.\n \"\"\"\n try:\n from scipy.sparse import coo_matrix\n except ImportError:\n raise ImportError('Scipy is not installed')\n\n dtype = find_common_type(self.dtypes)\n if isinstance(dtype, SparseDtype):\n dtype = dtype.subtype\n\n cols, rows, datas = [], [], []\n for col, name in enumerate(self):\n s = self[name]\n row = s.sp_index.to_int_index().indices\n cols.append(np.repeat(col, len(row)))\n rows.append(row)\n datas.append(s.sp_values.astype(dtype, copy=False))\n\n cols = np.concatenate(cols)\n rows = np.concatenate(rows)\n datas = np.concatenate(datas)\n return coo_matrix((datas, (rows, cols)), shape=self.shape)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _unpickle_sparse_frame_compat(self, state):\n series, cols, idx, fv, kind = state\n\n if not isinstance(cols, Index): # pragma: no cover\n from pandas.io.pickle import _unpickle_array\n columns = _unpickle_array(cols)\n else:\n columns = cols\n\n if not isinstance(idx, Index): # pragma: no cover\n from pandas.io.pickle import _unpickle_array\n index = _unpickle_array(idx)\n else:\n index = idx\n\n series_dict = DataFrame()\n for col, (sp_index, sp_values) in series.items():\n series_dict[col] = SparseSeries(sp_values, sparse_index=sp_index,\n fill_value=fv)\n\n self._data = to_manager(series_dict, columns, index)\n self._default_fill_value = fv\n self._default_kind = kind", "response": "Unpickle the state of a sparse DataFrame."} {"instruction": "Can you write a function in Python 3 where it\nconverts to dense DataFrame", "response": "def to_dense(self):\n \"\"\"\n Convert to dense DataFrame\n\n Returns\n -------\n df : DataFrame\n \"\"\"\n data = {k: v.to_dense() for k, v in self.items()}\n return DataFrame(data, index=self.index, columns=self.columns)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _apply_columns(self, 
func):\n\n new_data = {col: func(series)\n for col, series in self.items()}\n\n return self._constructor(\n data=new_data, index=self.index, columns=self.columns,\n default_fill_value=self.default_fill_value).__finalize__(self)", "response": "Returns a new SparseDataFrame applying func to each column of the DataFrame."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef copy(self, deep=True):\n result = super().copy(deep=deep)\n result._default_fill_value = self._default_fill_value\n result._default_kind = self._default_kind\n return result", "response": "Make a copy of this SparseDataFrame\n "} {"instruction": "Implement a Python 3 function for\nreturning the density of the non - sparse data points represented in the frame.", "response": "def density(self):\n \"\"\"\n Ratio of non-sparse points to total (dense) data points\n represented in the frame\n \"\"\"\n tot_nonsparse = sum(ser.sp_index.npoints\n for _, ser in self.items())\n tot = len(self.index) * len(self.columns)\n return tot_nonsparse / float(tot)"} {"instruction": "Write a Python 3 script for\nsanitizing a column of the data.", "response": "def _sanitize_column(self, key, value, **kwargs):\n \"\"\"\n Creates a new SparseArray from the input value.\n\n Parameters\n ----------\n key : object\n value : scalar, Series, or array-like\n kwargs : dict\n\n Returns\n -------\n sanitized_column : SparseArray\n\n \"\"\"\n def sp_maker(x, index=None):\n return SparseArray(x, index=index,\n fill_value=self._default_fill_value,\n kind=self._default_kind)\n if isinstance(value, SparseSeries):\n clean = value.reindex(self.index).as_sparse_array(\n fill_value=self._default_fill_value, kind=self._default_kind)\n\n elif isinstance(value, SparseArray):\n if len(value) != len(self.index):\n raise ValueError('Length of values does not match '\n 'length of index')\n clean = value\n\n elif hasattr(value, '__iter__'):\n if isinstance(value, Series):\n clean = value.reindex(self.index)\n if not 
isinstance(value, SparseSeries):\n clean = sp_maker(clean)\n else:\n if len(value) != len(self.index):\n raise ValueError('Length of values does not match '\n 'length of index')\n clean = sp_maker(value)\n\n # Scalar\n else:\n clean = sp_maker(value, self.index)\n\n # always return a SparseArray!\n return clean"} {"instruction": "How would you explain what the following Python 3 function does\ndef xs(self, key, axis=0, copy=False):\n if axis == 1:\n data = self[key]\n return data\n\n i = self.index.get_loc(key)\n data = self.take([i]).get_values()[0]\n return Series(data, index=self.columns)", "response": "Returns a row from the SparseDataFrame as a Series."} {"instruction": "Explain what the following Python 3 code does\ndef transpose(self, *args, **kwargs):\n nv.validate_transpose(args, kwargs)\n return self._constructor(\n self.values.T, index=self.columns, columns=self.index,\n default_fill_value=self._default_fill_value,\n default_kind=self._default_kind).__finalize__(self)", "response": "Returns a DataFrame with the rows / columns switched."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef cumsum(self, axis=0, *args, **kwargs):\n nv.validate_cumsum(args, kwargs)\n\n if axis is None:\n axis = self._stat_axis_number\n\n return self.apply(lambda x: x.cumsum(), axis=axis)", "response": "Return a SparseDataFrame of cumulative sums over requested axis."} {"instruction": "Make a summary of the following Python 3 code\ndef apply(self, func, axis=0, broadcast=None, reduce=None,\n result_type=None):\n \"\"\"\n Analogous to DataFrame.apply, for SparseDataFrame\n\n Parameters\n ----------\n func : function\n Function to apply to each column\n axis : {0, 1, 'index', 'columns'}\n broadcast : bool, default False\n For aggregation functions, return object of same size with values\n propagated\n\n .. 
deprecated:: 0.23.0\n This argument will be removed in a future version, replaced\n by result_type='broadcast'.\n\n reduce : boolean or None, default None\n Try to apply reduction procedures. If the DataFrame is empty,\n apply will use reduce to determine whether the result should be a\n Series or a DataFrame. If reduce is None (the default), apply's\n return value will be guessed by calling func an empty Series (note:\n while guessing, exceptions raised by func will be ignored). If\n reduce is True a Series will always be returned, and if False a\n DataFrame will always be returned.\n\n .. deprecated:: 0.23.0\n This argument will be removed in a future version, replaced\n by result_type='reduce'.\n\n result_type : {'expand', 'reduce', 'broadcast, None}\n These only act when axis=1 {columns}:\n\n * 'expand' : list-like results will be turned into columns.\n * 'reduce' : return a Series if possible rather than expanding\n list-like results. This is the opposite to 'expand'.\n * 'broadcast' : results will be broadcast to the original shape\n of the frame, the original index & columns will be retained.\n\n The default behaviour (None) depends on the return value of the\n applied function: list-like results will be returned as a Series\n of those. However if the apply function returns a Series these\n are expanded to columns.\n\n .. 
versionadded:: 0.23.0\n\n Returns\n -------\n applied : Series or SparseDataFrame\n \"\"\"\n if not len(self.columns):\n return self\n axis = self._get_axis_number(axis)\n\n if isinstance(func, np.ufunc):\n new_series = {}\n for k, v in self.items():\n applied = func(v)\n applied.fill_value = func(v.fill_value)\n new_series[k] = applied\n return self._constructor(\n new_series, index=self.index, columns=self.columns,\n default_fill_value=self._default_fill_value,\n default_kind=self._default_kind).__finalize__(self)\n\n from pandas.core.apply import frame_apply\n op = frame_apply(self,\n func=func,\n axis=axis,\n reduce=reduce,\n broadcast=broadcast,\n result_type=result_type)\n return op.get_result()", "response": "Analogous to DataFrame. apply for SparseDataFrame. apply"} {"instruction": "Implement a Python 3 function for\nconverting a conda package to its pip equivalent.", "response": "def conda_package_to_pip(package):\n \"\"\"\n Convert a conda package to its pip equivalent.\n\n In most cases they are the same, those are the exceptions:\n - Packages that should be excluded (in `EXCLUDE`)\n - Packages that should be renamed (in `RENAME`)\n - A package requiring a specific version, in conda is defined with a single\n equal (e.g. ``pandas=1.0``) and in pip with two (e.g. 
``pandas==1.0``)\n \"\"\"\n if package in EXCLUDE:\n return\n\n package = re.sub('(?<=[^<>])=', '==', package).strip()\n for compare in ('<=', '>=', '=='):\n if compare not in package:\n continue\n\n pkg, version = package.split(compare)\n\n if pkg in RENAME:\n return ''.join((RENAME[pkg], compare, version))\n\n break\n\n return package"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef main(conda_fname, pip_fname, compare=False):\n with open(conda_fname) as conda_fd:\n deps = yaml.safe_load(conda_fd)['dependencies']\n\n pip_deps = []\n for dep in deps:\n if isinstance(dep, str):\n conda_dep = conda_package_to_pip(dep)\n if conda_dep:\n pip_deps.append(conda_dep)\n elif isinstance(dep, dict) and len(dep) == 1 and 'pip' in dep:\n pip_deps += dep['pip']\n else:\n raise ValueError('Unexpected dependency {}'.format(dep))\n\n pip_content = '\\n'.join(pip_deps)\n\n if compare:\n with open(pip_fname) as pip_fd:\n return pip_content != pip_fd.read()\n else:\n with open(pip_fname, 'w') as pip_fd:\n pip_fd.write(pip_content)\n return False", "response": "This function generates the pip dependencies file from the conda file or compare that conda file they are synchronized."} {"instruction": "Explain what the following Python 3 code does\ndef maybe_convert_platform(values):\n\n if isinstance(values, (list, tuple)):\n values = construct_1d_object_array_from_listlike(list(values))\n if getattr(values, 'dtype', None) == np.object_:\n if hasattr(values, '_values'):\n values = values._values\n values = lib.maybe_convert_objects(values)\n\n return values", "response": "try to do platform conversion"} {"instruction": "Given the following Python 3 function, write the documentation\ndef is_nested_object(obj):\n\n if isinstance(obj, ABCSeries) and is_object_dtype(obj):\n\n if any(isinstance(v, ABCSeries) for v in obj.values):\n return True\n\n return False", "response": "Return a boolean if the object is a nested Series."} {"instruction": 
"Implement a function in Python 3 to\ntry to downcast the result to the specified dtype.", "response": "def maybe_downcast_to_dtype(result, dtype):\n \"\"\" try to cast to the specified dtype (e.g. convert back to bool/int\n or could be an astype of float64->float32\n \"\"\"\n\n if is_scalar(result):\n return result\n\n def trans(x):\n return x\n\n if isinstance(dtype, str):\n if dtype == 'infer':\n inferred_type = lib.infer_dtype(ensure_object(result.ravel()),\n skipna=False)\n if inferred_type == 'boolean':\n dtype = 'bool'\n elif inferred_type == 'integer':\n dtype = 'int64'\n elif inferred_type == 'datetime64':\n dtype = 'datetime64[ns]'\n elif inferred_type == 'timedelta64':\n dtype = 'timedelta64[ns]'\n\n # try to upcast here\n elif inferred_type == 'floating':\n dtype = 'int64'\n if issubclass(result.dtype.type, np.number):\n\n def trans(x): # noqa\n return x.round()\n else:\n dtype = 'object'\n\n if isinstance(dtype, str):\n dtype = np.dtype(dtype)\n\n try:\n\n # don't allow upcasts here (except if empty)\n if dtype.kind == result.dtype.kind:\n if (result.dtype.itemsize <= dtype.itemsize and\n np.prod(result.shape)):\n return result\n\n if is_bool_dtype(dtype) or is_integer_dtype(dtype):\n\n # if we don't have any elements, just astype it\n if not np.prod(result.shape):\n return trans(result).astype(dtype)\n\n # do a test on the first element, if it fails then we are done\n r = result.ravel()\n arr = np.array([r[0]])\n\n # if we have any nulls, then we are done\n if (isna(arr).any() or\n not np.allclose(arr, trans(arr).astype(dtype), rtol=0)):\n return result\n\n # a comparable, e.g. 
a Decimal may slip in here\n elif not isinstance(r[0], (np.integer, np.floating, np.bool, int,\n float, bool)):\n return result\n\n if (issubclass(result.dtype.type, (np.object_, np.number)) and\n notna(result).all()):\n new_result = trans(result).astype(dtype)\n try:\n if np.allclose(new_result, result, rtol=0):\n return new_result\n except Exception:\n\n # comparison of an object dtype with a number type could\n # hit here\n if (new_result == result).all():\n return new_result\n elif (issubclass(dtype.type, np.floating) and\n not is_bool_dtype(result.dtype)):\n return result.astype(dtype)\n\n # a datetimelike\n # GH12821, iNaT is casted to float\n elif dtype.kind in ['M', 'm'] and result.dtype.kind in ['i', 'f']:\n try:\n result = result.astype(dtype)\n except Exception:\n if dtype.tz:\n # convert to datetime and change timezone\n from pandas import to_datetime\n result = to_datetime(result).tz_localize('utc')\n result = result.tz_convert(dtype.tz)\n\n elif dtype.type == Period:\n # TODO(DatetimeArray): merge with previous elif\n from pandas.core.arrays import PeriodArray\n\n return PeriodArray(result, freq=dtype.freq)\n\n except Exception:\n pass\n\n return result"} {"instruction": "Make a summary of the following Python 3 code\ndef maybe_upcast_putmask(result, mask, other):\n\n if not isinstance(result, np.ndarray):\n raise ValueError(\"The result input must be a ndarray.\")\n\n if mask.any():\n # Two conversions for date-like dtypes that can't be done automatically\n # in np.place:\n # NaN -> NaT\n # integer or integer array -> date-like array\n if is_datetimelike(result.dtype):\n if is_scalar(other):\n if isna(other):\n other = result.dtype.type('nat')\n elif is_integer(other):\n other = np.array(other, dtype=result.dtype)\n elif is_integer_dtype(other):\n other = np.array(other, dtype=result.dtype)\n\n def changeit():\n\n # try to directly set by expanding our array to full\n # length of the boolean\n try:\n om = other[mask]\n om_at = 
om.astype(result.dtype)\n if (om == om_at).all():\n new_result = result.values.copy()\n new_result[mask] = om_at\n result[:] = new_result\n return result, False\n except Exception:\n pass\n\n # we are forced to change the dtype of the result as the input\n # isn't compatible\n r, _ = maybe_upcast(result, fill_value=other, copy=True)\n np.place(r, mask, other)\n\n return r, True\n\n # we want to decide whether place will work\n # if we have nans in the False portion of our mask then we need to\n # upcast (possibly), otherwise we DON't want to upcast (e.g. if we\n # have values, say integers, in the success portion then it's ok to not\n # upcast)\n new_dtype, _ = maybe_promote(result.dtype, other)\n if new_dtype != result.dtype:\n\n # we have a scalar or len 0 ndarray\n # and its nan and we are changing some values\n if (is_scalar(other) or\n (isinstance(other, np.ndarray) and other.ndim < 1)):\n if isna(other):\n return changeit()\n\n # we have an ndarray and the masking has nans in it\n else:\n\n if isna(other).any():\n return changeit()\n\n try:\n np.place(result, mask, other)\n except Exception:\n return changeit()\n\n return result, False", "response": "A safe version of putmask that potentially upcasts the result array."} {"instruction": "Make a summary of the following Python 3 code\ndef infer_dtype_from(val, pandas_dtype=False):\n if is_scalar(val):\n return infer_dtype_from_scalar(val, pandas_dtype=pandas_dtype)\n return infer_dtype_from_array(val, pandas_dtype=pandas_dtype)", "response": "interpret the dtype from a scalar or array"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef infer_dtype_from_scalar(val, pandas_dtype=False):\n\n dtype = np.object_\n\n # a 1-element ndarray\n if isinstance(val, np.ndarray):\n msg = \"invalid ndarray passed to infer_dtype_from_scalar\"\n if val.ndim != 0:\n raise ValueError(msg)\n\n dtype = val.dtype\n val = val.item()\n\n elif isinstance(val, str):\n\n # If we create an empty array using 
a string to infer\n # the dtype, NumPy will only allocate one character per entry\n # so this is kind of bad. Alternately we could use np.repeat\n # instead of np.empty (but then you still don't want things\n # coming out as np.str_!\n\n dtype = np.object_\n\n elif isinstance(val, (np.datetime64, datetime)):\n val = tslibs.Timestamp(val)\n if val is tslibs.NaT or val.tz is None:\n dtype = np.dtype('M8[ns]')\n else:\n if pandas_dtype:\n dtype = DatetimeTZDtype(unit='ns', tz=val.tz)\n else:\n # return datetimetz as object\n return np.object_, val\n val = val.value\n\n elif isinstance(val, (np.timedelta64, timedelta)):\n val = tslibs.Timedelta(val).value\n dtype = np.dtype('m8[ns]')\n\n elif is_bool(val):\n dtype = np.bool_\n\n elif is_integer(val):\n if isinstance(val, np.integer):\n dtype = type(val)\n else:\n dtype = np.int64\n\n elif is_float(val):\n if isinstance(val, np.floating):\n dtype = type(val)\n else:\n dtype = np.float64\n\n elif is_complex(val):\n dtype = np.complex_\n\n elif pandas_dtype:\n if lib.is_period(val):\n dtype = PeriodDtype(freq=val.freq)\n val = val.ordinal\n\n return dtype, val", "response": "interprets the dtype from a scalar value"} {"instruction": "Explain what the following Python 3 code does\ndef infer_dtype_from_array(arr, pandas_dtype=False):\n\n if isinstance(arr, np.ndarray):\n return arr.dtype, arr\n\n if not is_list_like(arr):\n arr = [arr]\n\n if pandas_dtype and is_extension_type(arr):\n return arr.dtype, arr\n\n elif isinstance(arr, ABCSeries):\n return arr.dtype, np.asarray(arr)\n\n # don't force numpy coerce with nan's\n inferred = lib.infer_dtype(arr, skipna=False)\n if inferred in ['string', 'bytes', 'unicode',\n 'mixed', 'mixed-integer']:\n return (np.object_, arr)\n\n arr = np.asarray(arr)\n return arr.dtype, arr", "response": "infer the dtype from a scalar or array"} {"instruction": "Create a Python 3 function for\ntrying to infer the dtype of an object.", "response": "def maybe_infer_dtype_type(element):\n \"\"\"Try 
to infer an object's dtype, for use in arithmetic ops\n\n Uses `element.dtype` if that's available.\n Objects implementing the iterator protocol are cast to a NumPy array,\n and from there the array's type is used.\n\n Parameters\n ----------\n element : object\n Possibly has a `.dtype` attribute, and possibly the iterator\n protocol.\n\n Returns\n -------\n tipo : type\n\n Examples\n --------\n >>> from collections import namedtuple\n >>> Foo = namedtuple(\"Foo\", \"dtype\")\n >>> maybe_infer_dtype_type(Foo(np.dtype(\"i8\")))\n numpy.int64\n \"\"\"\n tipo = None\n if hasattr(element, 'dtype'):\n tipo = element.dtype\n elif is_list_like(element):\n element = np.asarray(element)\n tipo = element.dtype\n return tipo"} {"instruction": "Write a Python 3 function that can\nprovide explicit type promotion and coercion of the values to the type of the object.", "response": "def maybe_upcast(values, fill_value=np.nan, dtype=None, copy=False):\n \"\"\" provide explicit type promotion and coercion\n\n Parameters\n ----------\n values : the ndarray that we want to maybe upcast\n fill_value : what we want to fill with\n dtype : if None, then use the dtype of the values, else coerce to this type\n copy : if True always make a copy even if no upcast is required\n \"\"\"\n\n if is_extension_type(values):\n if copy:\n values = values.copy()\n else:\n if dtype is None:\n dtype = values.dtype\n new_dtype, fill_value = maybe_promote(dtype, fill_value)\n if new_dtype != values.dtype:\n values = values.astype(new_dtype)\n elif copy:\n values = values.copy()\n\n return values, fill_value"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef invalidate_string_dtypes(dtype_set):\n non_string_dtypes = dtype_set - {np.dtype('S').type, np.dtype(' 1 and coerce:\n raise ValueError(\"Only one of 'datetime', 'numeric' or \"\n \"'timedelta' can be True when when coerce=True.\")\n\n if isinstance(values, (list, tuple)):\n # List or scalar\n values = np.array(values, 
dtype=np.object_)\n elif not hasattr(values, 'dtype'):\n values = np.array([values], dtype=np.object_)\n elif not is_object_dtype(values.dtype):\n # If not object, do not attempt conversion\n values = values.copy() if copy else values\n return values\n\n # If 1 flag is coerce, ensure 2 others are False\n if coerce:\n # Immediate return if coerce\n if datetime:\n from pandas import to_datetime\n return to_datetime(values, errors='coerce').to_numpy()\n elif timedelta:\n from pandas import to_timedelta\n return to_timedelta(values, errors='coerce').to_numpy()\n elif numeric:\n from pandas import to_numeric\n return to_numeric(values, errors='coerce')\n\n # Soft conversions\n if datetime:\n # GH 20380, when datetime is beyond year 2262, hence outside\n # bound of nanosecond-resolution 64-bit integers.\n try:\n values = lib.maybe_convert_objects(values,\n convert_datetime=datetime)\n except OutOfBoundsDatetime:\n pass\n\n if timedelta and is_object_dtype(values.dtype):\n # Object check to ensure only run if previous did not convert\n values = lib.maybe_convert_objects(values, convert_timedelta=timedelta)\n\n if numeric and is_object_dtype(values.dtype):\n try:\n converted = lib.maybe_convert_numeric(values, set(),\n coerce_numeric=True)\n # If all NaNs, then do not-alter\n values = converted if not isna(converted).all() else values\n values = values.copy() if copy else values\n except Exception:\n pass\n\n return values"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef maybe_infer_to_datetimelike(value, convert_dates=False):\n\n # TODO: why not timedelta?\n if isinstance(value, (ABCDatetimeIndex, ABCPeriodIndex,\n ABCDatetimeArray, ABCPeriodArray)):\n return value\n elif isinstance(value, ABCSeries):\n if isinstance(value._values, ABCDatetimeIndex):\n return value._values\n\n v = value\n\n if not is_list_like(v):\n v = [v]\n v = np.array(v, copy=False)\n\n # we only care about object dtypes\n if not is_object_dtype(v):\n return value\n\n 
shape = v.shape\n if not v.ndim == 1:\n v = v.ravel()\n\n if not len(v):\n return value\n\n def try_datetime(v):\n # safe coerce to datetime64\n try:\n # GH19671\n v = tslib.array_to_datetime(v,\n require_iso8601=True,\n errors='raise')[0]\n except ValueError:\n\n # we might have a sequence of the same-datetimes with tz's\n # if so coerce to a DatetimeIndex; if they are not the same,\n # then these stay as object dtype, xref GH19671\n try:\n from pandas._libs.tslibs import conversion\n from pandas import DatetimeIndex\n\n values, tz = conversion.datetime_to_datetime64(v)\n return DatetimeIndex(values).tz_localize(\n 'UTC').tz_convert(tz=tz)\n except (ValueError, TypeError):\n pass\n\n except Exception:\n pass\n\n return v.reshape(shape)\n\n def try_timedelta(v):\n # safe coerce to timedelta64\n\n # will try first with a string & object conversion\n from pandas import to_timedelta\n try:\n return to_timedelta(v)._ndarray_values.reshape(shape)\n except Exception:\n return v.reshape(shape)\n\n inferred_type = lib.infer_datetimelike_array(ensure_object(v))\n\n if inferred_type == 'date' and convert_dates:\n value = try_datetime(v)\n elif inferred_type == 'datetime':\n value = try_datetime(v)\n elif inferred_type == 'timedelta':\n value = try_timedelta(v)\n elif inferred_type == 'nat':\n\n # if all NaT, return as datetime\n if isna(v).all():\n value = try_datetime(v)\n else:\n\n # We have at least a NaT and a string\n # try timedelta first to avoid spurious datetime conversions\n # e.g. 
'00:00:01' is a timedelta but technically is also a datetime\n value = try_timedelta(v)\n if lib.infer_dtype(value, skipna=False) in ['mixed']:\n # cannot skip missing values, as NaT implies that the string\n # is actually a datetime\n value = try_datetime(v)\n\n return value", "response": "Try to infer the value of a node from a datetime - like object."} {"instruction": "Write a Python 3 function that can\ntry to cast the array or value to a datetimelike dtype converting float nan to iNaT", "response": "def maybe_cast_to_datetime(value, dtype, errors='raise'):\n \"\"\" try to cast the array/value to a datetimelike dtype, converting float\n nan to iNaT\n \"\"\"\n from pandas.core.tools.timedeltas import to_timedelta\n from pandas.core.tools.datetimes import to_datetime\n\n if dtype is not None:\n if isinstance(dtype, str):\n dtype = np.dtype(dtype)\n\n is_datetime64 = is_datetime64_dtype(dtype)\n is_datetime64tz = is_datetime64tz_dtype(dtype)\n is_timedelta64 = is_timedelta64_dtype(dtype)\n\n if is_datetime64 or is_datetime64tz or is_timedelta64:\n\n # Force the dtype if needed.\n msg = (\"The '{dtype}' dtype has no unit. 
\"\n \"Please pass in '{dtype}[ns]' instead.\")\n\n if is_datetime64 and not is_dtype_equal(dtype, _NS_DTYPE):\n if dtype.name in ('datetime64', 'datetime64[ns]'):\n if dtype.name == 'datetime64':\n raise ValueError(msg.format(dtype=dtype.name))\n dtype = _NS_DTYPE\n else:\n raise TypeError(\"cannot convert datetimelike to \"\n \"dtype [{dtype}]\".format(dtype=dtype))\n elif is_datetime64tz:\n\n # our NaT doesn't support tz's\n # this will coerce to DatetimeIndex with\n # a matching dtype below\n if is_scalar(value) and isna(value):\n value = [value]\n\n elif is_timedelta64 and not is_dtype_equal(dtype, _TD_DTYPE):\n if dtype.name in ('timedelta64', 'timedelta64[ns]'):\n if dtype.name == 'timedelta64':\n raise ValueError(msg.format(dtype=dtype.name))\n dtype = _TD_DTYPE\n else:\n raise TypeError(\"cannot convert timedeltalike to \"\n \"dtype [{dtype}]\".format(dtype=dtype))\n\n if is_scalar(value):\n if value == iNaT or isna(value):\n value = iNaT\n else:\n value = np.array(value, copy=False)\n\n # have a scalar array-like (e.g. NaT)\n if value.ndim == 0:\n value = iNaT\n\n # we have an array of datetime or timedeltas & nulls\n elif np.prod(value.shape) or not is_dtype_equal(value.dtype,\n dtype):\n try:\n if is_datetime64:\n value = to_datetime(value, errors=errors)\n # GH 25843: Remove tz information since the dtype\n # didn't specify one\n if value.tz is not None:\n value = value.tz_localize(None)\n value = value._values\n elif is_datetime64tz:\n # The string check can be removed once issue #13712\n # is solved. 
String data that is passed with a\n # datetime64tz is assumed to be naive which should\n # be localized to the timezone.\n is_dt_string = is_string_dtype(value)\n value = to_datetime(value, errors=errors).array\n if is_dt_string:\n # Strings here are naive, so directly localize\n value = value.tz_localize(dtype.tz)\n else:\n # Numeric values are UTC at this point,\n # so localize and convert\n value = (value.tz_localize('UTC')\n .tz_convert(dtype.tz))\n elif is_timedelta64:\n value = to_timedelta(value, errors=errors)._values\n except (AttributeError, ValueError, TypeError):\n pass\n\n # coerce datetimelike to object\n elif is_datetime64_dtype(value) and not is_datetime64_dtype(dtype):\n if is_object_dtype(dtype):\n if value.dtype != _NS_DTYPE:\n value = value.astype(_NS_DTYPE)\n ints = np.asarray(value).view('i8')\n return tslib.ints_to_pydatetime(ints)\n\n # we have a non-castable dtype that was passed\n raise TypeError('Cannot cast datetime64 to {dtype}'\n .format(dtype=dtype))\n\n else:\n\n is_array = isinstance(value, np.ndarray)\n\n # catch a datetime/timedelta that is not of ns variety\n # and no coercion specified\n if is_array and value.dtype.kind in ['M', 'm']:\n dtype = value.dtype\n\n if dtype.kind == 'M' and dtype != _NS_DTYPE:\n value = value.astype(_NS_DTYPE)\n\n elif dtype.kind == 'm' and dtype != _TD_DTYPE:\n value = to_timedelta(value)\n\n # only do this if we have an array and the dtype of the array is not\n # setup already we are not an integer/object, so don't bother with this\n # conversion\n elif not (is_array and not (issubclass(value.dtype.type, np.integer) or\n value.dtype == np.object_)):\n value = maybe_infer_to_datetimelike(value)\n\n return value"} {"instruction": "How would you implement a function in Python 3 that\nfinds a common data type among the given dtypes.", "response": "def find_common_type(types):\n \"\"\"\n Find a common data type among the given dtypes.\n\n Parameters\n ----------\n types : list of dtypes\n\n Returns\n 
-------\n pandas extension or numpy dtype\n\n See Also\n --------\n numpy.find_common_type\n\n \"\"\"\n\n if len(types) == 0:\n raise ValueError('no types given')\n\n first = types[0]\n\n # workaround for find_common_type([np.dtype('datetime64[ns]')] * 2)\n # => object\n if all(is_dtype_equal(first, t) for t in types[1:]):\n return first\n\n if any(isinstance(t, (PandasExtensionDtype, ExtensionDtype))\n for t in types):\n return np.object\n\n # take lowest unit\n if all(is_datetime64_dtype(t) for t in types):\n return np.dtype('datetime64[ns]')\n if all(is_timedelta64_dtype(t) for t in types):\n return np.dtype('timedelta64[ns]')\n\n # don't mix bool / int or float or complex\n # this is different from numpy, which casts bool with float/int as int\n has_bools = any(is_bool_dtype(t) for t in types)\n if has_bools:\n for t in types:\n if is_integer_dtype(t) or is_float_dtype(t) or is_complex_dtype(t):\n return np.object\n\n return np.find_common_type(types, [])"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef cast_scalar_to_array(shape, value, dtype=None):\n\n if dtype is None:\n dtype, fill_value = infer_dtype_from_scalar(value)\n else:\n fill_value = value\n\n values = np.empty(shape, dtype=dtype)\n values.fill(fill_value)\n\n return values", "response": "Coerce scalar value to array."} {"instruction": "Make a summary of the following Python 3 code\ndef construct_1d_arraylike_from_scalar(value, length, dtype):\n if is_datetime64tz_dtype(dtype):\n from pandas import DatetimeIndex\n subarr = DatetimeIndex([value] * length, dtype=dtype)\n elif is_categorical_dtype(dtype):\n from pandas import Categorical\n subarr = Categorical([value] * length, dtype=dtype)\n else:\n if not isinstance(dtype, (np.dtype, type(np.dtype))):\n dtype = dtype.dtype\n\n if length and is_integer_dtype(dtype) and isna(value):\n # coerce if we have nan for an integer dtype\n dtype = np.dtype('float64')\n elif isinstance(dtype, np.dtype) and dtype.kind in (\"U\", 
\"S\"):\n # we need to coerce to object dtype to avoid\n # to allow numpy to take our string as a scalar value\n dtype = object\n if not isna(value):\n value = to_str(value)\n\n subarr = np.empty(length, dtype=dtype)\n subarr.fill(value)\n\n return subarr", "response": "constructs a 1d array of specified shape and dtype with values"} {"instruction": "Write a Python 3 script to\ntransform any list - like object in a 1 - dimensional numpy array of dtype object .", "response": "def construct_1d_object_array_from_listlike(values):\n \"\"\"\n Transform any list-like object in a 1-dimensional numpy array of object\n dtype.\n\n Parameters\n ----------\n values : any iterable which has a len()\n\n Raises\n ------\n TypeError\n * If `values` does not have a len()\n\n Returns\n -------\n 1-dimensional numpy array of dtype object\n \"\"\"\n # numpy will try to interpret nested lists as further dimensions, hence\n # making a 1D array that contains list-likes is a bit tricky:\n result = np.empty(len(values), dtype='object')\n result[:] = values\n return result"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef construct_1d_ndarray_preserving_na(values, dtype=None, copy=False):\n subarr = np.array(values, dtype=dtype, copy=copy)\n\n if dtype is not None and dtype.kind in (\"U\", \"S\"):\n # GH-21083\n # We can't just return np.array(subarr, dtype='str') since\n # NumPy will convert the non-string objects into strings\n # Including NA values. 
Se we have to go\n # string -> object -> update NA, which requires an\n # additional pass over the data.\n na_values = isna(values)\n subarr2 = subarr.astype(object)\n subarr2[na_values] = np.asarray(values, dtype=object)[na_values]\n subarr = subarr2\n\n return subarr", "response": "Construct a new ndarray coercing values to dtype preserving NA."} {"instruction": "Make a summary of the following Python 3 code\ndef maybe_cast_to_integer_array(arr, dtype, copy=False):\n\n try:\n if not hasattr(arr, \"astype\"):\n casted = np.array(arr, dtype=dtype, copy=copy)\n else:\n casted = arr.astype(dtype, copy=copy)\n except OverflowError:\n raise OverflowError(\"The elements provided in the data cannot all be \"\n \"casted to the dtype {dtype}\".format(dtype=dtype))\n\n if np.array_equal(arr, casted):\n return casted\n\n # We do this casting to allow for proper\n # data and dtype checking.\n #\n # We didn't do this earlier because NumPy\n # doesn't handle `uint64` correctly.\n arr = np.asarray(arr)\n\n if is_unsigned_integer_dtype(dtype) and (arr < 0).any():\n raise OverflowError(\"Trying to coerce negative values \"\n \"to unsigned integers\")\n\n if is_integer_dtype(dtype) and (is_float_dtype(arr) or\n is_object_dtype(arr)):\n raise ValueError(\"Trying to coerce float values to integers\")", "response": "Casts an array to integer array."} {"instruction": "How would you code a function in Python 3 to\nmake a scatter plot from two DataFrame columns x y and by.", "response": "def scatter_plot(data, x, y, by=None, ax=None, figsize=None, grid=False,\n **kwargs):\n \"\"\"\n Make a scatter plot from two DataFrame columns\n\n Parameters\n ----------\n data : DataFrame\n x : Column name for the x-axis values\n y : Column name for the y-axis values\n ax : Matplotlib axis object\n figsize : A tuple (width, height) in inches\n grid : Setting this to True will show the grid\n kwargs : other plotting keyword arguments\n To be passed to scatter function\n\n Returns\n -------\n 
matplotlib.Figure\n \"\"\"\n import matplotlib.pyplot as plt\n\n kwargs.setdefault('edgecolors', 'none')\n\n def plot_group(group, ax):\n xvals = group[x].values\n yvals = group[y].values\n ax.scatter(xvals, yvals, **kwargs)\n ax.grid(grid)\n\n if by is not None:\n fig = _grouped_plot(plot_group, data, by=by, figsize=figsize, ax=ax)\n else:\n if ax is None:\n fig = plt.figure()\n ax = fig.add_subplot(111)\n else:\n fig = ax.get_figure()\n plot_group(data, ax)\n ax.set_ylabel(pprint_thing(y))\n ax.set_xlabel(pprint_thing(x))\n\n ax.grid(grid)\n\n return fig"} {"instruction": "Create a Python 3 function to\nmake a histogram of the DataFrame's. A `histogram`_ is a representation of the distribution of data. This function calls :meth:`matplotlib.pyplot.hist`, on each series in the DataFrame, resulting in one histogram per column. .. _histogram: https://en.wikipedia.org/wiki/Histogram Parameters ---------- data : DataFrame The pandas object holding the data. column : string or sequence If passed, will be used to limit data to a subset of columns. by : object, optional If passed, then used to form histograms for separate groups. grid : bool, default True Whether to show axis grid lines. xlabelsize : int, default None If specified changes the x-axis label size. xrot : float, default None Rotation of x axis labels. For example, a value of 90 displays the x labels rotated 90 degrees clockwise. ylabelsize : int, default None If specified changes the y-axis label size. yrot : float, default None Rotation of y axis labels. For example, a value of 90 displays the y labels rotated 90 degrees clockwise. ax : Matplotlib axes object, default None The axes to plot the histogram on. sharex : bool, default True if ax is None else False In case subplots=True, share x axis and set some x axis labels to invisible; defaults to True if ax is None otherwise False if an ax is passed in. Note that passing in both an ax and sharex=True will alter all x axis labels for all subplots in a figure. 
sharey : bool, default False In case subplots=True, share y axis and set some y axis labels to invisible. figsize : tuple The size in inches of the figure to create. Uses the value in `matplotlib.rcParams` by default. layout : tuple, optional Tuple of (rows, columns) for the layout of the histograms. bins : integer or sequence, default 10 Number of histogram bins to be used. If an integer is given, bins + 1 bin edges are calculated and returned. If bins is a sequence, gives bin edges, including left edge of first bin and right edge of last bin. In this case, bins is returned unmodified. **kwds All other plotting keyword arguments to be passed to :meth:`matplotlib.pyplot.hist`. Returns ------- matplotlib.AxesSubplot or numpy.ndarray of them See Also -------- matplotlib.pyplot.hist : Plot a histogram using matplotlib. Examples -------- .. plot:: :context: close-figs This example draws a histogram based on the length and width of some animals, displayed in three bins >>> df = pd.DataFrame({ ... 'length': [1.5, 0.5, 1.2, 0.9, 3], ... 'width': [0.7, 0.2, 0.15, 0.2, 1.1] ... }, index= ['pig', 'rabbit', 'duck', 'chicken', 'horse']) >>> hist = df.hist(bins=3)", "response": "def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None,\n xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,\n sharey=False, figsize=None, layout=None, bins=10, **kwds):\n \"\"\"\n Make a histogram of the DataFrame's.\n\n A `histogram`_ is a representation of the distribution of data.\n This function calls :meth:`matplotlib.pyplot.hist`, on each series in\n the DataFrame, resulting in one histogram per column.\n\n .. 
_histogram: https://en.wikipedia.org/wiki/Histogram\n\n Parameters\n ----------\n data : DataFrame\n The pandas object holding the data.\n column : string or sequence\n If passed, will be used to limit data to a subset of columns.\n by : object, optional\n If passed, then used to form histograms for separate groups.\n grid : bool, default True\n Whether to show axis grid lines.\n xlabelsize : int, default None\n If specified changes the x-axis label size.\n xrot : float, default None\n Rotation of x axis labels. For example, a value of 90 displays the\n x labels rotated 90 degrees clockwise.\n ylabelsize : int, default None\n If specified changes the y-axis label size.\n yrot : float, default None\n Rotation of y axis labels. For example, a value of 90 displays the\n y labels rotated 90 degrees clockwise.\n ax : Matplotlib axes object, default None\n The axes to plot the histogram on.\n sharex : bool, default True if ax is None else False\n In case subplots=True, share x axis and set some x axis labels to\n invisible; defaults to True if ax is None otherwise False if an ax\n is passed in.\n Note that passing in both an ax and sharex=True will alter all x axis\n labels for all subplots in a figure.\n sharey : bool, default False\n In case subplots=True, share y axis and set some y axis labels to\n invisible.\n figsize : tuple\n The size in inches of the figure to create. Uses the value in\n `matplotlib.rcParams` by default.\n layout : tuple, optional\n Tuple of (rows, columns) for the layout of the histograms.\n bins : integer or sequence, default 10\n Number of histogram bins to be used. If an integer is given, bins + 1\n bin edges are calculated and returned. If bins is a sequence, gives\n bin edges, including left edge of first bin and right edge of last\n bin. 
In this case, bins is returned unmodified.\n **kwds\n All other plotting keyword arguments to be passed to\n :meth:`matplotlib.pyplot.hist`.\n\n Returns\n -------\n matplotlib.AxesSubplot or numpy.ndarray of them\n\n See Also\n --------\n matplotlib.pyplot.hist : Plot a histogram using matplotlib.\n\n Examples\n --------\n\n .. plot::\n :context: close-figs\n\n This example draws a histogram based on the length and width of\n some animals, displayed in three bins\n\n >>> df = pd.DataFrame({\n ... 'length': [1.5, 0.5, 1.2, 0.9, 3],\n ... 'width': [0.7, 0.2, 0.15, 0.2, 1.1]\n ... }, index= ['pig', 'rabbit', 'duck', 'chicken', 'horse'])\n >>> hist = df.hist(bins=3)\n \"\"\"\n _raise_if_no_mpl()\n _converter._WARN = False\n if by is not None:\n axes = grouped_hist(data, column=column, by=by, ax=ax, grid=grid,\n figsize=figsize, sharex=sharex, sharey=sharey,\n layout=layout, bins=bins, xlabelsize=xlabelsize,\n xrot=xrot, ylabelsize=ylabelsize,\n yrot=yrot, **kwds)\n return axes\n\n if column is not None:\n if not isinstance(column, (list, np.ndarray, ABCIndexClass)):\n column = [column]\n data = data[column]\n data = data._get_numeric_data()\n naxes = len(data.columns)\n\n fig, axes = _subplots(naxes=naxes, ax=ax, squeeze=False,\n sharex=sharex, sharey=sharey, figsize=figsize,\n layout=layout)\n _axes = _flatten(axes)\n\n for i, col in enumerate(com.try_sort(data.columns)):\n ax = _axes[i]\n ax.hist(data[col].dropna().values, bins=bins, **kwds)\n ax.set_title(col)\n ax.grid(grid)\n\n _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot,\n ylabelsize=ylabelsize, yrot=yrot)\n fig.subplots_adjust(wspace=0.3, hspace=0.3)\n\n return axes"} {"instruction": "Make a summary of the following Python 3 code\ndef hist_series(self, by=None, ax=None, grid=True, xlabelsize=None,\n xrot=None, ylabelsize=None, yrot=None, figsize=None,\n bins=10, **kwds):\n \"\"\"\n Draw histogram of the input series using matplotlib.\n\n Parameters\n ----------\n by : object, optional\n If passed, 
then used to form histograms for separate groups\n ax : matplotlib axis object\n If not passed, uses gca()\n grid : bool, default True\n Whether to show axis grid lines\n xlabelsize : int, default None\n If specified changes the x-axis label size\n xrot : float, default None\n rotation of x axis labels\n ylabelsize : int, default None\n If specified changes the y-axis label size\n yrot : float, default None\n rotation of y axis labels\n figsize : tuple, default None\n figure size in inches by default\n bins : integer or sequence, default 10\n Number of histogram bins to be used. If an integer is given, bins + 1\n bin edges are calculated and returned. If bins is a sequence, gives\n bin edges, including left edge of first bin and right edge of last\n bin. In this case, bins is returned unmodified.\n bins : integer, default 10\n Number of histogram bins to be used\n `**kwds` : keywords\n To be passed to the actual plotting function\n\n See Also\n --------\n matplotlib.axes.Axes.hist : Plot a histogram using matplotlib.\n \"\"\"\n import matplotlib.pyplot as plt\n\n if by is None:\n if kwds.get('layout', None) is not None:\n raise ValueError(\"The 'layout' keyword is not supported when \"\n \"'by' is None\")\n # hack until the plotting interface is a bit more unified\n fig = kwds.pop('figure', plt.gcf() if plt.get_fignums() else\n plt.figure(figsize=figsize))\n if (figsize is not None and tuple(figsize) !=\n tuple(fig.get_size_inches())):\n fig.set_size_inches(*figsize, forward=True)\n if ax is None:\n ax = fig.gca()\n elif ax.get_figure() != fig:\n raise AssertionError('passed axis not bound to passed figure')\n values = self.dropna().values\n\n ax.hist(values, bins=bins, **kwds)\n ax.grid(grid)\n axes = np.array([ax])\n\n _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot,\n ylabelsize=ylabelsize, yrot=yrot)\n\n else:\n if 'figure' in kwds:\n raise ValueError(\"Cannot pass 'figure' when using the \"\n \"'by' argument, since a new 'Figure' instance \"\n \"will 
be created\")\n axes = grouped_hist(self, by=by, ax=ax, grid=grid, figsize=figsize,\n bins=bins, xlabelsize=xlabelsize, xrot=xrot,\n ylabelsize=ylabelsize, yrot=yrot, **kwds)\n\n if hasattr(axes, 'ndim'):\n if axes.ndim == 1 and len(axes) == 1:\n return axes[0]\n return axes", "response": "Plots a histogram of the input series using matplotlib."} {"instruction": "Create a Python 3 function to\nplot a grouped histogram of the data.", "response": "def grouped_hist(data, column=None, by=None, ax=None, bins=50, figsize=None,\n layout=None, sharex=False, sharey=False, rot=90, grid=True,\n xlabelsize=None, xrot=None, ylabelsize=None, yrot=None,\n **kwargs):\n \"\"\"\n Grouped histogram\n\n Parameters\n ----------\n data : Series/DataFrame\n column : object, optional\n by : object, optional\n ax : axes, optional\n bins : int, default 50\n figsize : tuple, optional\n layout : optional\n sharex : bool, default False\n sharey : bool, default False\n rot : int, default 90\n grid : bool, default True\n kwargs : dict, keyword arguments passed to matplotlib.Axes.hist\n\n Returns\n -------\n collection of Matplotlib Axes\n \"\"\"\n _raise_if_no_mpl()\n _converter._WARN = False\n\n def plot_group(group, ax):\n ax.hist(group.dropna().values, bins=bins, **kwargs)\n\n xrot = xrot or rot\n\n fig, axes = _grouped_plot(plot_group, data, column=column,\n by=by, sharex=sharex, sharey=sharey, ax=ax,\n figsize=figsize, layout=layout, rot=rot)\n\n _set_ticks_props(axes, xlabelsize=xlabelsize, xrot=xrot,\n ylabelsize=ylabelsize, yrot=yrot)\n\n fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9,\n hspace=0.5, wspace=0.3)\n return axes"} {"instruction": "Create a Python 3 function to\ngroup by function for a set of dataframes.", "response": "def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None,\n rot=0, grid=True, ax=None, figsize=None,\n layout=None, sharex=False, sharey=True, **kwds):\n \"\"\"\n Make box plots from DataFrameGroupBy data.\n\n Parameters\n 
----------\n grouped : Grouped DataFrame\n subplots : bool\n * ``False`` - no subplots will be used\n * ``True`` - create a subplot for each group\n column : column name or list of names, or vector\n Can be any valid input to groupby\n fontsize : int or string\n rot : label rotation angle\n grid : Setting this to True will show the grid\n ax : Matplotlib axis object, default None\n figsize : A tuple (width, height) in inches\n layout : tuple (optional)\n (rows, columns) for the layout of the plot\n sharex : bool, default False\n Whether x-axes will be shared among subplots\n\n .. versionadded:: 0.23.1\n sharey : bool, default True\n Whether y-axes will be shared among subplots\n\n .. versionadded:: 0.23.1\n `**kwds` : Keyword Arguments\n All other plotting keyword arguments to be passed to\n matplotlib's boxplot function\n\n Returns\n -------\n dict of key/value = group key/DataFrame.boxplot return value\n or DataFrame.boxplot return value in case subplots=figures=False\n\n Examples\n --------\n >>> import itertools\n >>> tuples = [t for t in itertools.product(range(1000), range(4))]\n >>> index = pd.MultiIndex.from_tuples(tuples, names=['lvl0', 'lvl1'])\n >>> data = np.random.randn(len(index),4)\n >>> df = pd.DataFrame(data, columns=list('ABCD'), index=index)\n >>>\n >>> grouped = df.groupby(level='lvl1')\n >>> boxplot_frame_groupby(grouped)\n >>>\n >>> grouped = df.unstack(level='lvl1').groupby(level=0, axis=1)\n >>> boxplot_frame_groupby(grouped, subplots=False)\n \"\"\"\n _raise_if_no_mpl()\n _converter._WARN = False\n if subplots is True:\n naxes = len(grouped)\n fig, axes = _subplots(naxes=naxes, squeeze=False,\n ax=ax, sharex=sharex, sharey=sharey,\n figsize=figsize, layout=layout)\n axes = _flatten(axes)\n\n from pandas.core.series import Series\n ret = Series()\n for (key, group), ax in zip(grouped, axes):\n d = group.boxplot(ax=ax, column=column, fontsize=fontsize,\n rot=rot, grid=grid, **kwds)\n ax.set_title(pprint_thing(key))\n ret.loc[key] = d\n 
fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1,\n right=0.9, wspace=0.2)\n else:\n from pandas.core.reshape.concat import concat\n keys, frames = zip(*grouped)\n if grouped.axis == 0:\n df = concat(frames, keys=keys, axis=1)\n else:\n if len(frames) > 1:\n df = frames[0].join(frames[1::])\n else:\n df = frames[0]\n ret = df.boxplot(column=column, fontsize=fontsize, rot=rot,\n grid=grid, ax=ax, figsize=figsize,\n layout=layout, **kwds)\n return ret"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _has_plotted_object(self, ax):\n return (len(ax.lines) != 0 or\n len(ax.artists) != 0 or\n len(ax.containers) != 0)", "response": "check whether ax has data"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _post_plot_logic_common(self, ax, data):\n\n def get_label(i):\n try:\n return pprint_thing(data.index[i])\n except Exception:\n return ''\n\n if self.orientation == 'vertical' or self.orientation is None:\n if self._need_to_set_index:\n xticklabels = [get_label(x) for x in ax.get_xticks()]\n ax.set_xticklabels(xticklabels)\n self._apply_axis_properties(ax.xaxis, rot=self.rot,\n fontsize=self.fontsize)\n self._apply_axis_properties(ax.yaxis, fontsize=self.fontsize)\n\n if hasattr(ax, 'right_ax'):\n self._apply_axis_properties(ax.right_ax.yaxis,\n fontsize=self.fontsize)\n\n elif self.orientation == 'horizontal':\n if self._need_to_set_index:\n yticklabels = [get_label(y) for y in ax.get_yticks()]\n ax.set_yticklabels(yticklabels)\n self._apply_axis_properties(ax.yaxis, rot=self.rot,\n fontsize=self.fontsize)\n self._apply_axis_properties(ax.xaxis, fontsize=self.fontsize)\n\n if hasattr(ax, 'right_ax'):\n self._apply_axis_properties(ax.right_ax.yaxis,\n fontsize=self.fontsize)\n else: # pragma no cover\n raise ValueError", "response": "Common post processing for each axes"} {"instruction": "Explain what the following Python 3 code does\ndef _adorn_subplots(self):\n if len(self.axes) > 0:\n 
all_axes = self._get_subplots()\n nrows, ncols = self._get_axes_layout()\n _handle_shared_axes(axarr=all_axes, nplots=len(all_axes),\n naxes=nrows * ncols, nrows=nrows,\n ncols=ncols, sharex=self.sharex,\n sharey=self.sharey)\n\n for ax in self.axes:\n if self.yticks is not None:\n ax.set_yticks(self.yticks)\n\n if self.xticks is not None:\n ax.set_xticks(self.xticks)\n\n if self.ylim is not None:\n ax.set_ylim(self.ylim)\n\n if self.xlim is not None:\n ax.set_xlim(self.xlim)\n\n ax.grid(self.grid)\n\n if self.title:\n if self.subplots:\n if is_list_like(self.title):\n if len(self.title) != self.nseries:\n msg = ('The length of `title` must equal the number '\n 'of columns if using `title` of type `list` '\n 'and `subplots=True`.\\n'\n 'length of title = {}\\n'\n 'number of columns = {}').format(\n len(self.title), self.nseries)\n raise ValueError(msg)\n\n for (ax, title) in zip(self.axes, self.title):\n ax.set_title(title)\n else:\n self.fig.suptitle(self.title)\n else:\n if is_list_like(self.title):\n msg = ('Using `title` of type `list` is not supported '\n 'unless `subplots=True` is passed')\n raise ValueError(msg)\n self.axes[0].set_title(self.title)", "response": "Common post processing unrelated to data"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _apply_axis_properties(self, axis, rot=None, fontsize=None):\n\n if rot is not None or fontsize is not None:\n # rot=0 is a valid setting, hence the explicit None check\n labels = axis.get_majorticklabels() + axis.get_minorticklabels()\n for label in labels:\n if rot is not None:\n label.set_rotation(rot)\n if fontsize is not None:\n label.set_fontsize(fontsize)", "response": "Applies the axis properties to the internal data structures."} {"instruction": "Make a summary of the following Python 3 code\ndef _get_ax_layer(cls, ax, primary=True):\n if primary:\n return getattr(ax, 'left_ax', ax)\n else:\n return getattr(ax, 'right_ax', ax)", "response": "get the layer of 
the specified axes"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _apply_style_colors(self, colors, kwds, col_num, label):\n style = None\n if self.style is not None:\n if isinstance(self.style, list):\n try:\n style = self.style[col_num]\n except IndexError:\n pass\n elif isinstance(self.style, dict):\n style = self.style.get(label, style)\n else:\n style = self.style\n\n has_color = 'color' in kwds or self.colormap is not None\n nocolor_style = style is None or re.match('[a-z]+', style) is None\n if (has_color or self.subplots) and nocolor_style:\n kwds['color'] = colors[col_num % len(colors)]\n return style, kwds", "response": "Applies color to the style and kwds."} {"instruction": "Implement a function in Python 3 to\nparse error bars and return the actual errorbar data.", "response": "def _parse_errorbars(self, label, err):\n \"\"\"\n Look for error keyword arguments and return the actual errorbar data\n or return the error DataFrame/dict\n\n Error bars can be specified in several ways:\n Series: the user provides a pandas.Series object of the same\n length as the data\n ndarray: provides a np.ndarray of the same length as the data\n DataFrame/dict: error values are paired with keys matching the\n key in the plotted DataFrame\n str: the name of the column within the plotted DataFrame\n \"\"\"\n\n if err is None:\n return None\n\n def match_labels(data, e):\n e = e.reindex(data.index)\n return e\n\n # key-matched DataFrame\n if isinstance(err, ABCDataFrame):\n\n err = match_labels(self.data, err)\n # key-matched dict\n elif isinstance(err, dict):\n pass\n\n # Series of error values\n elif isinstance(err, ABCSeries):\n # broadcast error series across data\n err = match_labels(self.data, err)\n err = np.atleast_2d(err)\n err = np.tile(err, (self.nseries, 1))\n\n # errors are a column in the dataframe\n elif isinstance(err, str):\n evalues = self.data[err].values\n self.data = self.data[self.data.columns.drop(err)]\n err = 
np.atleast_2d(evalues)\n err = np.tile(err, (self.nseries, 1))\n\n elif is_list_like(err):\n if is_iterator(err):\n err = np.atleast_2d(list(err))\n else:\n # raw error values\n err = np.atleast_2d(err)\n\n err_shape = err.shape\n\n # asymmetrical error bars\n if err.ndim == 3:\n if (err_shape[0] != self.nseries) or \\\n (err_shape[1] != 2) or \\\n (err_shape[2] != len(self.data)):\n msg = \"Asymmetrical error bars should be provided \" + \\\n \"with the shape (%u, 2, %u)\" % \\\n (self.nseries, len(self.data))\n raise ValueError(msg)\n\n # broadcast errors to each data series\n if len(err) == 1:\n err = np.tile(err, (self.nseries, 1))\n\n elif is_number(err):\n err = np.tile([err], (self.nseries, len(self.data)))\n\n else:\n msg = \"No valid {label} detected\".format(label=label)\n raise ValueError(msg)\n\n return err"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _make_plot_keywords(self, kwds, y):\n # y is required for KdePlot\n kwds['bottom'] = self.bottom\n kwds['bins'] = self.bins\n return kwds", "response": "merge BoxPlot and KdePlot properties to passed kwds"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef line(self, x=None, y=None, **kwds):\n return self(kind='line', x=x, y=y, **kwds)", "response": "Plot a line of the entry for a specific language."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef barh(self, x=None, y=None, **kwds):\n return self(kind='barh', x=x, y=y, **kwds)", "response": "Make a horizontal bar plot of the information for the current entry."} {"instruction": "Implement a Python 3 function for\ndrawing one histogram of the DataFrame s columns.", "response": "def hist(self, by=None, bins=10, **kwds):\n \"\"\"\n Draw one histogram of the DataFrame's columns.\n\n A histogram is a representation of the distribution of data.\n This function groups the values of all given Series in the DataFrame\n into bins and draws all bins in one 
:class:`matplotlib.axes.Axes`.\n This is useful when the DataFrame's Series are in a similar scale.\n\n Parameters\n ----------\n by : str or sequence, optional\n Column in the DataFrame to group by.\n bins : int, default 10\n Number of histogram bins to be used.\n **kwds\n Additional keyword arguments are documented in\n :meth:`DataFrame.plot`.\n\n Returns\n -------\n class:`matplotlib.AxesSubplot`\n Return a histogram plot.\n\n See Also\n --------\n DataFrame.hist : Draw histograms per DataFrame's Series.\n Series.hist : Draw a histogram with Series' data.\n\n Examples\n --------\n When we draw a dice 6000 times, we expect to get each value around 1000\n times. But when we draw two dices and sum the result, the distribution\n is going to be quite different. A histogram illustrates those\n distributions.\n\n .. plot::\n :context: close-figs\n\n >>> df = pd.DataFrame(\n ... np.random.randint(1, 7, 6000),\n ... columns = ['one'])\n >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000)\n >>> ax = df.plot.hist(bins=12, alpha=0.5)\n \"\"\"\n return self(kind='hist', by=by, bins=bins, **kwds)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef area(self, x=None, y=None, **kwds):\n return self(kind='area', x=x, y=y, **kwds)", "response": "This function creates an area plot showing quantitative data visually."} {"instruction": "Make a summary of the following Python 3 code\ndef scatter(self, x, y, s=None, c=None, **kwds):\n return self(kind='scatter', x=x, y=y, c=c, s=s, **kwds)", "response": "Creates a scatter plot of the data for the given marker point size and color."} {"instruction": "Write a Python 3 script to\ngenerate a hexagonal binning plot of the given x and y points.", "response": "def hexbin(self, x, y, C=None, reduce_C_function=None, gridsize=None,\n **kwds):\n \"\"\"\n Generate a hexagonal binning plot.\n\n Generate a hexagonal binning plot of `x` versus `y`. 
If `C` is `None`\n (the default), this is a histogram of the number of occurrences\n of the observations at ``(x[i], y[i])``.\n\n If `C` is specified, specifies values at given coordinates\n ``(x[i], y[i])``. These values are accumulated for each hexagonal\n bin and then reduced according to `reduce_C_function`,\n having as default the NumPy's mean function (:meth:`numpy.mean`).\n (If `C` is specified, it must also be a 1-D sequence\n of the same length as `x` and `y`, or a column label.)\n\n Parameters\n ----------\n x : int or str\n The column label or position for x points.\n y : int or str\n The column label or position for y points.\n C : int or str, optional\n The column label or position for the value of `(x, y)` point.\n reduce_C_function : callable, default `np.mean`\n Function of one argument that reduces all the values in a bin to\n a single number (e.g. `np.mean`, `np.max`, `np.sum`, `np.std`).\n gridsize : int or tuple of (int, int), default 100\n The number of hexagons in the x-direction.\n The corresponding number of hexagons in the y-direction is\n chosen in a way that the hexagons are approximately regular.\n Alternatively, gridsize can be a tuple with two elements\n specifying the number of hexagons in the x-direction and the\n y-direction.\n **kwds\n Additional keyword arguments are documented in\n :meth:`DataFrame.plot`.\n\n Returns\n -------\n matplotlib.AxesSubplot\n The matplotlib ``Axes`` on which the hexbin is plotted.\n\n See Also\n --------\n DataFrame.plot : Make plots of a DataFrame.\n matplotlib.pyplot.hexbin : Hexagonal binning plot using matplotlib,\n the matplotlib function that is used under the hood.\n\n Examples\n --------\n The following examples are generated with random data from\n a normal distribution.\n\n .. plot::\n :context: close-figs\n\n >>> n = 10000\n >>> df = pd.DataFrame({'x': np.random.randn(n),\n ... 
'y': np.random.randn(n)})\n >>> ax = df.plot.hexbin(x='x', y='y', gridsize=20)\n\n The next example uses `C` and `np.sum` as `reduce_C_function`.\n Note that `'observations'` values ranges from 1 to 5 but the result\n plot shows values up to more than 25. This is because of the\n `reduce_C_function`.\n\n .. plot::\n :context: close-figs\n\n >>> n = 500\n >>> df = pd.DataFrame({\n ... 'coord_x': np.random.uniform(-3, 3, size=n),\n ... 'coord_y': np.random.uniform(30, 50, size=n),\n ... 'observations': np.random.randint(1,5, size=n)\n ... })\n >>> ax = df.plot.hexbin(x='coord_x',\n ... y='coord_y',\n ... C='observations',\n ... reduce_C_function=np.sum,\n ... gridsize=10,\n ... cmap=\"viridis\")\n \"\"\"\n if reduce_C_function is not None:\n kwds['reduce_C_function'] = reduce_C_function\n if gridsize is not None:\n kwds['gridsize'] = gridsize\n return self(kind='hexbin', x=x, y=y, C=C, **kwds)"} {"instruction": "Explain what the following Python 3 code does\ndef _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True):\n obs_idxes = [obj._get_axis(axis) for obj in objs\n if hasattr(obj, '_get_axis')]\n if obs_idxes:\n return _get_combined_index(obs_idxes, intersect=intersect, sort=sort)", "response": "Extract combined index from a list of objects."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _get_distinct_objs(objs):\n ids = set()\n res = []\n for obj in objs:\n if not id(obj) in ids:\n ids.add(id(obj))\n res.append(obj)\n return res", "response": "Return a list with distinct elements of objs. 
Preserves order."} {"instruction": "Implement a Python 3 function for\nreturning the union or intersection of the given list of indexes.", "response": "def _get_combined_index(indexes, intersect=False, sort=False):\n \"\"\"\n Return the union or intersection of indexes.\n\n Parameters\n ----------\n indexes : list of Index or list objects\n When intersect=True, do not accept list of lists.\n intersect : bool, default False\n If True, calculate the intersection between indexes. Otherwise,\n calculate the union.\n sort : bool, default False\n Whether the result index should come out sorted or not.\n\n Returns\n -------\n Index\n \"\"\"\n\n # TODO: handle index names!\n indexes = _get_distinct_objs(indexes)\n if len(indexes) == 0:\n index = Index([])\n elif len(indexes) == 1:\n index = indexes[0]\n elif intersect:\n index = indexes[0]\n for other in indexes[1:]:\n index = index.intersection(other)\n else:\n index = _union_indexes(indexes, sort=sort)\n index = ensure_index(index)\n\n if sort:\n try:\n index = index.sort_values()\n except TypeError:\n pass\n return index"} {"instruction": "Write a Python 3 script to\nreturn the union of the given list of indexes.", "response": "def _union_indexes(indexes, sort=True):\n \"\"\"\n Return the union of indexes.\n\n The behavior of sort and names is not consistent.\n\n Parameters\n ----------\n indexes : list of Index or list objects\n sort : bool, default True\n Whether the result index should come out sorted or not.\n\n Returns\n -------\n Index\n \"\"\"\n if len(indexes) == 0:\n raise AssertionError('Must have at least 1 Index to union')\n if len(indexes) == 1:\n result = indexes[0]\n if isinstance(result, list):\n result = Index(sorted(result))\n return result\n\n indexes, kind = _sanitize_and_check(indexes)\n\n def _unique_indices(inds):\n \"\"\"\n Convert indexes to lists and concatenate them, removing duplicates.\n\n The final dtype is inferred.\n\n Parameters\n ----------\n inds : list of Index or list objects\n\n 
Returns\n -------\n Index\n \"\"\"\n def conv(i):\n if isinstance(i, Index):\n i = i.tolist()\n return i\n\n return Index(\n lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort))\n\n if kind == 'special':\n result = indexes[0]\n\n if hasattr(result, 'union_many'):\n return result.union_many(indexes[1:])\n else:\n for other in indexes[1:]:\n result = result.union(other)\n return result\n elif kind == 'array':\n index = indexes[0]\n for other in indexes[1:]:\n if not index.equals(other):\n\n if sort is None:\n # TODO: remove once pd.concat sort default changes\n warnings.warn(_sort_msg, FutureWarning, stacklevel=8)\n sort = True\n\n return _unique_indices(indexes)\n\n name = _get_consensus_names(indexes)[0]\n if name != index.name:\n index = index._shallow_copy(name=name)\n return index\n else: # kind='list'\n return _unique_indices(indexes)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _sanitize_and_check(indexes):\n kinds = list({type(index) for index in indexes})\n\n if list in kinds:\n if len(kinds) > 1:\n indexes = [Index(com.try_sort(x))\n if not isinstance(x, Index) else\n x for x in indexes]\n kinds.remove(list)\n else:\n return indexes, 'list'\n\n if len(kinds) > 1 or Index not in kinds:\n return indexes, 'special'\n else:\n return indexes, 'array'", "response": "Verify the type of indexes and convert lists to Index."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _get_consensus_names(indexes):\n\n # find the non-none names, need to tupleify to make\n # the set hashable, then reverse on return\n consensus_names = {tuple(i.names) for i in indexes\n if com._any_not_none(*i.names)}\n if len(consensus_names) == 1:\n return list(list(consensus_names)[0])\n return [None] * indexes[0].nlevels", "response": "Returns a list of consensus names for the given indexes."} {"instruction": "Make a summary of the following Python 3 code\ndef _all_indexes_same(indexes):\n 
first = indexes[0]\n for index in indexes[1:]:\n if not first.equals(index):\n return False\n return True", "response": "Determines if all indexes contain the same elements."} {"instruction": "Write a Python 3 function for\nconverting SQL and params args to DBAPI2. 0 compliant format.", "response": "def _convert_params(sql, params):\n \"\"\"Convert SQL and params args to DBAPI2.0 compliant format.\"\"\"\n args = [sql]\n if params is not None:\n if hasattr(params, 'keys'): # test if params is a mapping\n args += [params]\n else:\n args += [list(params)]\n return args"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _process_parse_dates_argument(parse_dates):\n # handle non-list entries for parse_dates gracefully\n if parse_dates is True or parse_dates is None or parse_dates is False:\n parse_dates = []\n\n elif not hasattr(parse_dates, '__iter__'):\n parse_dates = [parse_dates]\n return parse_dates", "response": "Process parse_dates argument for read_sql functions."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _parse_date_columns(data_frame, parse_dates):\n parse_dates = _process_parse_dates_argument(parse_dates)\n\n # we want to coerce datetime64_tz dtypes for now to UTC\n # we could in theory do a 'nice' conversion from a FixedOffset tz\n # GH11216\n for col_name, df_col in data_frame.iteritems():\n if is_datetime64tz_dtype(df_col) or col_name in parse_dates:\n try:\n fmt = parse_dates[col_name]\n except TypeError:\n fmt = None\n data_frame[col_name] = _handle_date_column(df_col, format=fmt)\n\n return data_frame", "response": "Convert datetime columns to datetime columns."} {"instruction": "How would you code a function in Python 3 to\nwrap result set of query in a DataFrame.", "response": "def _wrap_result(data, columns, index_col=None, coerce_float=True,\n parse_dates=None):\n \"\"\"Wrap result set of query in a DataFrame.\"\"\"\n\n frame = DataFrame.from_records(data, 
columns=columns,\n coerce_float=coerce_float)\n\n frame = _parse_date_columns(frame, parse_dates)\n\n if index_col is not None:\n frame.set_index(index_col, inplace=True)\n\n return frame"} {"instruction": "Given the following Python 3 function, write the documentation\ndef execute(sql, con, cur=None, params=None):\n if cur is None:\n pandas_sql = pandasSQL_builder(con)\n else:\n pandas_sql = pandasSQL_builder(cur, is_cursor=True)\n args = _convert_params(sql, params)\n return pandas_sql.execute(*args)", "response": "Execute a SQL query using the provided connection object."} {"instruction": "Write a Python 3 script to\nread a SQL table into a DataFrame.", "response": "def read_sql_table(table_name, con, schema=None, index_col=None,\n coerce_float=True, parse_dates=None, columns=None,\n chunksize=None):\n \"\"\"\n Read SQL database table into a DataFrame.\n\n Given a table name and a SQLAlchemy connectable, returns a DataFrame.\n This function does not support DBAPI connections.\n\n Parameters\n ----------\n table_name : str\n Name of SQL table in database.\n con : SQLAlchemy connectable or str\n A database URI could be provided as as str.\n SQLite DBAPI connection mode not supported.\n schema : str, default None\n Name of SQL schema in database to query (if database flavor\n supports this). Uses default schema if None (default).\n index_col : str or list of str, optional, default: None\n Column(s) to set as index(MultiIndex).\n coerce_float : bool, default True\n Attempts to convert values of non-string, non-numeric objects (like\n decimal.Decimal) to floating point. 
Can result in loss of Precision.\n parse_dates : list or dict, default None\n - List of column names to parse as dates.\n - Dict of ``{column_name: format string}`` where format string is\n strftime compatible in case of parsing string times or is one of\n (D, s, ns, ms, us) in case of parsing integer timestamps.\n - Dict of ``{column_name: arg dict}``, where the arg dict corresponds\n to the keyword arguments of :func:`pandas.to_datetime`\n Especially useful with databases without native Datetime support,\n such as SQLite.\n columns : list, default None\n List of column names to select from SQL table.\n chunksize : int, default None\n If specified, returns an iterator where `chunksize` is the number of\n rows to include in each chunk.\n\n Returns\n -------\n DataFrame\n A SQL table is returned as two-dimensional data structure with labeled\n axes.\n\n See Also\n --------\n read_sql_query : Read SQL query into a DataFrame.\n read_sql : Read SQL query or database table into a DataFrame.\n\n Notes\n -----\n Any datetime values with time zone information will be converted to UTC.\n\n Examples\n --------\n >>> pd.read_sql_table('table_name', 'postgres:///db_name') # doctest:+SKIP\n \"\"\"\n\n con = _engine_builder(con)\n if not _is_sqlalchemy_connectable(con):\n raise NotImplementedError(\"read_sql_table only supported for \"\n \"SQLAlchemy connectable.\")\n import sqlalchemy\n from sqlalchemy.schema import MetaData\n meta = MetaData(con, schema=schema)\n try:\n meta.reflect(only=[table_name], views=True)\n except sqlalchemy.exc.InvalidRequestError:\n raise ValueError(\"Table {name} not found\".format(name=table_name))\n\n pandas_sql = SQLDatabase(con, meta=meta)\n table = pandas_sql.read_table(\n table_name, index_col=index_col, coerce_float=coerce_float,\n parse_dates=parse_dates, columns=columns, chunksize=chunksize)\n\n if table is not None:\n return table\n else:\n raise ValueError(\"Table {name} not found\".format(name=table_name), con)"} {"instruction": "Write a 
Python 3 script to\nread a SQL query into a DataFrame.", "response": "def read_sql_query(sql, con, index_col=None, coerce_float=True, params=None,\n parse_dates=None, chunksize=None):\n \"\"\"Read SQL query into a DataFrame.\n\n Returns a DataFrame corresponding to the result set of the query\n string. Optionally provide an `index_col` parameter to use one of the\n columns as the index, otherwise default integer index will be used.\n\n Parameters\n ----------\n sql : string SQL query or SQLAlchemy Selectable (select or text object)\n SQL query to be executed.\n con : SQLAlchemy connectable(engine/connection), database string URI,\n or sqlite3 DBAPI2 connection\n Using SQLAlchemy makes it possible to use any DB supported by that\n library.\n If a DBAPI2 object, only sqlite3 is supported.\n index_col : string or list of strings, optional, default: None\n Column(s) to set as index(MultiIndex).\n coerce_float : boolean, default True\n Attempts to convert values of non-string, non-numeric objects (like\n decimal.Decimal) to floating point. Useful for SQL result sets.\n params : list, tuple or dict, optional, default: None\n List of parameters to pass to execute method. The syntax used\n to pass parameters is database driver dependent. Check your\n database driver documentation for which of the five syntax styles,\n described in PEP 249's paramstyle, is supported.\n Eg. 
for psycopg2, uses %(name)s so use params={'name' : 'value'}\n parse_dates : list or dict, default: None\n - List of column names to parse as dates.\n - Dict of ``{column_name: format string}`` where format string is\n strftime compatible in case of parsing string times, or is one of\n (D, s, ns, ms, us) in case of parsing integer timestamps.\n - Dict of ``{column_name: arg dict}``, where the arg dict corresponds\n to the keyword arguments of :func:`pandas.to_datetime`\n Especially useful with databases without native Datetime support,\n such as SQLite.\n chunksize : int, default None\n If specified, return an iterator where `chunksize` is the number of\n rows to include in each chunk.\n\n Returns\n -------\n DataFrame\n\n See Also\n --------\n read_sql_table : Read SQL database table into a DataFrame.\n read_sql\n\n Notes\n -----\n Any datetime values with time zone information parsed via the `parse_dates`\n parameter will be converted to UTC.\n \"\"\"\n pandas_sql = pandasSQL_builder(con)\n return pandas_sql.read_query(\n sql, index_col=index_col, params=params, coerce_float=coerce_float,\n parse_dates=parse_dates, chunksize=chunksize)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef read_sql(sql, con, index_col=None, coerce_float=True, params=None,\n parse_dates=None, columns=None, chunksize=None):\n \"\"\"\n Read SQL query or database table into a DataFrame.\n\n This function is a convenience wrapper around ``read_sql_table`` and\n ``read_sql_query`` (for backward compatibility). It will delegate\n to the specific function depending on the provided input. A SQL query\n will be routed to ``read_sql_query``, while a database table name will\n be routed to ``read_sql_table``. 
Note that the delegated function might\n have more specific notes about their functionality not listed here.\n\n Parameters\n ----------\n sql : string or SQLAlchemy Selectable (select or text object)\n SQL query to be executed or a table name.\n con : SQLAlchemy connectable (engine/connection) or database string URI\n or DBAPI2 connection (fallback mode)\n\n Using SQLAlchemy makes it possible to use any DB supported by that\n library. If a DBAPI2 object, only sqlite3 is supported.\n index_col : string or list of strings, optional, default: None\n Column(s) to set as index(MultiIndex).\n coerce_float : boolean, default True\n Attempts to convert values of non-string, non-numeric objects (like\n decimal.Decimal) to floating point, useful for SQL result sets.\n params : list, tuple or dict, optional, default: None\n List of parameters to pass to execute method. The syntax used\n to pass parameters is database driver dependent. Check your\n database driver documentation for which of the five syntax styles,\n described in PEP 249's paramstyle, is supported.\n Eg. 
for psycopg2, uses %(name)s so use params={'name' : 'value'}\n parse_dates : list or dict, default: None\n - List of column names to parse as dates.\n - Dict of ``{column_name: format string}`` where format string is\n strftime compatible in case of parsing string times, or is one of\n (D, s, ns, ms, us) in case of parsing integer timestamps.\n - Dict of ``{column_name: arg dict}``, where the arg dict corresponds\n to the keyword arguments of :func:`pandas.to_datetime`\n Especially useful with databases without native Datetime support,\n such as SQLite.\n columns : list, default: None\n List of column names to select from SQL table (only used when reading\n a table).\n chunksize : int, default None\n If specified, return an iterator where `chunksize` is the\n number of rows to include in each chunk.\n\n Returns\n -------\n DataFrame\n\n See Also\n --------\n read_sql_table : Read SQL database table into a DataFrame.\n read_sql_query : Read SQL query into a DataFrame.\n \"\"\"\n pandas_sql = pandasSQL_builder(con)\n\n if isinstance(pandas_sql, SQLiteDatabase):\n return pandas_sql.read_query(\n sql, index_col=index_col, params=params,\n coerce_float=coerce_float, parse_dates=parse_dates,\n chunksize=chunksize)\n\n try:\n _is_table_name = pandas_sql.has_table(sql)\n except Exception:\n # using generic exception to catch errors from sql drivers (GH24988)\n _is_table_name = False\n\n if _is_table_name:\n pandas_sql.meta.reflect(only=[sql])\n return pandas_sql.read_table(\n sql, index_col=index_col, coerce_float=coerce_float,\n parse_dates=parse_dates, columns=columns, chunksize=chunksize)\n else:\n return pandas_sql.read_query(\n sql, index_col=index_col, params=params,\n coerce_float=coerce_float, parse_dates=parse_dates,\n chunksize=chunksize)", "response": "Read a SQL query or database table into a DataFrame."} {"instruction": "Write a Python 3 script to\nwrite records stored in a DataFrame to a SQL database.", "response": "def to_sql(frame, name, con, schema=None, 
if_exists='fail', index=True,\n index_label=None, chunksize=None, dtype=None, method=None):\n \"\"\"\n Write records stored in a DataFrame to a SQL database.\n\n Parameters\n ----------\n frame : DataFrame, Series\n name : string\n Name of SQL table.\n con : SQLAlchemy connectable(engine/connection) or database string URI\n or sqlite3 DBAPI2 connection\n Using SQLAlchemy makes it possible to use any DB supported by that\n library.\n If a DBAPI2 object, only sqlite3 is supported.\n schema : string, default None\n Name of SQL schema in database to write to (if database flavor\n supports this). If None, use default schema (default).\n if_exists : {'fail', 'replace', 'append'}, default 'fail'\n - fail: If table exists, do nothing.\n - replace: If table exists, drop it, recreate it, and insert data.\n - append: If table exists, insert data. Create if does not exist.\n index : boolean, default True\n Write DataFrame index as a column.\n index_label : string or sequence, default None\n Column label for index column(s). If None is given (default) and\n `index` is True, then the index names are used.\n A sequence should be given if the DataFrame uses MultiIndex.\n chunksize : int, default None\n If not None, then rows will be written in batches of this size at a\n time. If None, all rows will be written at once.\n dtype : single SQLtype or dict of column name to SQL type, default None\n Optional specifying the datatype for columns. 
The SQL type should\n be a SQLAlchemy type, or a string for sqlite3 fallback connection.\n If all columns are of the same type, one single value can be used.\n method : {None, 'multi', callable}, default None\n Controls the SQL insertion clause used:\n\n - None : Uses standard SQL ``INSERT`` clause (one per row).\n - 'multi': Pass multiple values in a single ``INSERT`` clause.\n - callable with signature ``(pd_table, conn, keys, data_iter)``.\n\n Details and a sample callable implementation can be found in the\n section :ref:`insert method `.\n\n .. versionadded:: 0.24.0\n \"\"\"\n if if_exists not in ('fail', 'replace', 'append'):\n raise ValueError(\"'{0}' is not valid for if_exists\".format(if_exists))\n\n pandas_sql = pandasSQL_builder(con, schema=schema)\n\n if isinstance(frame, Series):\n frame = frame.to_frame()\n elif not isinstance(frame, DataFrame):\n raise NotImplementedError(\"'frame' argument should be either a \"\n \"Series or a DataFrame\")\n\n pandas_sql.to_sql(frame, name, if_exists=if_exists, index=index,\n index_label=index_label, schema=schema,\n chunksize=chunksize, dtype=dtype, method=method)"} {"instruction": "Implement a Python 3 function for\nchecking if DataBase has named table.", "response": "def has_table(table_name, con, schema=None):\n \"\"\"\n Check if DataBase has named table.\n\n Parameters\n ----------\n table_name: string\n Name of SQL table.\n con: SQLAlchemy connectable(engine/connection) or sqlite3 DBAPI2 connection\n Using SQLAlchemy makes it possible to use any DB supported by that\n library.\n If a DBAPI2 object, only sqlite3 is supported.\n schema : string, default None\n Name of SQL schema in database to write to (if database flavor supports\n this). 
If None, use default schema (default).\n\n Returns\n -------\n boolean\n \"\"\"\n pandas_sql = pandasSQL_builder(con, schema=schema)\n return pandas_sql.has_table(table_name)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _engine_builder(con):\n global _SQLALCHEMY_INSTALLED\n if isinstance(con, str):\n try:\n import sqlalchemy\n except ImportError:\n _SQLALCHEMY_INSTALLED = False\n else:\n con = sqlalchemy.create_engine(con)\n return con\n\n return con", "response": "Returns a SQLAlchemy engine from a URI if con is a string else it just return con without modifying it."} {"instruction": "How would you explain what the following Python 3 function does\ndef pandasSQL_builder(con, schema=None, meta=None,\n is_cursor=False):\n \"\"\"\n Convenience function to return the correct PandasSQL subclass based on the\n provided parameters.\n \"\"\"\n # When support for DBAPI connections is removed,\n # is_cursor should not be necessary.\n con = _engine_builder(con)\n if _is_sqlalchemy_connectable(con):\n return SQLDatabase(con, schema=schema, meta=meta)\n elif isinstance(con, str):\n raise ImportError(\"Using URI string without sqlalchemy installed.\")\n else:\n return SQLiteDatabase(con, is_cursor=is_cursor)", "response": "Convenience function to return the correct PandasSQL subclass based on the provided parameters."} {"instruction": "How would you explain what the following Python 3 function does\ndef get_schema(frame, name, keys=None, con=None, dtype=None):\n\n pandas_sql = pandasSQL_builder(con=con)\n return pandas_sql._create_sql_schema(frame, name, keys=keys, dtype=dtype)", "response": "Returns the SQL db table schema for the given DataFrame."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _execute_insert(self, conn, keys, data_iter):\n data = [dict(zip(keys, row)) for row in data_iter]\n conn.execute(self.table.insert(), data)", "response": "Execute SQL statement inserting data\n "} 
{"instruction": "How would you explain what the following Python 3 function does\ndef _query_iterator(self, result, chunksize, columns, coerce_float=True,\n parse_dates=None):\n \"\"\"Return generator through chunked result set.\"\"\"\n\n while True:\n data = result.fetchmany(chunksize)\n if not data:\n break\n else:\n self.frame = DataFrame.from_records(\n data, columns=columns, coerce_float=coerce_float)\n\n self._harmonize_columns(parse_dates=parse_dates)\n\n if self.index is not None:\n self.frame.set_index(self.index, inplace=True)\n\n yield self.frame", "response": "Return generator through chunked result set."} {"instruction": "How would you code a function in Python 3 to\nharmonize the columns of the DataFrame.", "response": "def _harmonize_columns(self, parse_dates=None):\n \"\"\"\n Make the DataFrame's column types align with the SQL table\n column types.\n Need to work around limited NA value support. Floats are always\n fine, ints must always be floats if there are Null values.\n Booleans are hard because converting bool column with None replaces\n all Nones with false. 
Therefore only convert bool if there are no\n NA values.\n Datetimes should already be converted to np.datetime64 if supported,\n but here we also force conversion if required.\n \"\"\"\n parse_dates = _process_parse_dates_argument(parse_dates)\n\n for sql_col in self.table.columns:\n col_name = sql_col.name\n try:\n df_col = self.frame[col_name]\n\n # Handle date parsing upfront; don't try to convert columns\n # twice\n if col_name in parse_dates:\n try:\n fmt = parse_dates[col_name]\n except TypeError:\n fmt = None\n self.frame[col_name] = _handle_date_column(\n df_col, format=fmt)\n continue\n\n # the type the dataframe column should have\n col_type = self._get_dtype(sql_col.type)\n\n if (col_type is datetime or col_type is date or\n col_type is DatetimeTZDtype):\n # Convert tz-aware Datetime SQL columns to UTC\n utc = col_type is DatetimeTZDtype\n self.frame[col_name] = _handle_date_column(df_col, utc=utc)\n elif col_type is float:\n # floats support NA, can always convert!\n self.frame[col_name] = df_col.astype(col_type, copy=False)\n\n elif len(df_col) == df_col.count():\n # No NA values, can convert ints and bools\n if col_type is np.dtype('int64') or col_type is bool:\n self.frame[col_name] = df_col.astype(\n col_type, copy=False)\n except KeyError:\n pass"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef read_table(self, table_name, index_col=None, coerce_float=True,\n parse_dates=None, columns=None, schema=None,\n chunksize=None):\n \"\"\"Read SQL database table into a DataFrame.\n\n Parameters\n ----------\n table_name : string\n Name of SQL table in database.\n index_col : string, optional, default: None\n Column to set as index.\n coerce_float : boolean, default True\n Attempts to convert values of non-string, non-numeric objects\n (like decimal.Decimal) to floating point. 
This can result in\n loss of precision.\n parse_dates : list or dict, default: None\n - List of column names to parse as dates.\n - Dict of ``{column_name: format string}`` where format string is\n strftime compatible in case of parsing string times, or is one of\n (D, s, ns, ms, us) in case of parsing integer timestamps.\n - Dict of ``{column_name: arg}``, where the arg corresponds\n to the keyword arguments of :func:`pandas.to_datetime`.\n Especially useful with databases without native Datetime support,\n such as SQLite.\n columns : list, default: None\n List of column names to select from SQL table.\n schema : string, default None\n Name of SQL schema in database to query (if database flavor\n supports this). If specified, this overwrites the default\n schema of the SQL database object.\n chunksize : int, default None\n If specified, return an iterator where `chunksize` is the number\n of rows to include in each chunk.\n\n Returns\n -------\n DataFrame\n\n See Also\n --------\n pandas.read_sql_table\n SQLDatabase.read_query\n\n \"\"\"\n table = SQLTable(table_name, self, index=index_col, schema=schema)\n return table.read(coerce_float=coerce_float,\n parse_dates=parse_dates, columns=columns,\n chunksize=chunksize)", "response": "Read a table into a DataFrame."} {"instruction": "Implement a Python 3 function for\nreturning generator through chunked result set", "response": "def _query_iterator(result, chunksize, columns, index_col=None,\n coerce_float=True, parse_dates=None):\n \"\"\"Return generator through chunked result set\"\"\"\n\n while True:\n data = result.fetchmany(chunksize)\n if not data:\n break\n else:\n yield _wrap_result(data, columns, index_col=index_col,\n coerce_float=coerce_float,\n parse_dates=parse_dates)"} {"instruction": "Implement a function in Python 3 to\nread SQL query into a DataFrame.", "response": "def read_query(self, sql, index_col=None, coerce_float=True,\n parse_dates=None, params=None, chunksize=None):\n \"\"\"Read SQL query 
into a DataFrame.\n\n Parameters\n ----------\n sql : string\n SQL query to be executed.\n index_col : string, optional, default: None\n Column name to use as index for the returned DataFrame object.\n coerce_float : boolean, default True\n Attempt to convert values of non-string, non-numeric objects (like\n decimal.Decimal) to floating point, useful for SQL result sets.\n params : list, tuple or dict, optional, default: None\n List of parameters to pass to execute method. The syntax used\n to pass parameters is database driver dependent. Check your\n database driver documentation for which of the five syntax styles,\n described in PEP 249's paramstyle, is supported.\n Eg. for psycopg2, uses %(name)s so use params={'name' : 'value'}\n parse_dates : list or dict, default: None\n - List of column names to parse as dates.\n - Dict of ``{column_name: format string}`` where format string is\n strftime compatible in case of parsing string times, or is one of\n (D, s, ns, ms, us) in case of parsing integer timestamps.\n - Dict of ``{column_name: arg dict}``, where the arg dict\n corresponds to the keyword arguments of\n :func:`pandas.to_datetime` Especially useful with databases\n without native Datetime support, such as SQLite.\n chunksize : int, default None\n If specified, return an iterator where `chunksize` is the number\n of rows to include in each chunk.\n\n Returns\n -------\n DataFrame\n\n See Also\n --------\n read_sql_table : Read SQL database table into a DataFrame.\n read_sql\n\n \"\"\"\n args = _convert_params(sql, params)\n\n result = self.execute(*args)\n columns = result.keys()\n\n if chunksize is not None:\n return self._query_iterator(result, chunksize, columns,\n index_col=index_col,\n coerce_float=coerce_float,\n parse_dates=parse_dates)\n else:\n data = result.fetchall()\n frame = _wrap_result(data, columns, index_col=index_col,\n coerce_float=coerce_float,\n parse_dates=parse_dates)\n return frame"} {"instruction": "Create a Python 3 function 
for\nwriting the DataFrame to a SQL database.", "response": "def to_sql(self, frame, name, if_exists='fail', index=True,\n index_label=None, schema=None, chunksize=None, dtype=None,\n method=None):\n \"\"\"\n Write records stored in a DataFrame to a SQL database.\n\n Parameters\n ----------\n frame : DataFrame\n name : string\n Name of SQL table.\n if_exists : {'fail', 'replace', 'append'}, default 'fail'\n - fail: If table exists, do nothing.\n - replace: If table exists, drop it, recreate it, and insert data.\n - append: If table exists, insert data. Create if does not exist.\n index : boolean, default True\n Write DataFrame index as a column.\n index_label : string or sequence, default None\n Column label for index column(s). If None is given (default) and\n `index` is True, then the index names are used.\n A sequence should be given if the DataFrame uses MultiIndex.\n schema : string, default None\n Name of SQL schema in database to write to (if database flavor\n supports this). If specified, this overwrites the default\n schema of the SQLDatabase object.\n chunksize : int, default None\n If not None, then rows will be written in batches of this size at a\n time. If None, all rows will be written at once.\n dtype : single type or dict of column name to SQL type, default None\n Optional specifying the datatype for columns. The SQL type should\n be a SQLAlchemy type. If all columns are of the same type, one\n single value can be used.\n method : {None', 'multi', callable}, default None\n Controls the SQL insertion clause used:\n\n * None : Uses standard SQL ``INSERT`` clause (one per row).\n * 'multi': Pass multiple values in a single ``INSERT`` clause.\n * callable with signature ``(pd_table, conn, keys, data_iter)``.\n\n Details and a sample callable implementation can be found in the\n section :ref:`insert method `.\n\n .. 
versionadded:: 0.24.0\n \"\"\"\n if dtype and not is_dict_like(dtype):\n dtype = {col_name: dtype for col_name in frame}\n\n if dtype is not None:\n from sqlalchemy.types import to_instance, TypeEngine\n for col, my_type in dtype.items():\n if not isinstance(to_instance(my_type), TypeEngine):\n raise ValueError('The type of {column} is not a '\n 'SQLAlchemy type '.format(column=col))\n\n table = SQLTable(name, self, frame=frame, index=index,\n if_exists=if_exists, index_label=index_label,\n schema=schema, dtype=dtype)\n table.create()\n table.insert(chunksize, method=method)\n if (not name.isdigit() and not name.islower()):\n # check for potentially case sensitivity issues (GH7815)\n # Only check when name is not a number and name is not lower case\n engine = self.connectable.engine\n with self.connectable.connect() as conn:\n table_names = engine.table_names(\n schema=schema or self.meta.schema,\n connection=conn,\n )\n if name not in table_names:\n msg = (\n \"The provided table name '{0}' is not found exactly as \"\n \"such in the database after writing the table, possibly \"\n \"due to case sensitivity issues. 
Consider using lower \"\n \"case table names.\"\n ).format(name)\n warnings.warn(msg, UserWarning)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _create_table_setup(self):\n column_names_and_types = self._get_column_names_and_types(\n self._sql_type_name\n )\n\n pat = re.compile(r'\\s+')\n column_names = [col_name for col_name, _, _ in column_names_and_types]\n if any(map(pat.search, column_names)):\n warnings.warn(_SAFE_NAMES_WARNING, stacklevel=6)\n\n escape = _get_valid_sqlite_name\n\n create_tbl_stmts = [escape(cname) + ' ' + ctype\n for cname, ctype, _ in column_names_and_types]\n\n if self.keys is not None and len(self.keys):\n if not is_list_like(self.keys):\n keys = [self.keys]\n else:\n keys = self.keys\n cnames_br = \", \".join(escape(c) for c in keys)\n create_tbl_stmts.append(\n \"CONSTRAINT {tbl}_pk PRIMARY KEY ({cnames_br})\".format(\n tbl=self.name, cnames_br=cnames_br))\n\n create_stmts = [\"CREATE TABLE \" + escape(self.name) + \" (\\n\" +\n ',\\n '.join(create_tbl_stmts) + \"\\n)\"]\n\n ix_cols = [cname for cname, _, is_index in column_names_and_types\n if is_index]\n if len(ix_cols):\n cnames = \"_\".join(ix_cols)\n cnames_br = \",\".join(escape(c) for c in ix_cols)\n create_stmts.append(\n \"CREATE INDEX \" + escape(\"ix_\" + self.name + \"_\" + cnames) +\n \"ON \" + escape(self.name) + \" (\" + cnames_br + \")\")\n\n return create_stmts", "response": "Returns a list of SQL statements that creates a table reflecting the theCOOKIE structure of the current DataFrame."} {"instruction": "Can you write a function in Python 3 where it\nreturns generator through chunked result set", "response": "def _query_iterator(cursor, chunksize, columns, index_col=None,\n coerce_float=True, parse_dates=None):\n \"\"\"Return generator through chunked result set\"\"\"\n\n while True:\n data = cursor.fetchmany(chunksize)\n if type(data) == tuple:\n data = list(data)\n if not data:\n cursor.close()\n break\n else:\n yield 
_wrap_result(data, columns, index_col=index_col,\n coerce_float=coerce_float,\n parse_dates=parse_dates)"} {"instruction": "Create a Python 3 function to\nwrite records stored in a DataFrame to a SQL database.", "response": "def to_sql(self, frame, name, if_exists='fail', index=True,\n index_label=None, schema=None, chunksize=None, dtype=None,\n method=None):\n \"\"\"\n Write records stored in a DataFrame to a SQL database.\n\n Parameters\n ----------\n frame: DataFrame\n name: string\n Name of SQL table.\n if_exists: {'fail', 'replace', 'append'}, default 'fail'\n fail: If table exists, do nothing.\n replace: If table exists, drop it, recreate it, and insert data.\n append: If table exists, insert data. Create if it does not exist.\n index : boolean, default True\n Write DataFrame index as a column\n index_label : string or sequence, default None\n Column label for index column(s). If None is given (default) and\n `index` is True, then the index names are used.\n A sequence should be given if the DataFrame uses MultiIndex.\n schema : string, default None\n Ignored parameter included for compatibility with SQLAlchemy\n version of ``to_sql``.\n chunksize : int, default None\n If not None, then rows will be written in batches of this\n size at a time. If None, all rows will be written at once.\n dtype : single type or dict of column name to SQL type, default None\n Optional specifying the datatype for columns. The SQL type should\n be a string. If all columns are of the same type, one single value\n can be used.\n method : {None, 'multi', callable}, default None\n Controls the SQL insertion clause used:\n\n * None : Uses standard SQL ``INSERT`` clause (one per row).\n * 'multi': Pass multiple values in a single ``INSERT`` clause.\n * callable with signature ``(pd_table, conn, keys, data_iter)``.\n\n Details and a sample callable implementation can be found in the\n section :ref:`insert method `.\n\n .. 
versionadded:: 0.24.0\n \"\"\"\n if dtype and not is_dict_like(dtype):\n dtype = {col_name: dtype for col_name in frame}\n\n if dtype is not None:\n for col, my_type in dtype.items():\n if not isinstance(my_type, str):\n raise ValueError('{column} ({type!s}) not a string'.format(\n column=col, type=my_type))\n\n table = SQLiteTable(name, self, frame=frame, index=index,\n if_exists=if_exists, index_label=index_label,\n dtype=dtype)\n table.create()\n table.insert(chunksize, method)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _maybe_to_categorical(array):\n if isinstance(array, (ABCSeries, ABCCategoricalIndex)):\n return array._values\n elif isinstance(array, np.ndarray):\n return Categorical(array)\n return array", "response": "Coerce to a categorical if a series is given."} {"instruction": "Write a Python 3 function for\nreturning True if key is in container False otherwise.", "response": "def contains(cat, key, container):\n \"\"\"\n Helper for membership check for ``key`` in ``cat``.\n\n This is a helper method for :method:`__contains__`\n and :class:`CategoricalIndex.__contains__`.\n\n Returns True if ``key`` is in ``cat.categories`` and the\n location of ``key`` in ``categories`` is in ``container``.\n\n Parameters\n ----------\n cat : :class:`Categorical`or :class:`categoricalIndex`\n key : a hashable object\n The key to check membership for.\n container : Container (e.g. list-like or mapping)\n The container to check for membership in.\n\n Returns\n -------\n is_in : bool\n True if ``key`` is in ``self.categories`` and location of\n ``key`` in ``categories`` is in ``container``, else False.\n\n Notes\n -----\n This method does not check for NaN values. 
Do that separately\n before calling this method.\n \"\"\"\n hash(key)\n\n # get location of key in categories.\n # If a KeyError, the key isn't in categories, so logically\n # can't be in container either.\n try:\n loc = cat.categories.get_loc(key)\n except KeyError:\n return False\n\n # loc is the location of key in categories, but also the *value*\n # for key in container. So, `key` may be in categories,\n # but still not in `container`. Example ('b' in categories,\n # but not in values):\n # 'b' in Categorical(['a'], categories=['a', 'b']) # False\n if is_scalar(loc):\n return loc in container\n else:\n # if categories is an IntervalIndex, loc is an array.\n return any(loc_ in container for loc_ in loc)"} {"instruction": "Explain what the following Python 3 code does\ndef _recode_for_categories(codes, old_categories, new_categories):\n from pandas.core.algorithms import take_1d\n\n if len(old_categories) == 0:\n # All null anyway, so just retain the nulls\n return codes.copy()\n elif new_categories.equals(old_categories):\n # Same categories, so no need to actually recode\n return codes.copy()\n indexer = coerce_indexer_dtype(new_categories.get_indexer(old_categories),\n new_categories)\n new_codes = take_1d(indexer, codes.copy(), fill_value=-1)\n return new_codes", "response": "Convert a set of codes for to a new set of categories\n "} {"instruction": "Create a Python 3 function for\nfactorizing an input values into categories and codes. Preserves categorical dtype in categories.", "response": "def _factorize_from_iterable(values):\n \"\"\"\n Factorize an input `values` into `categories` and `codes`. 
Preserves\n categorical dtype in `categories`.\n\n *This is an internal function*\n\n Parameters\n ----------\n values : list-like\n\n Returns\n -------\n codes : ndarray\n categories : Index\n If `values` has a categorical dtype, then `categories` is\n a CategoricalIndex keeping the categories and order of `values`.\n \"\"\"\n from pandas.core.indexes.category import CategoricalIndex\n\n if not is_list_like(values):\n raise TypeError(\"Input must be list-like\")\n\n if is_categorical(values):\n if isinstance(values, (ABCCategoricalIndex, ABCSeries)):\n values = values._values\n categories = CategoricalIndex(values.categories, dtype=values.dtype)\n codes = values.codes\n else:\n # The value of ordered is irrelevant since we don't use cat as such,\n # but only the resulting categories, the order of which is independent\n # from ordered. Set ordered to False as default. See GH #15457\n cat = Categorical(values, ordered=False)\n categories = cat.categories\n codes = cat.codes\n return codes, categories"} {"instruction": "Explain what the following Python 3 code does\ndef astype(self, dtype, copy=True):\n if is_categorical_dtype(dtype):\n # GH 10696/18593\n dtype = self.dtype.update_dtype(dtype)\n self = self.copy() if copy else self\n if dtype == self.dtype:\n return self\n return self._set_dtype(dtype)\n return np.array(self, dtype=dtype, copy=copy)", "response": "Coerce this type to another dtype."} {"instruction": "Implement a function in Python 3 to\nconstruct a Categorical from inferred values.", "response": "def _from_inferred_categories(cls, inferred_categories, inferred_codes,\n dtype, true_values=None):\n \"\"\"\n Construct a Categorical from inferred values.\n\n For inferred categories (`dtype` is None) the categories are sorted.\n For explicit `dtype`, the `inferred_categories` are cast to the\n appropriate type.\n\n Parameters\n ----------\n inferred_categories : Index\n inferred_codes : Index\n dtype : CategoricalDtype or 'category'\n true_values : list, 
optional\n If none are provided, the default ones are\n \"True\", \"TRUE\", and \"true.\"\n\n Returns\n -------\n Categorical\n \"\"\"\n from pandas import Index, to_numeric, to_datetime, to_timedelta\n\n cats = Index(inferred_categories)\n known_categories = (isinstance(dtype, CategoricalDtype) and\n dtype.categories is not None)\n\n if known_categories:\n # Convert to a specialized type with `dtype` if specified.\n if dtype.categories.is_numeric():\n cats = to_numeric(inferred_categories, errors=\"coerce\")\n elif is_datetime64_dtype(dtype.categories):\n cats = to_datetime(inferred_categories, errors=\"coerce\")\n elif is_timedelta64_dtype(dtype.categories):\n cats = to_timedelta(inferred_categories, errors=\"coerce\")\n elif dtype.categories.is_boolean():\n if true_values is None:\n true_values = [\"True\", \"TRUE\", \"true\"]\n\n cats = cats.isin(true_values)\n\n if known_categories:\n # Recode from observation order to dtype.categories order.\n categories = dtype.categories\n codes = _recode_for_categories(inferred_codes, cats, categories)\n elif not cats.is_monotonic_increasing:\n # Sort categories and recode for unknown categories.\n unsorted = cats.copy()\n categories = cats.sort_values()\n\n codes = _recode_for_categories(inferred_codes, unsorted,\n categories)\n dtype = CategoricalDtype(categories, ordered=False)\n else:\n dtype = CategoricalDtype(cats, ordered=False)\n codes = inferred_codes\n\n return cls(codes, dtype=dtype, fastpath=True)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef from_codes(cls, codes, categories=None, ordered=None, dtype=None):\n dtype = CategoricalDtype._from_values_or_dtype(categories=categories,\n ordered=ordered,\n dtype=dtype)\n if dtype.categories is None:\n msg = (\"The categories must be provided in 'categories' or \"\n \"'dtype'. 
Both were None.\")\n raise ValueError(msg)\n\n codes = np.asarray(codes) # #21767\n if not is_integer_dtype(codes):\n msg = \"codes need to be array-like integers\"\n if is_float_dtype(codes):\n icodes = codes.astype('i8')\n if (icodes == codes).all():\n msg = None\n codes = icodes\n warn((\"float codes will be disallowed in the future and \"\n \"raise a ValueError\"), FutureWarning, stacklevel=2)\n if msg:\n raise ValueError(msg)\n\n if len(codes) and (\n codes.max() >= len(dtype.categories) or codes.min() < -1):\n raise ValueError(\"codes need to be between -1 and \"\n \"len(categories)-1\")\n\n return cls(codes, dtype=dtype, fastpath=True)", "response": "Create a new instance of the class with the given codes and categories and dtype."} {"instruction": "Write a Python 3 script to\nreturn a view of the codes array.", "response": "def _get_codes(self):\n \"\"\"\n Get the codes.\n\n Returns\n -------\n codes : integer array view\n A non writable view of the `codes` array.\n \"\"\"\n v = self._codes.view()\n v.flags.writeable = False\n return v"} {"instruction": "Explain what the following Python 3 code does\ndef _set_categories(self, categories, fastpath=False):\n\n if fastpath:\n new_dtype = CategoricalDtype._from_fastpath(categories,\n self.ordered)\n else:\n new_dtype = CategoricalDtype(categories, ordered=self.ordered)\n if (not fastpath and self.dtype.categories is not None and\n len(new_dtype.categories) != len(self.dtype.categories)):\n raise ValueError(\"new categories need to have the same number of \"\n \"items than the old categories!\")\n\n self._dtype = new_dtype", "response": "Sets new categories in the current object."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _set_dtype(self, dtype):\n codes = _recode_for_categories(self.codes, self.categories,\n dtype.categories)\n return type(self)(codes, dtype=dtype, fastpath=True)", "response": "Internal method for directly updating the CategoricalDtype object."} 
{"instruction": "Make a summary of the following Python 3 code\ndef set_ordered(self, value, inplace=False):\n inplace = validate_bool_kwarg(inplace, 'inplace')\n new_dtype = CategoricalDtype(self.categories, ordered=value)\n cat = self if inplace else self.copy()\n cat._dtype = new_dtype\n if not inplace:\n return cat", "response": "Sets the ordered attribute of the categorical."} {"instruction": "Create a Python 3 function for\nsetting the Categorical to be ordered.", "response": "def as_ordered(self, inplace=False):\n \"\"\"\n Set the Categorical to be ordered.\n\n Parameters\n ----------\n inplace : bool, default False\n Whether or not to set the ordered attribute in-place or return\n a copy of this categorical with ordered set to True.\n \"\"\"\n inplace = validate_bool_kwarg(inplace, 'inplace')\n return self.set_ordered(True, inplace=inplace)"} {"instruction": "How would you explain what the following Python 3 function does\ndef as_unordered(self, inplace=False):\n inplace = validate_bool_kwarg(inplace, 'inplace')\n return self.set_ordered(False, inplace=inplace)", "response": "Set the Categorical to be unordered."} {"instruction": "Write a Python 3 script to\nset the categories of the current entry.", "response": "def set_categories(self, new_categories, ordered=None, rename=False,\n inplace=False):\n \"\"\"\n Set the categories to the specified new_categories.\n\n `new_categories` can include new categories (which will result in\n unused categories) or remove old categories (which results in values\n set to NaN). 
If `rename==True`, the categories will simple be renamed\n (less or more items than in old categories will result in values set to\n NaN or in unused categories respectively).\n\n This method can be used to perform more than one action of adding,\n removing, and reordering simultaneously and is therefore faster than\n performing the individual steps via the more specialised methods.\n\n On the other hand this methods does not do checks (e.g., whether the\n old categories are included in the new categories on a reorder), which\n can result in surprising changes, for example when using special string\n dtypes on python3, which does not considers a S1 string equal to a\n single char python string.\n\n Parameters\n ----------\n new_categories : Index-like\n The categories in new order.\n ordered : bool, default False\n Whether or not the categorical is treated as a ordered categorical.\n If not given, do not change the ordered information.\n rename : bool, default False\n Whether or not the new_categories should be considered as a rename\n of the old categories or as reordered categories.\n inplace : bool, default False\n Whether or not to reorder the categories in-place or return a copy\n of this categorical with reordered categories.\n\n Returns\n -------\n Categorical with reordered categories or None if inplace.\n\n Raises\n ------\n ValueError\n If new_categories does not validate as categories\n\n See Also\n --------\n rename_categories\n reorder_categories\n add_categories\n remove_categories\n remove_unused_categories\n \"\"\"\n inplace = validate_bool_kwarg(inplace, 'inplace')\n if ordered is None:\n ordered = self.dtype.ordered\n new_dtype = CategoricalDtype(new_categories, ordered=ordered)\n\n cat = self if inplace else self.copy()\n if rename:\n if (cat.dtype.categories is not None and\n len(new_dtype.categories) < len(cat.dtype.categories)):\n # remove all _codes which are larger and set to -1/NaN\n cat._codes[cat._codes >= len(new_dtype.categories)] = 
-1\n else:\n codes = _recode_for_categories(cat.codes, cat.categories,\n new_dtype.categories)\n cat._codes = codes\n cat._dtype = new_dtype\n\n if not inplace:\n return cat"} {"instruction": "Write a Python 3 script to\nreturn a new categorical with renamed categories.", "response": "def rename_categories(self, new_categories, inplace=False):\n \"\"\"\n Rename categories.\n\n Parameters\n ----------\n new_categories : list-like, dict-like or callable\n\n * list-like: all items must be unique and the number of items in\n the new categories must match the existing number of categories.\n\n * dict-like: specifies a mapping from\n old categories to new. Categories not contained in the mapping\n are passed through and extra categories in the mapping are\n ignored.\n\n .. versionadded:: 0.21.0\n\n * callable : a callable that is called on all items in the old\n categories and whose return values comprise the new categories.\n\n .. versionadded:: 0.23.0\n\n .. warning::\n\n Currently, Series are considered list like. 
In a future version\n of pandas they'll be considered dict-like.\n\n inplace : bool, default False\n Whether or not to rename the categories inplace or return a copy of\n this categorical with renamed categories.\n\n Returns\n -------\n cat : Categorical or None\n With ``inplace=False``, the new categorical is returned.\n With ``inplace=True``, there is no return value.\n\n Raises\n ------\n ValueError\n If new categories are list-like and do not have the same number of\n items than the current categories or do not validate as categories\n\n See Also\n --------\n reorder_categories\n add_categories\n remove_categories\n remove_unused_categories\n set_categories\n\n Examples\n --------\n >>> c = pd.Categorical(['a', 'a', 'b'])\n >>> c.rename_categories([0, 1])\n [0, 0, 1]\n Categories (2, int64): [0, 1]\n\n For dict-like ``new_categories``, extra keys are ignored and\n categories not in the dictionary are passed through\n\n >>> c.rename_categories({'a': 'A', 'c': 'C'})\n [A, A, b]\n Categories (2, object): [A, b]\n\n You may also provide a callable to create the new categories\n\n >>> c.rename_categories(lambda x: x.upper())\n [A, A, B]\n Categories (2, object): [A, B]\n \"\"\"\n inplace = validate_bool_kwarg(inplace, 'inplace')\n cat = self if inplace else self.copy()\n\n if isinstance(new_categories, ABCSeries):\n msg = (\"Treating Series 'new_categories' as a list-like and using \"\n \"the values. 
In a future version, 'rename_categories' will \"\n \"treat Series like a dictionary.\\n\"\n \"For dict-like, use 'new_categories.to_dict()'\\n\"\n \"For list-like, use 'new_categories.values'.\")\n warn(msg, FutureWarning, stacklevel=2)\n new_categories = list(new_categories)\n\n if is_dict_like(new_categories):\n cat.categories = [new_categories.get(item, item)\n for item in cat.categories]\n elif callable(new_categories):\n cat.categories = [new_categories(item) for item in cat.categories]\n else:\n cat.categories = new_categories\n if not inplace:\n return cat"} {"instruction": "Can you write a function in Python 3 where it\nreorders the categories of the current object.", "response": "def reorder_categories(self, new_categories, ordered=None, inplace=False):\n \"\"\"\n Reorder categories as specified in new_categories.\n\n `new_categories` need to include all old categories and no new category\n items.\n\n Parameters\n ----------\n new_categories : Index-like\n The categories in new order.\n ordered : bool, optional\n Whether or not the categorical is treated as a ordered categorical.\n If not given, do not change the ordered information.\n inplace : bool, default False\n Whether or not to reorder the categories inplace or return a copy of\n this categorical with reordered categories.\n\n Returns\n -------\n cat : Categorical with reordered categories or None if inplace.\n\n Raises\n ------\n ValueError\n If the new categories do not contain all old category items or any\n new ones\n\n See Also\n --------\n rename_categories\n add_categories\n remove_categories\n remove_unused_categories\n set_categories\n \"\"\"\n inplace = validate_bool_kwarg(inplace, 'inplace')\n if set(self.dtype.categories) != set(new_categories):\n raise ValueError(\"items in new_categories are not the same as in \"\n \"old categories\")\n return self.set_categories(new_categories, ordered=ordered,\n inplace=inplace)"} {"instruction": "Write a Python 3 script to\nadd new categories to the 
internal Categorical.", "response": "def add_categories(self, new_categories, inplace=False):\n \"\"\"\n Add new categories.\n\n `new_categories` will be included at the last/highest place in the\n categories and will be unused directly after this call.\n\n Parameters\n ----------\n new_categories : category or list-like of category\n The new categories to be included.\n inplace : bool, default False\n Whether or not to add the categories inplace or return a copy of\n this categorical with added categories.\n\n Returns\n -------\n cat : Categorical with new categories added or None if inplace.\n\n Raises\n ------\n ValueError\n If the new categories include old categories or do not validate as\n categories\n\n See Also\n --------\n rename_categories\n reorder_categories\n remove_categories\n remove_unused_categories\n set_categories\n \"\"\"\n inplace = validate_bool_kwarg(inplace, 'inplace')\n if not is_list_like(new_categories):\n new_categories = [new_categories]\n already_included = set(new_categories) & set(self.dtype.categories)\n if len(already_included) != 0:\n msg = (\"new categories must not include old categories: \"\n \"{already_included!s}\")\n raise ValueError(msg.format(already_included=already_included))\n new_categories = list(self.dtype.categories) + list(new_categories)\n new_dtype = CategoricalDtype(new_categories, self.ordered)\n\n cat = self if inplace else self.copy()\n cat._dtype = new_dtype\n cat._codes = coerce_indexer_dtype(cat._codes, new_dtype.categories)\n if not inplace:\n return cat"} {"instruction": "Implement a function in Python 3 to\nremove the specified categories from the internal set.", "response": "def remove_categories(self, removals, inplace=False):\n \"\"\"\n Remove the specified categories.\n\n `removals` must be included in the old categories. 
Values which were in\n the removed categories will be set to NaN\n\n Parameters\n ----------\n removals : category or list of categories\n The categories which should be removed.\n inplace : bool, default False\n Whether or not to remove the categories inplace or return a copy of\n this categorical with removed categories.\n\n Returns\n -------\n cat : Categorical with removed categories or None if inplace.\n\n Raises\n ------\n ValueError\n If the removals are not contained in the categories\n\n See Also\n --------\n rename_categories\n reorder_categories\n add_categories\n remove_unused_categories\n set_categories\n \"\"\"\n inplace = validate_bool_kwarg(inplace, 'inplace')\n if not is_list_like(removals):\n removals = [removals]\n\n removal_set = set(list(removals))\n not_included = removal_set - set(self.dtype.categories)\n new_categories = [c for c in self.dtype.categories\n if c not in removal_set]\n\n # GH 10156\n if any(isna(removals)):\n not_included = [x for x in not_included if notna(x)]\n new_categories = [x for x in new_categories if notna(x)]\n\n if len(not_included) != 0:\n msg = \"removals must all be in old categories: {not_included!s}\"\n raise ValueError(msg.format(not_included=not_included))\n\n return self.set_categories(new_categories, ordered=self.ordered,\n rename=False, inplace=inplace)"} {"instruction": "Explain what the following Python 3 code does\ndef remove_unused_categories(self, inplace=False):\n inplace = validate_bool_kwarg(inplace, 'inplace')\n cat = self if inplace else self.copy()\n idx, inv = np.unique(cat._codes, return_inverse=True)\n\n if idx.size != 0 and idx[0] == -1: # na sentinel\n idx, inv = idx[1:], inv - 1\n\n new_categories = cat.dtype.categories.take(idx)\n new_dtype = CategoricalDtype._from_fastpath(new_categories,\n ordered=self.ordered)\n cat._dtype = new_dtype\n cat._codes = coerce_indexer_dtype(inv, new_dtype.categories)\n\n if not inplace:\n return cat", "response": "Returns a copy of the Categorical with 
unused categories dropped."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef map(self, mapper):\n new_categories = self.categories.map(mapper)\n try:\n return self.from_codes(self._codes.copy(),\n categories=new_categories,\n ordered=self.ordered)\n except ValueError:\n # NA values are represented in self._codes with -1\n # np.take causes NA values to take final element in new_categories\n if np.any(self._codes == -1):\n new_categories = new_categories.insert(len(new_categories),\n np.nan)\n return np.take(new_categories, self._codes)", "response": "Maps the categories of the a log entry to new categories."} {"instruction": "Given the following Python 3 function, write the documentation\ndef shift(self, periods, fill_value=None):\n # since categoricals always have ndim == 1, an axis parameter\n # doesn't make any sense here.\n codes = self.codes\n if codes.ndim > 1:\n raise NotImplementedError(\"Categorical with ndim > 1.\")\n if np.prod(codes.shape) and (periods != 0):\n codes = np.roll(codes, ensure_platform_int(periods), axis=0)\n if isna(fill_value):\n fill_value = -1\n elif fill_value in self.categories:\n fill_value = self.categories.get_loc(fill_value)\n else:\n raise ValueError(\"'fill_value={}' is not present \"\n \"in this Categorical's \"\n \"categories\".format(fill_value))\n if periods > 0:\n codes[:periods] = fill_value\n else:\n codes[periods:] = fill_value\n\n return self.from_codes(codes, dtype=self.dtype)", "response": "Shifts the categorical by desired number of periods."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef memory_usage(self, deep=False):\n return self._codes.nbytes + self.dtype.categories.memory_usage(\n deep=deep)", "response": "Return the memory usage of the object."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef value_counts(self, dropna=True):\n from numpy import bincount\n from pandas import Series, 
CategoricalIndex\n\n code, cat = self._codes, self.categories\n ncat, mask = len(cat), 0 <= code\n ix, clean = np.arange(ncat), mask.all()\n\n if dropna or clean:\n obs = code if clean else code[mask]\n count = bincount(obs, minlength=ncat or None)\n else:\n count = bincount(np.where(mask, code, ncat))\n ix = np.append(ix, -1)\n\n ix = self._constructor(ix, dtype=self.dtype,\n fastpath=True)\n\n return Series(count, index=CategoricalIndex(ix), dtype='int64')", "response": "Return a Series containing counts of each category."} {"instruction": "Create a Python 3 function for\nreturning the values of the object as numpy array.", "response": "def get_values(self):\n \"\"\"\n Return the values.\n\n For internal compatibility with pandas formatting.\n\n Returns\n -------\n numpy.array\n A numpy array of the same dtype as categorical.categories.dtype or\n Index if datetime / periods.\n \"\"\"\n # if we are a datetime and period index, return Index to keep metadata\n if is_datetimelike(self.categories):\n return self.categories.take(self._codes, fill_value=np.nan)\n elif is_integer_dtype(self.categories) and -1 in self._codes:\n return self.categories.astype(\"object\").take(self._codes,\n fill_value=np.nan)\n return np.array(self)"} {"instruction": "Write a Python 3 function that can\nsort the values of the Categorical by category value returning a new Categorical instance.", "response": "def sort_values(self, inplace=False, ascending=True, na_position='last'):\n \"\"\"\n Sort the Categorical by category value returning a new\n Categorical by default.\n\n While an ordering is applied to the category values, sorting in this\n context refers more to organizing and grouping together based on\n matching category values. 
Thus, this function can be called on an\n unordered Categorical instance unlike the functions 'Categorical.min'\n and 'Categorical.max'.\n\n Parameters\n ----------\n inplace : bool, default False\n Do operation in place.\n ascending : bool, default True\n Order ascending. Passing False orders descending. The\n ordering parameter provides the method by which the\n category values are organized.\n na_position : {'first', 'last'} (optional, default='last')\n 'first' puts NaNs at the beginning\n 'last' puts NaNs at the end\n\n Returns\n -------\n Categorical or None\n\n See Also\n --------\n Categorical.sort\n Series.sort_values\n\n Examples\n --------\n >>> c = pd.Categorical([1, 2, 2, 1, 5])\n >>> c\n [1, 2, 2, 1, 5]\n Categories (3, int64): [1, 2, 5]\n >>> c.sort_values()\n [1, 1, 2, 2, 5]\n Categories (3, int64): [1, 2, 5]\n >>> c.sort_values(ascending=False)\n [5, 2, 2, 1, 1]\n Categories (3, int64): [1, 2, 5]\n\n Inplace sorting can be done as well:\n\n >>> c.sort_values(inplace=True)\n >>> c\n [1, 1, 2, 2, 5]\n Categories (3, int64): [1, 2, 5]\n >>>\n >>> c = pd.Categorical([1, 2, 2, 1, 5])\n\n 'sort_values' behaviour with NaNs. 
Note that 'na_position'\n is independent of the 'ascending' parameter:\n\n >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5])\n >>> c\n [NaN, 2.0, 2.0, NaN, 5.0]\n Categories (2, int64): [2, 5]\n >>> c.sort_values()\n [2.0, 2.0, 5.0, NaN, NaN]\n Categories (2, int64): [2, 5]\n >>> c.sort_values(ascending=False)\n [5.0, 2.0, 2.0, NaN, NaN]\n Categories (2, int64): [2, 5]\n >>> c.sort_values(na_position='first')\n [NaN, NaN, 2.0, 2.0, 5.0]\n Categories (2, int64): [2, 5]\n >>> c.sort_values(ascending=False, na_position='first')\n [NaN, NaN, 5.0, 2.0, 2.0]\n Categories (2, int64): [2, 5]\n \"\"\"\n inplace = validate_bool_kwarg(inplace, 'inplace')\n if na_position not in ['last', 'first']:\n msg = 'invalid na_position: {na_position!r}'\n raise ValueError(msg.format(na_position=na_position))\n\n sorted_idx = nargsort(self,\n ascending=ascending,\n na_position=na_position)\n\n if inplace:\n self._codes = self._codes[sorted_idx]\n else:\n return self._constructor(values=self._codes[sorted_idx],\n dtype=self.dtype,\n fastpath=True)"} {"instruction": "Make a summary of the following Python 3 code\ndef _values_for_rank(self):\n from pandas import Series\n if self.ordered:\n values = self.codes\n mask = values == -1\n if mask.any():\n values = values.astype('float64')\n values[mask] = np.nan\n elif self.categories.is_numeric():\n values = np.array(self)\n else:\n # reorder the categories (so rank can use the float codes)\n # instead of passing an object array to rank\n values = np.array(\n self.rename_categories(Series(self.categories).rank().values)\n )\n return values", "response": "Returns a numpy array of the values for the rank of the items in the record."} {"instruction": "Given the following Python 3 function, write the documentation\ndef fillna(self, value=None, method=None, limit=None):\n value, method = validate_fillna_kwargs(\n value, method, validate_scalar_dict_value=False\n )\n\n if value is None:\n value = np.nan\n if limit is not None:\n raise 
NotImplementedError(\"specifying a limit for fillna has not \"\n \"been implemented yet\")\n\n codes = self._codes\n\n # pad / bfill\n if method is not None:\n\n values = self.to_dense().reshape(-1, len(self))\n values = interpolate_2d(values, method, 0, None,\n value).astype(self.categories.dtype)[0]\n codes = _get_codes_for_values(values, self.categories)\n\n else:\n\n # If value is a dict or a Series (a dict value has already\n # been converted to a Series)\n if isinstance(value, ABCSeries):\n if not value[~value.isin(self.categories)].isna().all():\n raise ValueError(\"fill value must be in categories\")\n\n values_codes = _get_codes_for_values(value, self.categories)\n indexer = np.where(values_codes != -1)\n codes[indexer] = values_codes[values_codes != -1]\n\n # If value is not a dict or Series it should be a scalar\n elif is_hashable(value):\n if not isna(value) and value not in self.categories:\n raise ValueError(\"fill value must be in categories\")\n\n mask = codes == -1\n if mask.any():\n codes = codes.copy()\n if isna(value):\n codes[mask] = -1\n else:\n codes[mask] = self.categories.get_loc(value)\n\n else:\n raise TypeError('\"value\" parameter must be a scalar, dict '\n 'or Series, but you passed a '\n '\"{0}\"'.format(type(value).__name__))\n\n return self._constructor(codes, dtype=self.dtype, fastpath=True)", "response": "Fill missing values in the specified category with the specified value."} {"instruction": "Create a Python 3 function to\ntake elements from the Categorical.", "response": "def take_nd(self, indexer, allow_fill=None, fill_value=None):\n \"\"\"\n Take elements from the Categorical.\n\n Parameters\n ----------\n indexer : sequence of int\n The indices in `self` to take. The meaning of negative values in\n `indexer` depends on the value of `allow_fill`.\n allow_fill : bool, default None\n How to handle negative values in `indexer`.\n\n * False: negative values in `indices` indicate positional indices\n from the right. 
This is similar to\n :func:`numpy.take`.\n\n * True: negative values in `indices` indicate missing values\n (the default). These values are set to `fill_value`. Any other\n other negative values raise a ``ValueError``.\n\n .. versionchanged:: 0.23.0\n\n Deprecated the default value of `allow_fill`. The deprecated\n default is ``True``. In the future, this will change to\n ``False``.\n\n fill_value : object\n The value to use for `indices` that are missing (-1), when\n ``allow_fill=True``. This should be the category, i.e. a value\n in ``self.categories``, not a code.\n\n Returns\n -------\n Categorical\n This Categorical will have the same categories and ordered as\n `self`.\n\n See Also\n --------\n Series.take : Similar method for Series.\n numpy.ndarray.take : Similar method for NumPy arrays.\n\n Examples\n --------\n >>> cat = pd.Categorical(['a', 'a', 'b'])\n >>> cat\n [a, a, b]\n Categories (2, object): [a, b]\n\n Specify ``allow_fill==False`` to have negative indices mean indexing\n from the right.\n\n >>> cat.take([0, -1, -2], allow_fill=False)\n [a, b, a]\n Categories (2, object): [a, b]\n\n With ``allow_fill=True``, indices equal to ``-1`` mean \"missing\"\n values that should be filled with the `fill_value`, which is\n ``np.nan`` by default.\n\n >>> cat.take([0, -1, -1], allow_fill=True)\n [a, NaN, NaN]\n Categories (2, object): [a, b]\n\n The fill value can be specified.\n\n >>> cat.take([0, -1, -1], allow_fill=True, fill_value='a')\n [a, a, a]\n Categories (3, object): [a, b]\n\n Specifying a fill value that's not in ``self.categories``\n will raise a ``TypeError``.\n \"\"\"\n indexer = np.asarray(indexer, dtype=np.intp)\n if allow_fill is None:\n if (indexer < 0).any():\n warn(_take_msg, FutureWarning, stacklevel=2)\n allow_fill = True\n\n dtype = self.dtype\n\n if isna(fill_value):\n fill_value = -1\n elif allow_fill:\n # convert user-provided `fill_value` to codes\n if fill_value in self.categories:\n fill_value = 
self.categories.get_loc(fill_value)\n else:\n msg = (\n \"'fill_value' ('{}') is not in this Categorical's \"\n \"categories.\"\n )\n raise TypeError(msg.format(fill_value))\n\n codes = take(self._codes, indexer, allow_fill=allow_fill,\n fill_value=fill_value)\n result = type(self).from_codes(codes, dtype=dtype)\n return result"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _slice(self, slicer):\n\n # only allow 1 dimensional slicing, but can\n # in a 2-d case be passd (slice(None),....)\n if isinstance(slicer, tuple) and len(slicer) == 2:\n if not com.is_null_slice(slicer[0]):\n raise AssertionError(\"invalid slicing for a 1-ndim \"\n \"categorical\")\n slicer = slicer[1]\n\n codes = self._codes[slicer]\n return self._constructor(values=codes, dtype=self.dtype, fastpath=True)", "response": "Return a slice of the object."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _repr_categories(self):\n max_categories = (10 if get_option(\"display.max_categories\") == 0 else\n get_option(\"display.max_categories\"))\n from pandas.io.formats import format as fmt\n if len(self.categories) > max_categories:\n num = max_categories // 2\n head = fmt.format_array(self.categories[:num], None)\n tail = fmt.format_array(self.categories[-num:], None)\n category_strs = head + [\"...\"] + tail\n else:\n category_strs = fmt.format_array(self.categories, None)\n\n # Strip all leading spaces, which format_array adds for columns...\n category_strs = [x.strip() for x in category_strs]\n return category_strs", "response": "Return the base repr for the categories"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _repr_categories_info(self):\n\n category_strs = self._repr_categories()\n dtype = getattr(self.categories, 'dtype_str',\n str(self.categories.dtype))\n\n levheader = \"Categories ({length}, {dtype}): \".format(\n length=len(self.categories), 
dtype=dtype)\n width, height = get_terminal_size()\n max_width = get_option(\"display.width\") or width\n if console.in_ipython_frontend():\n # 0 = no breaks\n max_width = 0\n levstring = \"\"\n start = True\n cur_col_len = len(levheader) # header\n sep_len, sep = (3, \" < \") if self.ordered else (2, \", \")\n linesep = sep.rstrip() + \"\\n\" # remove whitespace\n for val in category_strs:\n if max_width != 0 and cur_col_len + sep_len + len(val) > max_width:\n levstring += linesep + (\" \" * (len(levheader) + 1))\n cur_col_len = len(levheader) + 1 # header + a whitespace\n elif not start:\n levstring += sep\n cur_col_len += len(val)\n levstring += val\n start = False\n # replace to simple save space by\n return levheader + \"[\" + levstring.replace(\" < ... < \", \" ... \") + \"]\"", "response": "Returns a string representation of the footer."} {"instruction": "How would you code a function in Python 3 to\ncoerce an indexer to the codes dtype", "response": "def _maybe_coerce_indexer(self, indexer):\n \"\"\"\n return an indexer coerced to the codes dtype\n \"\"\"\n if isinstance(indexer, np.ndarray) and indexer.dtype.kind == 'i':\n indexer = indexer.astype(self._codes.dtype)\n return indexer"} {"instruction": "Write a Python 3 script for\ncomputing the inverse of a categorical returning a dict of categories -> indexers", "response": "def _reverse_indexer(self):\n \"\"\"\n Compute the inverse of a categorical, returning\n a dict of categories -> indexers.\n\n *This is an internal function*\n\n Returns\n -------\n dict of categories -> indexers\n\n Example\n -------\n In [1]: c = pd.Categorical(list('aabca'))\n\n In [2]: c\n Out[2]:\n [a, a, b, c, a]\n Categories (3, object): [a, b, c]\n\n In [3]: c.categories\n Out[3]: Index(['a', 'b', 'c'], dtype='object')\n\n In [4]: c.codes\n Out[4]: array([0, 0, 1, 2, 0], dtype=int8)\n\n In [5]: c._reverse_indexer()\n Out[5]: {'a': array([0, 1, 4]), 'b': array([2]), 'c': array([3])}\n\n \"\"\"\n categories = self.categories\n r, 
counts = libalgos.groupsort_indexer(self.codes.astype('int64'),\n categories.size)\n counts = counts.cumsum()\n result = (r[start:end] for start, end in zip(counts, counts[1:]))\n result = dict(zip(categories, result))\n return result"} {"instruction": "Explain what the following Python 3 code does\ndef min(self, numeric_only=None, **kwargs):\n self.check_for_ordered('min')\n if numeric_only:\n good = self._codes != -1\n pointer = self._codes[good].min(**kwargs)\n else:\n pointer = self._codes.min(**kwargs)\n if pointer == -1:\n return np.nan\n else:\n return self.categories[pointer]", "response": "Returns the minimum value of the object."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef mode(self, dropna=True):\n\n import pandas._libs.hashtable as htable\n codes = self._codes\n if dropna:\n good = self._codes != -1\n codes = self._codes[good]\n codes = sorted(htable.mode_int64(ensure_int64(codes), dropna))\n return self._constructor(values=codes, dtype=self.dtype, fastpath=True)", "response": "Returns the mode of the Categorical."} {"instruction": "Write a Python 3 function that can\nreturn the unique values of the Categorical. Unused categories are NOT returned.", "response": "def unique(self):\n \"\"\"\n Return the ``Categorical`` which ``categories`` and ``codes`` are\n unique. 
Unused categories are NOT returned.\n\n - unordered category: values and categories are sorted by appearance\n order.\n - ordered category: values are sorted by appearance order, categories\n keeps existing order.\n\n Returns\n -------\n unique values : ``Categorical``\n\n Examples\n --------\n An unordered Categorical will return categories in the\n order of appearance.\n\n >>> pd.Categorical(list('baabc'))\n [b, a, c]\n Categories (3, object): [b, a, c]\n\n >>> pd.Categorical(list('baabc'), categories=list('abc'))\n [b, a, c]\n Categories (3, object): [b, a, c]\n\n An ordered Categorical preserves the category ordering.\n\n >>> pd.Categorical(list('baabc'),\n ... categories=list('abc'),\n ... ordered=True)\n [b, a, c]\n Categories (3, object): [a < b < c]\n\n See Also\n --------\n unique\n CategoricalIndex.unique\n Series.unique\n\n \"\"\"\n\n # unlike np.unique, unique1d does not sort\n unique_codes = unique1d(self.codes)\n cat = self.copy()\n\n # keep nan in codes\n cat._codes = unique_codes\n\n # exclude nan from indexer for categories\n take_codes = unique_codes[unique_codes != -1]\n if self.ordered:\n take_codes = np.sort(take_codes)\n return cat.set_categories(cat.categories.take(take_codes))"} {"instruction": "Write a Python 3 function that can\nreturn True if categorical arrays are equal.", "response": "def equals(self, other):\n \"\"\"\n Returns True if categorical arrays are equal.\n\n Parameters\n ----------\n other : `Categorical`\n\n Returns\n -------\n bool\n \"\"\"\n if self.is_dtype_equal(other):\n if self.categories.equals(other.categories):\n # fastpath to avoid re-coding\n other_codes = other._codes\n else:\n other_codes = _recode_for_categories(other.codes,\n other.categories,\n self.categories)\n return np.array_equal(self._codes, other_codes)\n return False"} {"instruction": "Implement a function in Python 3 to\nreturn True if the categoricals are the same dtype and same ordered .", "response": "def is_dtype_equal(self, other):\n \"\"\"\n 
Returns True if categoricals are the same dtype\n same categories, and same ordered\n\n Parameters\n ----------\n other : Categorical\n\n Returns\n -------\n bool\n \"\"\"\n\n try:\n return hash(self.dtype) == hash(other.dtype)\n except (AttributeError, TypeError):\n return False"} {"instruction": "Can you write a function in Python 3 where it\nreturns a description of this Categorical", "response": "def describe(self):\n \"\"\"\n Describes this Categorical\n\n Returns\n -------\n description: `DataFrame`\n A dataframe with frequency and counts by category.\n \"\"\"\n counts = self.value_counts(dropna=False)\n freqs = counts / float(counts.sum())\n\n from pandas.core.reshape.concat import concat\n result = concat([counts, freqs], axis=1)\n result.columns = ['counts', 'freqs']\n result.index.name = 'categories'\n\n return result"} {"instruction": "How would you explain what the following Python 3 function does\ndef isin(self, values):\n from pandas.core.internals.construction import sanitize_array\n if not is_list_like(values):\n raise TypeError(\"only list-like objects are allowed to be passed\"\n \" to isin(), you passed a [{values_type}]\"\n .format(values_type=type(values).__name__))\n values = sanitize_array(values, None, None)\n null_mask = np.asarray(isna(values))\n code_values = self.categories.get_indexer(values)\n code_values = code_values[null_mask | (code_values >= 0)]\n return algorithms.isin(self.codes, code_values)", "response": "Check whether each element in the passed sequence of values is contained in the Categorical."} {"instruction": "Explain what the following Python 3 code does\ndef to_timedelta(arg, unit='ns', box=True, errors='raise'):\n unit = parse_timedelta_unit(unit)\n\n if errors not in ('ignore', 'raise', 'coerce'):\n raise ValueError(\"errors must be one of 'ignore', \"\n \"'raise', or 'coerce'}\")\n\n if unit in {'Y', 'y', 'M'}:\n warnings.warn(\"M and Y units are deprecated and \"\n \"will be removed in a future version.\",\n 
FutureWarning, stacklevel=2)\n\n if arg is None:\n return arg\n elif isinstance(arg, ABCSeries):\n values = _convert_listlike(arg._values, unit=unit,\n box=False, errors=errors)\n return arg._constructor(values, index=arg.index, name=arg.name)\n elif isinstance(arg, ABCIndexClass):\n return _convert_listlike(arg, unit=unit, box=box,\n errors=errors, name=arg.name)\n elif isinstance(arg, np.ndarray) and arg.ndim == 0:\n # extract array scalar and process below\n arg = arg.item()\n elif is_list_like(arg) and getattr(arg, 'ndim', 1) == 1:\n return _convert_listlike(arg, unit=unit, box=box, errors=errors)\n elif getattr(arg, 'ndim', 1) > 1:\n raise TypeError('arg must be a string, timedelta, list, tuple, '\n '1-d array, or Series')\n\n # ...so it must be a scalar value. Return scalar.\n return _coerce_scalar_to_timedelta_type(arg, unit=unit,\n box=box, errors=errors)", "response": "Convert an argument to a timedelta."} {"instruction": "Can you write a function in Python 3 where it\nconverts string r to a timedelta object.", "response": "def _coerce_scalar_to_timedelta_type(r, unit='ns', box=True, errors='raise'):\n \"\"\"Convert string 'r' to a timedelta object.\"\"\"\n\n try:\n result = Timedelta(r, unit)\n if not box:\n # explicitly view as timedelta64 for case when result is pd.NaT\n result = result.asm8.view('timedelta64[ns]')\n except ValueError:\n if errors == 'raise':\n raise\n elif errors == 'ignore':\n return r\n\n # coerce\n result = NaT\n\n return result"} {"instruction": "Can you create a Python 3 function that\nconverts a list of objects to a timedelta index object.", "response": "def _convert_listlike(arg, unit='ns', box=True, errors='raise', name=None):\n \"\"\"Convert a list of objects to a timedelta index object.\"\"\"\n\n if isinstance(arg, (list, tuple)) or not hasattr(arg, 'dtype'):\n # This is needed only to ensure that in the case where we end up\n # returning arg (errors == \"ignore\"), and where the input is a\n # generator, we return a useful 
list-like instead of a\n # used-up generator\n arg = np.array(list(arg), dtype=object)\n\n try:\n value = sequence_to_td64ns(arg, unit=unit,\n errors=errors, copy=False)[0]\n except ValueError:\n if errors == 'ignore':\n return arg\n else:\n # This else-block accounts for the cases when errors='raise'\n # and errors='coerce'. If errors == 'raise', these errors\n # should be raised. If errors == 'coerce', we shouldn't\n # expect any errors to be raised, since all parsing errors\n # cause coercion to pd.NaT. However, if an error / bug is\n # introduced that causes an Exception to be raised, we would\n # like to surface it.\n raise\n\n if box:\n from pandas import TimedeltaIndex\n value = TimedeltaIndex(value, unit='ns', name=name)\n return value"} {"instruction": "Write a Python 3 script to\ngenerate a sequence of dates corresponding to the specified time offset.", "response": "def generate_range(start=None, end=None, periods=None, offset=BDay()):\n \"\"\"\n Generates a sequence of dates corresponding to the specified time\n offset. 
Similar to dateutil.rrule except uses pandas DateOffset\n objects to represent time increments.\n\n Parameters\n ----------\n start : datetime (default None)\n end : datetime (default None)\n periods : int, (default None)\n offset : DateOffset, (default BDay())\n\n Notes\n -----\n * This method is faster for generating weekdays than dateutil.rrule\n * At least two of (start, end, periods) must be specified.\n * If both start and end are specified, the returned dates will\n satisfy start <= date <= end.\n\n Returns\n -------\n dates : generator object\n \"\"\"\n from pandas.tseries.frequencies import to_offset\n offset = to_offset(offset)\n\n start = to_datetime(start)\n end = to_datetime(end)\n\n if start and not offset.onOffset(start):\n start = offset.rollforward(start)\n\n elif end and not offset.onOffset(end):\n end = offset.rollback(end)\n\n if periods is None and end < start and offset.n >= 0:\n end = None\n periods = 0\n\n if end is None:\n end = start + (periods - 1) * offset\n\n if start is None:\n start = end - (periods - 1) * offset\n\n cur = start\n if offset.n >= 0:\n while cur <= end:\n yield cur\n\n # faster than cur + offset\n next_date = offset.apply(cur)\n if next_date <= cur:\n raise ValueError('Offset {offset} did not increment date'\n .format(offset=offset))\n cur = next_date\n else:\n while cur >= end:\n yield cur\n\n # faster than cur + offset\n next_date = offset.apply(cur)\n if next_date >= cur:\n raise ValueError('Offset {offset} did not decrement date'\n .format(offset=offset))\n cur = next_date"} {"instruction": "Make a summary of the following Python 3 code\ndef rollback(self, dt):\n dt = as_timestamp(dt)\n if not self.onOffset(dt):\n dt = dt - self.__class__(1, normalize=self.normalize, **self.kwds)\n return dt", "response": "Roll provided date backward to next offset only if not on offset."} {"instruction": "How would you implement a function in Python 3 that\nrolls provided date forward to next offset only if not on offset.", 
"response": "def rollforward(self, dt):\n \"\"\"\n Roll provided date forward to next offset only if not on offset.\n \"\"\"\n dt = as_timestamp(dt)\n if not self.onOffset(dt):\n dt = dt + self.__class__(1, normalize=self.normalize, **self.kwds)\n return dt"} {"instruction": "Implement a function in Python 3 to\nuse for moving to next business day.", "response": "def next_bday(self):\n \"\"\"\n Used for moving to next business day.\n \"\"\"\n if self.n >= 0:\n nb_offset = 1\n else:\n nb_offset = -1\n if self._prefix.startswith('C'):\n # CustomBusinessHour\n return CustomBusinessDay(n=nb_offset,\n weekmask=self.weekmask,\n holidays=self.holidays,\n calendar=self.calendar)\n else:\n return BusinessDay(n=nb_offset)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _next_opening_time(self, other):\n if not self.next_bday.onOffset(other):\n other = other + self.next_bday\n else:\n if self.n >= 0 and self.start < other.time():\n other = other + self.next_bday\n elif self.n < 0 and other.time() < self.start:\n other = other + self.next_bday\n return datetime(other.year, other.month, other.day,\n self.start.hour, self.start.minute)", "response": "Return next opening time for business day."} {"instruction": "Make a summary of the following Python 3 code\ndef _get_business_hours_by_sec(self):\n if self._get_daytime_flag:\n # create dummy datetime to calculate businesshours in a day\n dtstart = datetime(2014, 4, 1, self.start.hour, self.start.minute)\n until = datetime(2014, 4, 1, self.end.hour, self.end.minute)\n return (until - dtstart).total_seconds()\n else:\n dtstart = datetime(2014, 4, 1, self.start.hour, self.start.minute)\n until = datetime(2014, 4, 2, self.end.hour, self.end.minute)\n return (until - dtstart).total_seconds()", "response": "Return business hours in a day by seconds."} {"instruction": "Explain what the following Python 3 code does\ndef rollback(self, dt):\n if not self.onOffset(dt):\n businesshours = 
self._get_business_hours_by_sec\n if self.n >= 0:\n dt = self._prev_opening_time(\n dt) + timedelta(seconds=businesshours)\n else:\n dt = self._next_opening_time(\n dt) + timedelta(seconds=businesshours)\n return dt", "response": "Roll provided date backward to next offset only if not on offset."} {"instruction": "How would you explain what the following Python 3 function does\ndef rollforward(self, dt):\n if not self.onOffset(dt):\n if self.n >= 0:\n return self._next_opening_time(dt)\n else:\n return self._prev_opening_time(dt)\n return dt", "response": "Roll provided date forward to next offset only if not on offset."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _onOffset(self, dt, businesshours):\n # if self.normalize and not _is_normalized(dt):\n # return False\n # Valid BH can be on the different BusinessDay during midnight\n # Distinguish by the time spent from previous opening time\n if self.n >= 0:\n op = self._prev_opening_time(dt)\n else:\n op = self._next_opening_time(dt)\n span = (dt - op).total_seconds()\n if span <= businesshours:\n return True\n else:\n return False", "response": "Check if the given date is on the same business day."} {"instruction": "Make a summary of the following Python 3 code\ndef cbday_roll(self):\n cbday = CustomBusinessDay(n=self.n, normalize=False, **self.kwds)\n\n if self._prefix.endswith('S'):\n # MonthBegin\n roll_func = cbday.rollforward\n else:\n # MonthEnd\n roll_func = cbday.rollback\n return roll_func", "response": "Define default roll function to be called in apply method."} {"instruction": "Write a Python 3 script for\ndefining default roll function to be called in apply method.", "response": "def month_roll(self):\n \"\"\"\n Define default roll function to be called in apply method.\n \"\"\"\n if self._prefix.endswith('S'):\n # MonthBegin\n roll_func = self.m_offset.rollback\n else:\n # MonthEnd\n roll_func = self.m_offset.rollforward\n return roll_func"} {"instruction": "How 
would you implement a function in Python 3 that\nadds days portion of offset to DatetimeIndex i.", "response": "def _apply_index_days(self, i, roll):\n \"\"\"\n Add days portion of offset to DatetimeIndex i.\n\n Parameters\n ----------\n i : DatetimeIndex\n roll : ndarray[int64_t]\n\n Returns\n -------\n result : DatetimeIndex\n \"\"\"\n nanos = (roll % 2) * Timedelta(days=self.day_of_month - 1).value\n return i + nanos.astype('timedelta64[ns]')"} {"instruction": "Write a Python 3 script for\nadding self to the given DatetimeIndex, specialized for case where self.weekday is non-null. Parameters ---------- dtindex : DatetimeIndex Returns ------- result : DatetimeIndex", "response": "def _end_apply_index(self, dtindex):\n \"\"\"\n Add self to the given DatetimeIndex, specialized for case where\n self.weekday is non-null.\n\n Parameters\n ----------\n dtindex : DatetimeIndex\n\n Returns\n -------\n result : DatetimeIndex\n \"\"\"\n off = dtindex.to_perioddelta('D')\n\n base, mult = libfrequencies.get_freq_code(self.freqstr)\n base_period = dtindex.to_period(base)\n if not isinstance(base_period._data, np.ndarray):\n # unwrap PeriodIndex --> PeriodArray\n base_period = base_period._data\n\n if self.n > 0:\n # when adding, dates on end roll to next\n normed = dtindex - off + Timedelta(1, 'D') - Timedelta(1, 'ns')\n roll = np.where(base_period.to_timestamp(how='end') == normed,\n self.n, self.n - 1)\n # integer-array addition on PeriodIndex is deprecated,\n # so we use _addsub_int_array directly\n shifted = base_period._addsub_int_array(roll, operator.add)\n base = shifted.to_timestamp(how='end')\n else:\n # integer addition on PeriodIndex is deprecated,\n # so we use _time_shift directly\n roll = self.n\n base = base_period._time_shift(roll).to_timestamp(how='end')\n\n return base + off + Timedelta(1, 'ns') - Timedelta(1, 'D')"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _get_offset_day(self, other):\n mstart = datetime(other.year, 
other.month, 1)\n wday = mstart.weekday()\n shift_days = (self.weekday - wday) % 7\n return 1 + shift_days + self.week * 7", "response": "Find the day in the same month as other that has the same weekday as self. weekday as other."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _get_offset_day(self, other):\n dim = ccalendar.get_days_in_month(other.year, other.month)\n mend = datetime(other.year, other.month, dim)\n wday = mend.weekday()\n shift_days = (wday - self.weekday) % 7\n return dim - shift_days", "response": "Find the day in the same month as self. weekday as other that has the same weekday as self. weekday as other. weekday and self. weekday is the last such day in the month."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _rollback_to_year(self, other):\n num_qtrs = 0\n\n norm = Timestamp(other).tz_localize(None)\n start = self._offset.rollback(norm)\n # Note: start <= norm and self._offset.onOffset(start)\n\n if start < norm:\n # roll adjustment\n qtr_lens = self.get_weeks(norm)\n\n # check thet qtr_lens is consistent with self._offset addition\n end = liboffsets.shift_day(start, days=7 * sum(qtr_lens))\n assert self._offset.onOffset(end), (start, end, qtr_lens)\n\n tdelta = norm - start\n for qlen in qtr_lens:\n if qlen * 7 <= tdelta.days:\n num_qtrs += 1\n tdelta -= Timedelta(days=qlen * 7)\n else:\n break\n else:\n tdelta = Timedelta(0)\n\n # Note: we always have tdelta.value >= 0\n return start, num_qtrs, tdelta", "response": "Roll other back to the most recent date that was on a fiscal year - end and return the date of that year - end the number of full quarters that elapsed between that year - end and other."} {"instruction": "Make a summary of the following Python 3 code\ndef concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,\n keys=None, levels=None, names=None, verify_integrity=False,\n sort=None, copy=True):\n \"\"\"\n Concatenate pandas objects along a 
particular axis with optional set logic\n along the other axes.\n\n Can also add a layer of hierarchical indexing on the concatenation axis,\n which may be useful if the labels are the same (or overlapping) on\n the passed axis number.\n\n Parameters\n ----------\n objs : a sequence or mapping of Series, DataFrame, or Panel objects\n If a dict is passed, the sorted keys will be used as the `keys`\n argument, unless it is passed, in which case the values will be\n selected (see below). Any None objects will be dropped silently unless\n they are all None in which case a ValueError will be raised.\n axis : {0/'index', 1/'columns'}, default 0\n The axis to concatenate along.\n join : {'inner', 'outer'}, default 'outer'\n How to handle indexes on other axis (or axes).\n join_axes : list of Index objects\n Specific indexes to use for the other n - 1 axes instead of performing\n inner/outer set logic.\n ignore_index : bool, default False\n If True, do not use the index values along the concatenation axis. The\n resulting axis will be labeled 0, ..., n - 1. This is useful if you are\n concatenating objects where the concatenation axis does not have\n meaningful indexing information. Note the index values on the other\n axes are still respected in the join.\n keys : sequence, default None\n If multiple levels passed, should contain tuples. Construct\n hierarchical index using the passed keys as the outermost level.\n levels : list of sequences, default None\n Specific levels (unique values) to use for constructing a\n MultiIndex. Otherwise they will be inferred from the keys.\n names : list, default None\n Names for the levels in the resulting hierarchical index.\n verify_integrity : bool, default False\n Check whether the new concatenated axis contains duplicates. This can\n be very expensive relative to the actual data concatenation.\n sort : bool, default None\n Sort non-concatenation axis if it is not already aligned when `join`\n is 'outer'. 
The current default of sorting is deprecated and will\n change to not-sorting in a future version of pandas.\n\n Explicitly pass ``sort=True`` to silence the warning and sort.\n Explicitly pass ``sort=False`` to silence the warning and not sort.\n\n This has no effect when ``join='inner'``, which already preserves\n the order of the non-concatenation axis.\n\n .. versionadded:: 0.23.0\n\n copy : bool, default True\n If False, do not copy data unnecessarily.\n\n Returns\n -------\n object, type of objs\n When concatenating all ``Series`` along the index (axis=0), a\n ``Series`` is returned. When ``objs`` contains at least one\n ``DataFrame``, a ``DataFrame`` is returned. When concatenating along\n the columns (axis=1), a ``DataFrame`` is returned.\n\n See Also\n --------\n Series.append : Concatenate Series.\n DataFrame.append : Concatenate DataFrames.\n DataFrame.join : Join DataFrames using indexes.\n DataFrame.merge : Merge DataFrames by indexes or columns.\n\n Notes\n -----\n The keys, levels, and names arguments are all optional.\n\n A walkthrough of how this method fits in with other tools for combining\n pandas objects can be found `here\n `__.\n\n Examples\n --------\n Combine two ``Series``.\n\n >>> s1 = pd.Series(['a', 'b'])\n >>> s2 = pd.Series(['c', 'd'])\n >>> pd.concat([s1, s2])\n 0 a\n 1 b\n 0 c\n 1 d\n dtype: object\n\n Clear the existing index and reset it in the result\n by setting the ``ignore_index`` option to ``True``.\n\n >>> pd.concat([s1, s2], ignore_index=True)\n 0 a\n 1 b\n 2 c\n 3 d\n dtype: object\n\n Add a hierarchical index at the outermost level of\n the data with the ``keys`` option.\n\n >>> pd.concat([s1, s2], keys=['s1', 's2'])\n s1 0 a\n 1 b\n s2 0 c\n 1 d\n dtype: object\n\n Label the index keys you create with the ``names`` option.\n\n >>> pd.concat([s1, s2], keys=['s1', 's2'],\n ... 
names=['Series name', 'Row ID'])\n Series name Row ID\n s1 0 a\n 1 b\n s2 0 c\n 1 d\n dtype: object\n\n Combine two ``DataFrame`` objects with identical columns.\n\n >>> df1 = pd.DataFrame([['a', 1], ['b', 2]],\n ... columns=['letter', 'number'])\n >>> df1\n letter number\n 0 a 1\n 1 b 2\n >>> df2 = pd.DataFrame([['c', 3], ['d', 4]],\n ... columns=['letter', 'number'])\n >>> df2\n letter number\n 0 c 3\n 1 d 4\n >>> pd.concat([df1, df2])\n letter number\n 0 a 1\n 1 b 2\n 0 c 3\n 1 d 4\n\n Combine ``DataFrame`` objects with overlapping columns\n and return everything. Columns outside the intersection will\n be filled with ``NaN`` values.\n\n >>> df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],\n ... columns=['letter', 'number', 'animal'])\n >>> df3\n letter number animal\n 0 c 3 cat\n 1 d 4 dog\n >>> pd.concat([df1, df3], sort=False)\n letter number animal\n 0 a 1 NaN\n 1 b 2 NaN\n 0 c 3 cat\n 1 d 4 dog\n\n Combine ``DataFrame`` objects with overlapping columns\n and return only those that are shared by passing ``inner`` to\n the ``join`` keyword argument.\n\n >>> pd.concat([df1, df3], join=\"inner\")\n letter number\n 0 a 1\n 1 b 2\n 0 c 3\n 1 d 4\n\n Combine ``DataFrame`` objects horizontally along the x axis by\n passing in ``axis=1``.\n\n >>> df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],\n ... 
columns=['animal', 'name'])\n >>> pd.concat([df1, df4], axis=1)\n letter number animal name\n 0 a 1 bird polly\n 1 b 2 monkey george\n\n Prevent the result from including duplicate index values with the\n ``verify_integrity`` option.\n\n >>> df5 = pd.DataFrame([1], index=['a'])\n >>> df5\n 0\n a 1\n >>> df6 = pd.DataFrame([2], index=['a'])\n >>> df6\n 0\n a 2\n >>> pd.concat([df5, df6], verify_integrity=True)\n Traceback (most recent call last):\n ...\n ValueError: Indexes have overlapping values: ['a']\n \"\"\"\n op = _Concatenator(objs, axis=axis, join_axes=join_axes,\n ignore_index=ignore_index, join=join,\n keys=keys, levels=levels, names=names,\n verify_integrity=verify_integrity,\n copy=copy, sort=sort)\n return op.get_result()", "response": "Concatenate pandas objects along a particular axis with optional set of logic."} {"instruction": "Write a Python 3 function that can\nreturn index to be used along concatenation axis.", "response": "def _get_concat_axis(self):\n \"\"\"\n Return index to be used along concatenation axis.\n \"\"\"\n if self._is_series:\n if self.axis == 0:\n indexes = [x.index for x in self.objs]\n elif self.ignore_index:\n idx = ibase.default_index(len(self.objs))\n return idx\n elif self.keys is None:\n names = [None] * len(self.objs)\n num = 0\n has_names = False\n for i, x in enumerate(self.objs):\n if not isinstance(x, Series):\n raise TypeError(\"Cannot concatenate type 'Series' \"\n \"with object of type {type!r}\"\n .format(type=type(x).__name__))\n if x.name is not None:\n names[i] = x.name\n has_names = True\n else:\n names[i] = num\n num += 1\n if has_names:\n return Index(names)\n else:\n return ibase.default_index(len(self.objs))\n else:\n return ensure_index(self.keys).set_names(self.names)\n else:\n indexes = [x._data.axes[self.axis] for x in self.objs]\n\n if self.ignore_index:\n idx = ibase.default_index(sum(len(i) for i in indexes))\n return idx\n\n if self.keys is None:\n concat_axis = _concat_indexes(indexes)\n else:\n 
concat_axis = _make_concat_multiindex(indexes, self.keys,\n self.levels, self.names)\n\n self._maybe_check_integrity(concat_axis)\n\n return concat_axis"} {"instruction": "How would you implement a function in Python 3 that\ncomputes the vectorized membership of x in y.", "response": "def _in(x, y):\n \"\"\"Compute the vectorized membership of ``x in y`` if possible, otherwise\n use Python.\n \"\"\"\n try:\n return x.isin(y)\n except AttributeError:\n if is_list_like(x):\n try:\n return y.isin(x)\n except AttributeError:\n pass\n return x in y"} {"instruction": "Can you write a function in Python 3 where it\ncomputes the vectorized membership of x not in y.", "response": "def _not_in(x, y):\n \"\"\"Compute the vectorized membership of ``x not in y`` if possible,\n otherwise use Python.\n \"\"\"\n try:\n return ~x.isin(y)\n except AttributeError:\n if is_list_like(x):\n try:\n return ~y.isin(x)\n except AttributeError:\n pass\n return x not in y"} {"instruction": "How would you explain what the following Python 3 function does\ndef _cast_inplace(terms, acceptable_dtypes, dtype):\n dt = np.dtype(dtype)\n for term in terms:\n if term.type in acceptable_dtypes:\n continue\n\n try:\n new_value = term.value.astype(dt)\n except AttributeError:\n new_value = dt.type(term.value)\n term.update(new_value)", "response": "Cast an expression inplace."} {"instruction": "Given the following Python 3 function, write the documentation\ndef update(self, value):\n key = self.name\n\n # if it's a variable name (otherwise a constant)\n if isinstance(key, str):\n self.env.swapkey(self.local_name, key, new_value=value)\n\n self.value = value", "response": "Update the value of the local variable."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef evaluate(self, env, engine, parser, term_type, eval_in_python):\n if engine == 'python':\n res = self(env)\n else:\n # recurse over the left/right nodes\n left = self.lhs.evaluate(env, engine=engine, 
parser=parser,\n term_type=term_type,\n eval_in_python=eval_in_python)\n right = self.rhs.evaluate(env, engine=engine, parser=parser,\n term_type=term_type,\n eval_in_python=eval_in_python)\n\n # base cases\n if self.op in eval_in_python:\n res = self.func(left.value, right.value)\n else:\n from pandas.core.computation.eval import eval\n res = eval(self, local_dict=env, engine=engine,\n parser=parser)\n\n name = env.add_tmp(res)\n return term_type(name, env=env)", "response": "Evaluate a binary operation before being passed to the engine."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef convert_values(self):\n def stringify(value):\n if self.encoding is not None:\n encoder = partial(pprint_thing_encoded,\n encoding=self.encoding)\n else:\n encoder = pprint_thing\n return encoder(value)\n\n lhs, rhs = self.lhs, self.rhs\n\n if is_term(lhs) and lhs.is_datetime and is_term(rhs) and rhs.is_scalar:\n v = rhs.value\n if isinstance(v, (int, float)):\n v = stringify(v)\n v = Timestamp(_ensure_decoded(v))\n if v.tz is not None:\n v = v.tz_convert('UTC')\n self.rhs.update(v)\n\n if is_term(rhs) and rhs.is_datetime and is_term(lhs) and lhs.is_scalar:\n v = lhs.value\n if isinstance(v, (int, float)):\n v = stringify(v)\n v = Timestamp(_ensure_decoded(v))\n if v.tz is not None:\n v = v.tz_convert('UTC')\n self.lhs.update(v)", "response": "Convert datetimes to a comparable value in an expression."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef crosstab(index, columns, values=None, rownames=None, colnames=None,\n aggfunc=None, margins=False, margins_name='All', dropna=True,\n normalize=False):\n \"\"\"\n Compute a simple cross tabulation of two (or more) factors. 
By default\n computes a frequency table of the factors unless an array of values and an\n aggregation function are passed.\n\n Parameters\n ----------\n index : array-like, Series, or list of arrays/Series\n Values to group by in the rows.\n columns : array-like, Series, or list of arrays/Series\n Values to group by in the columns.\n values : array-like, optional\n Array of values to aggregate according to the factors.\n Requires `aggfunc` be specified.\n rownames : sequence, default None\n If passed, must match number of row arrays passed.\n colnames : sequence, default None\n If passed, must match number of column arrays passed.\n aggfunc : function, optional\n If specified, requires `values` be specified as well.\n margins : bool, default False\n Add row/column margins (subtotals).\n margins_name : str, default 'All'\n Name of the row/column that will contain the totals\n when margins is True.\n\n .. versionadded:: 0.21.0\n\n dropna : bool, default True\n Do not include columns whose entries are all NaN.\n normalize : bool, {'all', 'index', 'columns'}, or {0,1}, default False\n Normalize by dividing all values by the sum of values.\n\n - If passed 'all' or `True`, will normalize over all values.\n - If passed 'index' will normalize over each row.\n - If passed 'columns' will normalize over each column.\n - If margins is `True`, will also normalize margin values.\n\n .. 
versionadded:: 0.18.1\n\n Returns\n -------\n DataFrame\n Cross tabulation of the data.\n\n See Also\n --------\n DataFrame.pivot : Reshape data based on column values.\n pivot_table : Create a pivot table as a DataFrame.\n\n Notes\n -----\n Any Series passed will have their name attributes used unless row or column\n names for the cross-tabulation are specified.\n\n Any input passed containing Categorical data will have **all** of its\n categories included in the cross-tabulation, even if the actual data does\n not contain any instances of a particular category.\n\n In the event that there aren't overlapping indexes an empty DataFrame will\n be returned.\n\n Examples\n --------\n >>> a = np.array([\"foo\", \"foo\", \"foo\", \"foo\", \"bar\", \"bar\",\n ... \"bar\", \"bar\", \"foo\", \"foo\", \"foo\"], dtype=object)\n >>> b = np.array([\"one\", \"one\", \"one\", \"two\", \"one\", \"one\",\n ... \"one\", \"two\", \"two\", \"two\", \"one\"], dtype=object)\n >>> c = np.array([\"dull\", \"dull\", \"shiny\", \"dull\", \"dull\", \"shiny\",\n ... \"shiny\", \"dull\", \"shiny\", \"shiny\", \"shiny\"],\n ... dtype=object)\n >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])\n b one two\n c dull shiny dull shiny\n a\n bar 1 2 1 0\n foo 2 2 1 2\n\n Here 'c' and 'f' are not represented in the data and will not be\n shown in the output because dropna is True by default. 
Set\n dropna=False to preserve categories with no data.\n\n >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])\n >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f'])\n >>> pd.crosstab(foo, bar)\n col_0 d e\n row_0\n a 1 0\n b 0 1\n >>> pd.crosstab(foo, bar, dropna=False)\n col_0 d e f\n row_0\n a 1 0 0\n b 0 1 0\n c 0 0 0\n \"\"\"\n\n index = com.maybe_make_list(index)\n columns = com.maybe_make_list(columns)\n\n rownames = _get_names(index, rownames, prefix='row')\n colnames = _get_names(columns, colnames, prefix='col')\n\n common_idx = _get_objs_combined_axis(index + columns, intersect=True,\n sort=False)\n\n data = {}\n data.update(zip(rownames, index))\n data.update(zip(colnames, columns))\n\n if values is None and aggfunc is not None:\n raise ValueError(\"aggfunc cannot be used without values.\")\n\n if values is not None and aggfunc is None:\n raise ValueError(\"values cannot be used without an aggfunc.\")\n\n from pandas import DataFrame\n df = DataFrame(data, index=common_idx)\n if values is None:\n df['__dummy__'] = 0\n kwargs = {'aggfunc': len, 'fill_value': 0}\n else:\n df['__dummy__'] = values\n kwargs = {'aggfunc': aggfunc}\n\n table = df.pivot_table('__dummy__', index=rownames, columns=colnames,\n margins=margins, margins_name=margins_name,\n dropna=dropna, **kwargs)\n\n # Post-process\n if normalize is not False:\n table = _normalize(table, normalize=normalize, margins=margins,\n margins_name=margins_name)\n\n return table", "response": "Compute a cross tabulation of two or more factors."} {"instruction": "Given the following Python 3 function, write the documentation\ndef _shape(self, df):\n\n row, col = df.shape\n return row + df.columns.nlevels, col + df.index.nlevels", "response": "Calculate table chape considering index levels."} {"instruction": "How would you code a function in Python 3 to\ncalculate appropriate figure size based on left and right data.", "response": "def _get_cells(self, left, right, vertical):\n 
\"\"\"\n Calculate appropriate figure size based on left and right data.\n \"\"\"\n\n if vertical:\n # calculate required number of cells\n vcells = max(sum(self._shape(l)[0] for l in left),\n self._shape(right)[0])\n hcells = (max(self._shape(l)[1] for l in left) +\n self._shape(right)[1])\n else:\n vcells = max([self._shape(l)[0] for l in left] +\n [self._shape(right)[0]])\n hcells = sum([self._shape(l)[1] for l in left] +\n [self._shape(right)[1]])\n return hcells, vcells"} {"instruction": "How would you code a function in Python 3 to\nplot left and right DataFrames in specified layout.", "response": "def plot(self, left, right, labels=None, vertical=True):\n \"\"\"\n Plot left / right DataFrames in specified layout.\n\n Parameters\n ----------\n left : list of DataFrames before operation is applied\n right : DataFrame of operation result\n labels : list of str to be drawn as titles of left DataFrames\n vertical : bool\n If True, use vertical layout. If False, use horizontal layout.\n \"\"\"\n import matplotlib.pyplot as plt\n import matplotlib.gridspec as gridspec\n\n if not isinstance(left, list):\n left = [left]\n left = [self._conv(l) for l in left]\n right = self._conv(right)\n\n hcells, vcells = self._get_cells(left, right, vertical)\n\n if vertical:\n figsize = self.cell_width * hcells, self.cell_height * vcells\n else:\n # include margin for titles\n figsize = self.cell_width * hcells, self.cell_height * vcells\n fig = plt.figure(figsize=figsize)\n\n if vertical:\n gs = gridspec.GridSpec(len(left), hcells)\n # left\n max_left_cols = max(self._shape(l)[1] for l in left)\n max_left_rows = max(self._shape(l)[0] for l in left)\n for i, (l, label) in enumerate(zip(left, labels)):\n ax = fig.add_subplot(gs[i, 0:max_left_cols])\n self._make_table(ax, l, title=label,\n height=1.0 / max_left_rows)\n # right\n ax = plt.subplot(gs[:, max_left_cols:])\n self._make_table(ax, right, title='Result', height=1.05 / vcells)\n fig.subplots_adjust(top=0.9, bottom=0.05, 
left=0.05, right=0.95)\n else:\n max_rows = max(self._shape(df)[0] for df in left + [right])\n height = 1.0 / np.max(max_rows)\n gs = gridspec.GridSpec(1, hcells)\n # left\n i = 0\n for l, label in zip(left, labels):\n sp = self._shape(l)\n ax = fig.add_subplot(gs[0, i:i + sp[1]])\n self._make_table(ax, l, title=label, height=height)\n i += sp[1]\n # right\n ax = plt.subplot(gs[0, i:])\n self._make_table(ax, right, title='Result', height=height)\n fig.subplots_adjust(top=0.85, bottom=0.05, left=0.05, right=0.95)\n\n return fig"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _conv(self, data):\n if isinstance(data, pd.Series):\n if data.name is None:\n data = data.to_frame(name='')\n else:\n data = data.to_frame()\n data = data.fillna('NaN')\n return data", "response": "Convert each input to appropriate for table outplot"} {"instruction": "Implement a Python 3 function for\nbinning values into discrete intervals. Use `cut` when you need to segment and sort data values into bins. This function is also useful for going from a continuous variable to a categorical variable. For example, `cut` could convert ages to groups of age ranges. Supports binning into an equal number of bins, or a pre-specified array of bins. Parameters ---------- x : array-like The input array to be binned. Must be 1-dimensional. bins : int, sequence of scalars, or IntervalIndex The criteria to bin by. * int : Defines the number of equal-width bins in the range of `x`. The range of `x` is extended by .1% on each side to include the minimum and maximum values of `x`. * sequence of scalars : Defines the bin edges allowing for non-uniform width. No extension of the range of `x` is done. * IntervalIndex : Defines the exact bins to be used. Note that IntervalIndex for `bins` must be non-overlapping. right : bool, default True Indicates whether `bins` includes the rightmost edge or not. 
If ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]`` indicate (1,2], (2,3], (3,4]. This argument is ignored when `bins` is an IntervalIndex. labels : array or bool, optional Specifies the labels for the returned bins. Must be the same length as the resulting bins. If False, returns only integer indicators of the bins. This affects the type of the output container (see below). This argument is ignored when `bins` is an IntervalIndex. retbins : bool, default False Whether to return the bins or not. Useful when bins is provided as a scalar. precision : int, default 3 The precision at which to store and display the bins labels. include_lowest : bool, default False Whether the first interval should be left-inclusive or not. duplicates : {default 'raise', 'drop'}, optional If bin edges are not unique, raise ValueError or drop non-uniques. .. versionadded:: 0.23.0 Returns ------- out : Categorical, Series, or ndarray An array-like object representing the respective bin for each value of `x`. The type depends on the value of `labels`. * True (default) : returns a Series for Series `x` or a Categorical for all other inputs. The values stored within are Interval dtype. * sequence of scalars : returns a Series for Series `x` or a Categorical for all other inputs. The values stored within are whatever the type in the sequence is. * False : returns an ndarray of integers. bins : numpy.ndarray or IntervalIndex. The computed or specified bins. Only returned when `retbins=True`. For scalar or sequence `bins`, this is an ndarray with the computed bins. If set `duplicates=drop`, `bins` will drop non-unique bin. For an IntervalIndex `bins`, this is equal to `bins`. See Also -------- qcut : Discretize variable into equal-sized buckets based on rank or based on sample quantiles. Categorical : Array type for storing data that come from a fixed set of values. Series : One-dimensional array with axis labels (including time series). 
IntervalIndex : Immutable Index implementing an ordered, sliceable set. Notes ----- Any NA values will be NA in the result. Out of bounds values will be NA in the resulting Series or Categorical object. Examples -------- Discretize into three equal-sized bins. >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3) ... # doctest: +ELLIPSIS [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ... Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ... >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True) ... # doctest: +ELLIPSIS ([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ... Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ... array([0.994, 3. , 5. , 7. ])) Discovers the same bins, but assign them specific labels. Notice that the returned Categorical's categories are `labels` and is ordered. >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), ... 3, labels=[\"bad\", \"medium\", \"good\"]) [bad, good, medium, medium, good, bad] Categories (3, object): [bad < medium < good] ``labels=False`` implies you just want the bins back. >>> pd.cut([0, 1, 1, 2], bins=4, labels=False) array([0, 1, 1, 3]) Passing a Series as an input returns a Series with categorical dtype: >>> s = pd.Series(np.array([2, 4, 6, 8, 10]), ... index=['a', 'b', 'c', 'd', 'e']) >>> pd.cut(s, 3) ... # doctest: +ELLIPSIS a (1.992, 4.667] b (1.992, 4.667] c (4.667, 7.333] d (7.333, 10.0] e (7.333, 10.0] dtype: category Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, ... Passing a Series as an input returns a Series with mapping value. It is used to map numerically to intervals based on bins. >>> s = pd.Series(np.array([2, 4, 6, 8, 10]), ... index=['a', 'b', 'c', 'd', 'e']) >>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False) ... # doctest: +ELLIPSIS (a 0.0 b 1.0 c 2.0 d 3.0 e 4.0 dtype: float64, array([0, 2, 4, 6, 8])) Use `drop` optional when bins is not unique >>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True, ... 
right=False, duplicates='drop') ... # doctest: +ELLIPSIS (a 0.0 b 1.0 c 2.0 d 3.0 e 3.0 dtype: float64, array([0, 2, 4, 6, 8])) Passing an IntervalIndex for `bins` results in those categories exactly. Notice that values not covered by the IntervalIndex are set to NaN. 0 is to the left of the first bin (which is closed on the right), and 1.5 falls between two bins. >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)]) >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins) [NaN, (0, 1], NaN, (2, 3], (4, 5]] Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]]", "response": "def cut(x, bins, right=True, labels=None, retbins=False, precision=3,\n include_lowest=False, duplicates='raise'):\n \"\"\"\n Bin values into discrete intervals.\n\n Use `cut` when you need to segment and sort data values into bins. This\n function is also useful for going from a continuous variable to a\n categorical variable. For example, `cut` could convert ages to groups of\n age ranges. Supports binning into an equal number of bins, or a\n pre-specified array of bins.\n\n Parameters\n ----------\n x : array-like\n The input array to be binned. Must be 1-dimensional.\n bins : int, sequence of scalars, or IntervalIndex\n The criteria to bin by.\n\n * int : Defines the number of equal-width bins in the range of `x`. The\n range of `x` is extended by .1% on each side to include the minimum\n and maximum values of `x`.\n * sequence of scalars : Defines the bin edges allowing for non-uniform\n width. No extension of the range of `x` is done.\n * IntervalIndex : Defines the exact bins to be used. Note that\n IntervalIndex for `bins` must be non-overlapping.\n\n right : bool, default True\n Indicates whether `bins` includes the rightmost edge or not. If\n ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]``\n indicate (1,2], (2,3], (3,4]. This argument is ignored when\n `bins` is an IntervalIndex.\n labels : array or bool, optional\n Specifies the labels for the returned bins. 
Must be the same length as\n the resulting bins. If False, returns only integer indicators of the\n bins. This affects the type of the output container (see below).\n This argument is ignored when `bins` is an IntervalIndex.\n retbins : bool, default False\n Whether to return the bins or not. Useful when bins is provided\n as a scalar.\n precision : int, default 3\n The precision at which to store and display the bins labels.\n include_lowest : bool, default False\n Whether the first interval should be left-inclusive or not.\n duplicates : {default 'raise', 'drop'}, optional\n If bin edges are not unique, raise ValueError or drop non-uniques.\n\n .. versionadded:: 0.23.0\n\n Returns\n -------\n out : Categorical, Series, or ndarray\n An array-like object representing the respective bin for each value\n of `x`. The type depends on the value of `labels`.\n\n * True (default) : returns a Series for Series `x` or a\n Categorical for all other inputs. The values stored within\n are Interval dtype.\n\n * sequence of scalars : returns a Series for Series `x` or a\n Categorical for all other inputs. The values stored within\n are whatever the type in the sequence is.\n\n * False : returns an ndarray of integers.\n\n bins : numpy.ndarray or IntervalIndex.\n The computed or specified bins. Only returned when `retbins=True`.\n For scalar or sequence `bins`, this is an ndarray with the computed\n bins. If set `duplicates=drop`, `bins` will drop non-unique bin. For\n an IntervalIndex `bins`, this is equal to `bins`.\n\n See Also\n --------\n qcut : Discretize variable into equal-sized buckets based on rank\n or based on sample quantiles.\n Categorical : Array type for storing data that come from a\n fixed set of values.\n Series : One-dimensional array with axis labels (including time series).\n IntervalIndex : Immutable Index implementing an ordered, sliceable set.\n\n Notes\n -----\n Any NA values will be NA in the result. 
Out of bounds values will be NA in\n the resulting Series or Categorical object.\n\n Examples\n --------\n Discretize into three equal-sized bins.\n\n >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3)\n ... # doctest: +ELLIPSIS\n [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...\n Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ...\n\n >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)\n ... # doctest: +ELLIPSIS\n ([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...\n Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ...\n array([0.994, 3. , 5. , 7. ]))\n\n Discovers the same bins, but assign them specific labels. Notice that\n the returned Categorical's categories are `labels` and is ordered.\n\n >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]),\n ... 3, labels=[\"bad\", \"medium\", \"good\"])\n [bad, good, medium, medium, good, bad]\n Categories (3, object): [bad < medium < good]\n\n ``labels=False`` implies you just want the bins back.\n\n >>> pd.cut([0, 1, 1, 2], bins=4, labels=False)\n array([0, 1, 1, 3])\n\n Passing a Series as an input returns a Series with categorical dtype:\n\n >>> s = pd.Series(np.array([2, 4, 6, 8, 10]),\n ... index=['a', 'b', 'c', 'd', 'e'])\n >>> pd.cut(s, 3)\n ... # doctest: +ELLIPSIS\n a (1.992, 4.667]\n b (1.992, 4.667]\n c (4.667, 7.333]\n d (7.333, 10.0]\n e (7.333, 10.0]\n dtype: category\n Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, ...\n\n Passing a Series as an input returns a Series with mapping value.\n It is used to map numerically to intervals based on bins.\n\n >>> s = pd.Series(np.array([2, 4, 6, 8, 10]),\n ... index=['a', 'b', 'c', 'd', 'e'])\n >>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False)\n ... # doctest: +ELLIPSIS\n (a 0.0\n b 1.0\n c 2.0\n d 3.0\n e 4.0\n dtype: float64, array([0, 2, 4, 6, 8]))\n\n Use `drop` optional when bins is not unique\n\n >>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True,\n ... 
right=False, duplicates='drop')\n ... # doctest: +ELLIPSIS\n (a 0.0\n b 1.0\n c 2.0\n d 3.0\n e 3.0\n dtype: float64, array([0, 2, 4, 6, 8]))\n\n Passing an IntervalIndex for `bins` results in those categories exactly.\n Notice that values not covered by the IntervalIndex are set to NaN. 0\n is to the left of the first bin (which is closed on the right), and 1.5\n falls between two bins.\n\n >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])\n >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins)\n [NaN, (0, 1], NaN, (2, 3], (4, 5]]\n Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]]\n \"\"\"\n # NOTE: this binning code is changed a bit from histogram for var(x) == 0\n\n # for handling the cut for datetime and timedelta objects\n x_is_series, series_index, name, x = _preprocess_for_cut(x)\n x, dtype = _coerce_to_type(x)\n\n if not np.iterable(bins):\n if is_scalar(bins) and bins < 1:\n raise ValueError(\"`bins` should be a positive integer.\")\n\n try: # for array-like\n sz = x.size\n except AttributeError:\n x = np.asarray(x)\n sz = x.size\n\n if sz == 0:\n raise ValueError('Cannot cut empty array')\n\n rng = (nanops.nanmin(x), nanops.nanmax(x))\n mn, mx = [mi + 0.0 for mi in rng]\n\n if np.isinf(mn) or np.isinf(mx):\n # GH 24314\n raise ValueError('cannot specify integer `bins` when input data '\n 'contains infinity')\n elif mn == mx: # adjust end points before binning\n mn -= .001 * abs(mn) if mn != 0 else .001\n mx += .001 * abs(mx) if mx != 0 else .001\n bins = np.linspace(mn, mx, bins + 1, endpoint=True)\n else: # adjust end points after binning\n bins = np.linspace(mn, mx, bins + 1, endpoint=True)\n adj = (mx - mn) * 0.001 # 0.1% of the range\n if right:\n bins[0] -= adj\n else:\n bins[-1] += adj\n\n elif isinstance(bins, IntervalIndex):\n if bins.is_overlapping:\n raise ValueError('Overlapping IntervalIndex is not accepted.')\n\n else:\n if is_datetime64tz_dtype(bins):\n bins = np.asarray(bins, dtype=_NS_DTYPE)\n else:\n bins = 
np.asarray(bins)\n bins = _convert_bin_to_numeric_type(bins, dtype)\n\n # GH 26045: cast to float64 to avoid an overflow\n if (np.diff(bins.astype('float64')) < 0).any():\n raise ValueError('bins must increase monotonically.')\n\n fac, bins = _bins_to_cuts(x, bins, right=right, labels=labels,\n precision=precision,\n include_lowest=include_lowest,\n dtype=dtype,\n duplicates=duplicates)\n\n return _postprocess_for_cut(fac, bins, retbins, x_is_series,\n series_index, name, dtype)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _coerce_to_type(x):\n dtype = None\n\n if is_datetime64tz_dtype(x):\n dtype = x.dtype\n elif is_datetime64_dtype(x):\n x = to_datetime(x)\n dtype = np.dtype('datetime64[ns]')\n elif is_timedelta64_dtype(x):\n x = to_timedelta(x)\n dtype = np.dtype('timedelta64[ns]')\n\n if dtype is not None:\n # GH 19768: force NaT to NaN during integer conversion\n x = np.where(x.notna(), x.view(np.int64), np.nan)\n\n return x, dtype", "response": "Coerce data to the correct type."} {"instruction": "Write a Python 3 function for\nconverting a list of bins to numeric type.", "response": "def _convert_bin_to_numeric_type(bins, dtype):\n \"\"\"\n if the passed bin is of datetime/timedelta type,\n this method converts it to integer\n\n Parameters\n ----------\n bins : list-like of bins\n dtype : dtype of data\n\n Raises\n ------\n ValueError if bins are not of a compat dtype to dtype\n \"\"\"\n bins_dtype = infer_dtype(bins, skipna=False)\n if is_timedelta64_dtype(dtype):\n if bins_dtype in ['timedelta', 'timedelta64']:\n bins = to_timedelta(bins).view(np.int64)\n else:\n raise ValueError(\"bins must be of timedelta64 dtype\")\n elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):\n if bins_dtype in ['datetime', 'datetime64']:\n bins = to_datetime(bins).view(np.int64)\n else:\n raise ValueError(\"bins must be of datetime64 dtype\")\n\n return bins"} {"instruction": "Can you generate a brief explanation for the 
following Python 3 code\ndef _convert_bin_to_datelike_type(bins, dtype):\n if is_datetime64tz_dtype(dtype):\n bins = to_datetime(bins.astype(np.int64),\n utc=True).tz_convert(dtype.tz)\n elif is_datetime_or_timedelta_dtype(dtype):\n bins = Index(bins.astype(np.int64), dtype=dtype)\n return bins", "response": "Convert bin to a DatetimeIndex or TimedeltaIndex if the orginal dtype is\n datelike\n "} {"instruction": "Can you implement a function in Python 3 that\nformats the labels for the node.", "response": "def _format_labels(bins, precision, right=True,\n include_lowest=False, dtype=None):\n \"\"\" based on the dtype, return our labels \"\"\"\n\n closed = 'right' if right else 'left'\n\n if is_datetime64tz_dtype(dtype):\n formatter = partial(Timestamp, tz=dtype.tz)\n adjust = lambda x: x - Timedelta('1ns')\n elif is_datetime64_dtype(dtype):\n formatter = Timestamp\n adjust = lambda x: x - Timedelta('1ns')\n elif is_timedelta64_dtype(dtype):\n formatter = Timedelta\n adjust = lambda x: x - Timedelta('1ns')\n else:\n precision = _infer_precision(precision, bins)\n formatter = lambda x: _round_frac(x, precision)\n adjust = lambda x: x - 10 ** (-precision)\n\n breaks = [formatter(b) for b in bins]\n labels = IntervalIndex.from_breaks(breaks, closed=closed)\n\n if right and include_lowest:\n # we will adjust the left hand side by precision to\n # account that we are all right closed\n v = adjust(labels[0].left)\n\n i = IntervalIndex([Interval(v, labels[0].right, closed='right')])\n labels = i.append(labels[1:])\n\n return labels"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _preprocess_for_cut(x):\n x_is_series = isinstance(x, Series)\n series_index = None\n name = None\n\n if x_is_series:\n series_index = x.index\n name = x.name\n\n # Check that the passed array is a Pandas or Numpy object\n # We don't want to strip away a Pandas data-type here (e.g. 
datetimetz)\n ndim = getattr(x, 'ndim', None)\n if ndim is None:\n x = np.asarray(x)\n if x.ndim != 1:\n raise ValueError(\"Input array must be 1 dimensional\")\n\n return x_is_series, series_index, name, x", "response": "handles preprocessing for cut where we convert the passed array to array strip the index information and store it\n separately"} {"instruction": "How would you code a function in Python 3 to\nhandle post processing for the cut method where we combine the index information if the originally passed datatype was a series", "response": "def _postprocess_for_cut(fac, bins, retbins, x_is_series,\n series_index, name, dtype):\n \"\"\"\n handles post processing for the cut method where\n we combine the index information if the originally passed\n datatype was a series\n \"\"\"\n if x_is_series:\n fac = Series(fac, index=series_index, name=name)\n\n if not retbins:\n return fac\n\n bins = _convert_bin_to_datelike_type(bins, dtype)\n\n return fac, bins"} {"instruction": "Create a Python 3 function to\nround the fractional part of the given number to the given precision.", "response": "def _round_frac(x, precision):\n \"\"\"\n Round the fractional part of the given number\n \"\"\"\n if not np.isfinite(x) or x == 0:\n return x\n else:\n frac, whole = np.modf(x)\n if whole == 0:\n digits = -int(np.floor(np.log10(abs(frac)))) - 1 + precision\n else:\n digits = precision\n return np.around(x, digits)"} {"instruction": "Implement a Python 3 function for\ninferring an appropriate precision for _round_frac", "response": "def _infer_precision(base_precision, bins):\n \"\"\"Infer an appropriate precision for _round_frac\n \"\"\"\n for precision in range(base_precision, 20):\n levels = [_round_frac(b, precision) for b in bins]\n if algos.unique(levels).size == bins.size:\n return precision\n return base_precision"} {"instruction": "Write a Python 3 script for\ntrying to detect the most capable encoding supported by the console.", "response": "def 
detect_console_encoding():\n \"\"\"\n Try to find the most capable encoding supported by the console.\n slightly modified from the way IPython handles the same issue.\n \"\"\"\n global _initial_defencoding\n\n encoding = None\n try:\n encoding = sys.stdout.encoding or sys.stdin.encoding\n except (AttributeError, IOError):\n pass\n\n # try again for something better\n if not encoding or 'ascii' in encoding.lower():\n try:\n encoding = locale.getpreferredencoding()\n except Exception:\n pass\n\n # when all else fails. this will usually be \"ascii\"\n if not encoding or 'ascii' in encoding.lower():\n encoding = sys.getdefaultencoding()\n\n # GH#3360, save the reported defencoding at import time\n # MPL backends may change it. Make available for debugging.\n if not _initial_defencoding:\n _initial_defencoding = sys.getdefaultencoding()\n\n return encoding"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _check_arg_length(fname, args, max_fname_arg_count, compat_args):\n if max_fname_arg_count < 0:\n raise ValueError(\"'max_fname_arg_count' must be non-negative\")\n\n if len(args) > len(compat_args):\n max_arg_count = len(compat_args) + max_fname_arg_count\n actual_arg_count = len(args) + max_fname_arg_count\n argument = 'argument' if max_arg_count == 1 else 'arguments'\n\n raise TypeError(\n \"{fname}() takes at most {max_arg} {argument} \"\n \"({given_arg} given)\".format(\n fname=fname, max_arg=max_arg_count,\n argument=argument, given_arg=actual_arg_count))", "response": "Checks whether args has at most max_arg_count arguments. 
Raises a TypeError if that is not the case."} {"instruction": "How would you implement a function in Python 3 that\nchecks that the keys in arg_val_dict are mapped to their respective base classes and raises a helpful ValueError if they are not.", "response": "def _check_for_default_values(fname, arg_val_dict, compat_args):\n \"\"\"\n Check that the keys in `arg_val_dict` are mapped to their\n default values as specified in `compat_args`.\n\n Note that this function is to be called only when it has been\n checked that arg_val_dict.keys() is a subset of compat_args\n\n \"\"\"\n for key in arg_val_dict:\n # try checking equality directly with '=' operator,\n # as comparison may have been overridden for the left\n # hand object\n try:\n v1 = arg_val_dict[key]\n v2 = compat_args[key]\n\n # check for None-ness otherwise we could end up\n # comparing a numpy array vs None\n if (v1 is not None and v2 is None) or \\\n (v1 is None and v2 is not None):\n match = False\n else:\n match = (v1 == v2)\n\n if not is_bool(match):\n raise ValueError(\"'match' is not a boolean\")\n\n # could not compare them directly, so try comparison\n # using the 'is' operator\n except ValueError:\n match = (arg_val_dict[key] is compat_args[key])\n\n if not match:\n raise ValueError((\"the '{arg}' parameter is not \"\n \"supported in the pandas \"\n \"implementation of {fname}()\".\n format(fname=fname, arg=key)))"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef validate_args(fname, args, max_fname_arg_count, compat_args):\n _check_arg_length(fname, args, max_fname_arg_count, compat_args)\n\n # We do this so that we can provide a more informative\n # error message about the parameters that we are not\n # supporting in the pandas implementation of 'fname'\n kwargs = dict(zip(compat_args, args))\n _check_for_default_values(fname, kwargs, compat_args)", "response": "Checks whether the arguments passed into a function are valid."} {"instruction": "Here you have a 
function in Python 3, explain what it does\ndef _check_for_invalid_keys(fname, kwargs, compat_args):\n # set(dict) --> set of the dictionary's keys\n diff = set(kwargs) - set(compat_args)\n\n if diff:\n bad_arg = list(diff)[0]\n raise TypeError((\"{fname}() got an unexpected \"\n \"keyword argument '{arg}'\".\n format(fname=fname, arg=bad_arg)))", "response": "Checks whether the kwargs contains any keys that are not\n in compat_args and raises a TypeError if there is one."} {"instruction": "Implement a Python 3 function for\nchecking whether the kwargs passed to the function fname are valid and whether or not they are set to their default values.", "response": "def validate_kwargs(fname, kwargs, compat_args):\n \"\"\"\n Checks whether parameters passed to the **kwargs argument in a\n function `fname` are valid parameters as specified in `*compat_args`\n and whether or not they are set to their default values.\n\n Parameters\n ----------\n fname: str\n The name of the function being passed the `**kwargs` parameter\n\n kwargs: dict\n The `**kwargs` parameter passed into `fname`\n\n compat_args: dict\n A dictionary of keys that `kwargs` is allowed to have and their\n associated default values\n\n Raises\n ------\n TypeError if `kwargs` contains keys not in `compat_args`\n ValueError if `kwargs` contains keys in `compat_args` that do not\n map to the default values specified in `compat_args`\n\n \"\"\"\n kwds = kwargs.copy()\n _check_for_invalid_keys(fname, kwargs, compat_args)\n _check_for_default_values(fname, kwds, compat_args)"} {"instruction": "How would you explain what the following Python 3 function does\ndef validate_args_and_kwargs(fname, args, kwargs,\n max_fname_arg_count,\n compat_args):\n \"\"\"\n Checks whether parameters passed to the *args and **kwargs argument in a\n function `fname` are valid parameters as specified in `*compat_args`\n and whether or not they are set to their default values.\n\n Parameters\n ----------\n fname: str\n The name of the 
function being passed the `**kwargs` parameter\n\n args: tuple\n The `*args` parameter passed into a function\n\n kwargs: dict\n The `**kwargs` parameter passed into `fname`\n\n max_fname_arg_count: int\n The minimum number of arguments that the function `fname`\n requires, excluding those in `args`. Used for displaying\n appropriate error messages. Must be non-negative.\n\n compat_args: OrderedDict\n A ordered dictionary of keys that `kwargs` is allowed to\n have and their associated default values. Note that if there\n is only one key, a generic dict can be passed in as well.\n\n Raises\n ------\n TypeError if `args` contains more values than there are\n `compat_args` OR `kwargs` contains keys not in `compat_args`\n ValueError if `args` contains values not at the default value (`None`)\n `kwargs` contains keys in `compat_args` that do not map to the default\n value as specified in `compat_args`\n\n See Also\n --------\n validate_args : Purely args validation.\n validate_kwargs : Purely kwargs validation.\n\n \"\"\"\n # Check that the total number of arguments passed in (i.e.\n # args and kwargs) does not exceed the length of compat_args\n _check_arg_length(fname, args + tuple(kwargs.values()),\n max_fname_arg_count, compat_args)\n\n # Check there is no overlap with the positional and keyword\n # arguments, similar to what is done in actual Python functions\n args_dict = dict(zip(compat_args, args))\n\n for key in args_dict:\n if key in kwargs:\n raise TypeError(\"{fname}() got multiple values for keyword \"\n \"argument '{arg}'\".format(fname=fname, arg=key))\n\n kwargs.update(args_dict)\n validate_kwargs(fname, kwargs, compat_args)", "response": "Validate the arguments and kwargs of a function fname."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef validate_bool_kwarg(value, arg_name):\n if not (is_bool(value) or value is None):\n raise ValueError('For argument \"{arg}\" expected type bool, received '\n 'type 
{typ}.'.format(arg=arg_name,\n typ=type(value).__name__))\n return value", "response": "Validates that the value passed in arg_name is of type bool."} {"instruction": "Given the following Python 3 function, write the documentation\ndef validate_fillna_kwargs(value, method, validate_scalar_dict_value=True):\n from pandas.core.missing import clean_fill_method\n\n if value is None and method is None:\n raise ValueError(\"Must specify a fill 'value' or 'method'.\")\n elif value is None and method is not None:\n method = clean_fill_method(method)\n\n elif value is not None and method is None:\n if validate_scalar_dict_value and isinstance(value, (list, tuple)):\n raise TypeError('\"value\" parameter must be a scalar or dict, but '\n 'you passed a \"{0}\"'.format(type(value).__name__))\n\n elif value is not None and method is not None:\n raise ValueError(\"Cannot specify both 'value' and 'method'.\")\n\n return value, method", "response": "Validate the keyword arguments passed to the fillna method."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef resample(obj, kind=None, **kwds):\n tg = TimeGrouper(**kwds)\n return tg._get_resampler(obj, kind=kind)", "response": "Create a TimeGrouper and return our resampler."} {"instruction": "Implement a function in Python 3 to\nreturn our appropriate resampler when grouping as well.", "response": "def get_resampler_for_grouping(groupby, rule, how=None, fill_method=None,\n limit=None, kind=None, **kwargs):\n \"\"\"\n Return our appropriate resampler when grouping as well.\n \"\"\"\n\n # .resample uses 'on' similar to how .groupby uses 'key'\n kwargs['key'] = kwargs.pop('on', None)\n\n tg = TimeGrouper(freq=rule, **kwargs)\n resampler = tg._get_resampler(groupby.obj, kind=kind)\n r = resampler._get_resampler_for_grouping(groupby=groupby)\n return _maybe_process_deprecations(r,\n how=how,\n fill_method=fill_method,\n limit=limit)"} {"instruction": "Can you write a function in Python 3 where it\nadjusts 
the first and last timestamps to the preceeding Timestamps that resides on the provided offset.", "response": "def _get_timestamp_range_edges(first, last, offset, closed='left', base=0):\n \"\"\"\n Adjust the `first` Timestamp to the preceeding Timestamp that resides on\n the provided offset. Adjust the `last` Timestamp to the following\n Timestamp that resides on the provided offset. Input Timestamps that\n already reside on the offset will be adjusted depending on the type of\n offset and the `closed` parameter.\n\n Parameters\n ----------\n first : pd.Timestamp\n The beginning Timestamp of the range to be adjusted.\n last : pd.Timestamp\n The ending Timestamp of the range to be adjusted.\n offset : pd.DateOffset\n The dateoffset to which the Timestamps will be adjusted.\n closed : {'right', 'left'}, default None\n Which side of bin interval is closed.\n base : int, default 0\n The \"origin\" of the adjusted Timestamps.\n\n Returns\n -------\n A tuple of length 2, containing the adjusted pd.Timestamp objects.\n \"\"\"\n if isinstance(offset, Tick):\n if isinstance(offset, Day):\n # _adjust_dates_anchored assumes 'D' means 24H, but first/last\n # might contain a DST transition (23H, 24H, or 25H).\n # So \"pretend\" the dates are naive when adjusting the endpoints\n tz = first.tz\n first = first.tz_localize(None)\n last = last.tz_localize(None)\n\n first, last = _adjust_dates_anchored(first, last, offset,\n closed=closed, base=base)\n if isinstance(offset, Day):\n first = first.tz_localize(tz)\n last = last.tz_localize(tz)\n return first, last\n\n else:\n first = first.normalize()\n last = last.normalize()\n\n if closed == 'left':\n first = Timestamp(offset.rollback(first))\n else:\n first = Timestamp(first - offset)\n\n last = Timestamp(last + offset)\n\n return first, last"} {"instruction": "Explain what the following Python 3 code does\ndef _get_period_range_edges(first, last, offset, closed='left', base=0):\n if not all(isinstance(obj, pd.Period) for obj in 
[first, last]):\n raise TypeError(\"'first' and 'last' must be instances of type Period\")\n\n # GH 23882\n first = first.to_timestamp()\n last = last.to_timestamp()\n adjust_first = not offset.onOffset(first)\n adjust_last = offset.onOffset(last)\n\n first, last = _get_timestamp_range_edges(first, last, offset,\n closed=closed, base=base)\n\n first = (first + adjust_first * offset).to_period(offset)\n last = (last - adjust_last * offset).to_period(offset)\n return first, last", "response": "Adjusts the provided first and last Periods to the respective Period of\n the given offset that encompasses them."} {"instruction": "Explain what the following Python 3 code does\ndef asfreq(obj, freq, method=None, how=None, normalize=False, fill_value=None):\n if isinstance(obj.index, PeriodIndex):\n if method is not None:\n raise NotImplementedError(\"'method' argument is not supported\")\n\n if how is None:\n how = 'E'\n\n new_obj = obj.copy()\n new_obj.index = obj.index.asfreq(freq, how=how)\n\n elif len(obj.index) == 0:\n new_obj = obj.copy()\n new_obj.index = obj.index._shallow_copy(freq=to_offset(freq))\n\n else:\n dti = date_range(obj.index[0], obj.index[-1], freq=freq)\n dti.name = obj.index.name\n new_obj = obj.reindex(dti, method=method, fill_value=fill_value)\n if normalize:\n new_obj.index = new_obj.index.normalize()\n\n return new_obj", "response": "Utility frequency conversion method for Series / DataFrame."} {"instruction": "How would you explain what the following Python 3 function does\ndef _from_selection(self):\n # upsampling and PeriodIndex resampling do not work\n # with selection, this state used to catch and raise an error\n return (self.groupby is not None and\n (self.groupby.key is not None or\n self.groupby.level is not None))", "response": "Is the resampling from a DataFrame column or MultiIndex level or PeriodIndex level?"} {"instruction": "Make a summary of the following Python 3 code\ndef _get_binner(self):\n\n binner, bins, binlabels = 
self._get_binner_for_time()\n bin_grouper = BinGrouper(bins, binlabels, indexer=self.groupby.indexer)\n return binner, bin_grouper", "response": "Create the BinGrouper and return it."} {"instruction": "Explain what the following Python 3 code does\ndef transform(self, arg, *args, **kwargs):\n return self._selected_obj.groupby(self.groupby).transform(\n arg, *args, **kwargs)", "response": "Returns a Series with the transformed values."} {"instruction": "How would you explain what the following Python 3 function does\ndef _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs):\n\n if grouper is None:\n self._set_binner()\n grouper = self.grouper\n\n obj = self._selected_obj\n\n grouped = groupby(obj, by=None, grouper=grouper, axis=self.axis)\n\n try:\n if isinstance(obj, ABCDataFrame) and callable(how):\n # Check if the function is reducing or not.\n result = grouped._aggregate_item_by_item(how, *args, **kwargs)\n else:\n result = grouped.aggregate(how, *args, **kwargs)\n except Exception:\n\n # we have a non-reducing function\n # try to evaluate\n result = grouped.apply(how, *args, **kwargs)\n\n result = self._apply_loffset(result)\n return self._wrap_result(result)", "response": "Re - evaluate the obj with a groupby aggregation."} {"instruction": "Given the following Python 3 function, write the documentation\ndef _apply_loffset(self, result):\n\n needs_offset = (\n isinstance(self.loffset, (DateOffset, timedelta,\n np.timedelta64)) and\n isinstance(result.index, DatetimeIndex) and\n len(result.index) > 0\n )\n\n if needs_offset:\n result.index = result.index + self.loffset\n\n self.loffset = None\n return result", "response": "Applies the loffset to the result."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _get_resampler_for_grouping(self, groupby, **kwargs):\n return self._resampler_for_grouping(self, groupby=groupby, **kwargs)", "response": "Returns the correct class for resampling with groupby."} {"instruction": 
"How would you explain what the following Python 3 function does\ndef interpolate(self, method='linear', axis=0, limit=None, inplace=False,\n limit_direction='forward', limit_area=None,\n downcast=None, **kwargs):\n \"\"\"\n Interpolate values according to different methods.\n\n .. versionadded:: 0.18.1\n \"\"\"\n result = self._upsample(None)\n return result.interpolate(method=method, axis=axis, limit=limit,\n inplace=inplace,\n limit_direction=limit_direction,\n limit_area=limit_area,\n downcast=downcast, **kwargs)", "response": "Interpolate the current object."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef std(self, ddof=1, *args, **kwargs):\n nv.validate_resampler_func('std', args, kwargs)\n return self._downsample('std', ddof=ddof)", "response": "Compute standard deviation of groups excluding missing values."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef var(self, ddof=1, *args, **kwargs):\n nv.validate_resampler_func('var', args, kwargs)\n return self._downsample('var', ddof=ddof)", "response": "Compute variance of groups excluding missing values."} {"instruction": "How would you code a function in Python 3 to\ndispatch to _upsample and apply the function f on the object.", "response": "def _apply(self, f, grouper=None, *args, **kwargs):\n \"\"\"\n Dispatch to _upsample; we are stripping all of the _upsample kwargs and\n performing the original function call on the grouped object.\n \"\"\"\n\n def func(x):\n x = self._shallow_copy(x, groupby=self.groupby)\n\n if isinstance(f, str):\n return getattr(x, f)(**kwargs)\n\n return x.apply(f, *args, **kwargs)\n\n result = self._groupby.apply(func)\n return self._wrap_result(result)"} {"instruction": "How would you explain what the following Python 3 function does\ndef _downsample(self, how, **kwargs):\n self._set_binner()\n how = self._is_cython_func(how) or how\n ax = self.ax\n obj = self._selected_obj\n\n if not 
len(ax):\n # reset to the new freq\n obj = obj.copy()\n obj.index.freq = self.freq\n return obj\n\n # do we have a regular frequency\n if ax.freq is not None or ax.inferred_freq is not None:\n\n if len(self.grouper.binlabels) > len(ax) and how is None:\n\n # let's do an asfreq\n return self.asfreq()\n\n # we are downsampling\n # we want to call the actual grouper method here\n result = obj.groupby(\n self.grouper, axis=self.axis).aggregate(how, **kwargs)\n\n result = self._apply_loffset(result)\n return self._wrap_result(result)", "response": "Downsampling the cython mapped function."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _adjust_binner_for_upsample(self, binner):\n if self.closed == 'right':\n binner = binner[1:]\n else:\n binner = binner[:-1]\n return binner", "response": "Adjust our binner when upsampling."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _get_resampler(self, obj, kind=None):\n self._set_grouper(obj)\n\n ax = self.ax\n if isinstance(ax, DatetimeIndex):\n return DatetimeIndexResampler(obj,\n groupby=self,\n kind=kind,\n axis=self.axis)\n elif isinstance(ax, PeriodIndex) or kind == 'period':\n return PeriodIndexResampler(obj,\n groupby=self,\n kind=kind,\n axis=self.axis)\n elif isinstance(ax, TimedeltaIndex):\n return TimedeltaIndexResampler(obj,\n groupby=self,\n axis=self.axis)\n\n raise TypeError(\"Only valid with DatetimeIndex, \"\n \"TimedeltaIndex or PeriodIndex, \"\n \"but got an instance of %r\" % type(ax).__name__)", "response": "Returns a Resampler object for the given object and kind."} {"instruction": "Explain what the following Python 3 code does\ndef hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,\n categorize=True):\n \"\"\"\n Return a data hash of the Index/Series/DataFrame\n\n .. 
versionadded:: 0.19.2\n\n Parameters\n ----------\n index : boolean, default True\n include the index in the hash (if Series/DataFrame)\n encoding : string, default 'utf8'\n encoding for data & key when strings\n hash_key : string key to encode, default to _default_hash_key\n categorize : bool, default True\n Whether to first categorize object arrays before hashing. This is more\n efficient when the array contains duplicate values.\n\n .. versionadded:: 0.20.0\n\n Returns\n -------\n Series of uint64, same length as the object\n \"\"\"\n from pandas import Series\n if hash_key is None:\n hash_key = _default_hash_key\n\n if isinstance(obj, ABCMultiIndex):\n return Series(hash_tuples(obj, encoding, hash_key),\n dtype='uint64', copy=False)\n\n if isinstance(obj, ABCIndexClass):\n h = hash_array(obj.values, encoding, hash_key,\n categorize).astype('uint64', copy=False)\n h = Series(h, index=obj, dtype='uint64', copy=False)\n elif isinstance(obj, ABCSeries):\n h = hash_array(obj.values, encoding, hash_key,\n categorize).astype('uint64', copy=False)\n if index:\n index_iter = (hash_pandas_object(obj.index,\n index=False,\n encoding=encoding,\n hash_key=hash_key,\n categorize=categorize).values\n for _ in [None])\n arrays = itertools.chain([h], index_iter)\n h = _combine_hash_arrays(arrays, 2)\n\n h = Series(h, index=obj.index, dtype='uint64', copy=False)\n\n elif isinstance(obj, ABCDataFrame):\n hashes = (hash_array(series.values) for _, series in obj.iteritems())\n num_items = len(obj.columns)\n if index:\n index_hash_generator = (hash_pandas_object(obj.index,\n index=False,\n encoding=encoding,\n hash_key=hash_key,\n categorize=categorize).values # noqa\n for _ in [None])\n num_items += 1\n hashes = itertools.chain(hashes, index_hash_generator)\n h = _combine_hash_arrays(hashes, num_items)\n\n h = Series(h, index=obj.index, dtype='uint64', copy=False)\n else:\n raise TypeError(\"Unexpected type for hashing %s\" % type(obj))\n return h", "response": "Hash a pandas 
object to a Series or MultiIndex."} {"instruction": "Given the following Python 3 function, write the documentation\ndef hash_tuples(vals, encoding='utf8', hash_key=None):\n is_tuple = False\n if isinstance(vals, tuple):\n vals = [vals]\n is_tuple = True\n elif not is_list_like(vals):\n raise TypeError(\"must be convertible to a list-of-tuples\")\n\n from pandas import Categorical, MultiIndex\n\n if not isinstance(vals, ABCMultiIndex):\n vals = MultiIndex.from_tuples(vals)\n\n # create a list-of-Categoricals\n vals = [Categorical(vals.codes[level],\n vals.levels[level],\n ordered=False,\n fastpath=True)\n for level in range(vals.nlevels)]\n\n # hash the list-of-ndarrays\n hashes = (_hash_categorical(cat,\n encoding=encoding,\n hash_key=hash_key)\n for cat in vals)\n h = _combine_hash_arrays(hashes, len(vals))\n if is_tuple:\n h = h[0]\n\n return h", "response": "Hash a list - of - tuples efficiently."} {"instruction": "Make a summary of the following Python 3 code\ndef hash_tuple(val, encoding='utf8', hash_key=None):\n hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key)\n for v in val)\n\n h = _combine_hash_arrays(hashes, len(val))[0]\n\n return h", "response": "Hash a single tuple efficiently"} {"instruction": "Explain what the following Python 3 code does\ndef _hash_categorical(c, encoding, hash_key):\n # Convert ExtensionArrays to ndarrays\n values = np.asarray(c.categories.values)\n hashed = hash_array(values, encoding, hash_key,\n categorize=False)\n\n # we have uint64, as we don't directly support missing values\n # we don't want to use take_nd which will coerce to float\n # instead, directly construct the result with a\n # max(np.uint64) as the missing value indicator\n #\n # TODO: GH 15362\n\n mask = c.isna()\n if len(hashed):\n result = hashed.take(c.codes)\n else:\n result = np.zeros(len(mask), dtype='uint64')\n\n if mask.any():\n result[mask] = np.iinfo(np.uint64).max\n\n return result", "response": "Hash a Categorical by hashing its 
categories and then mapping the codes\n to the hashes\n "} {"instruction": "Can you create a Python 3 function that\ngives a 1d array of integers return an array of deterministic integers.", "response": "def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):\n \"\"\"\n Given a 1d array, return an array of deterministic integers.\n\n .. versionadded:: 0.19.2\n\n Parameters\n ----------\n vals : ndarray, Categorical\n encoding : string, default 'utf8'\n encoding for data & key when strings\n hash_key : string key to encode, default to _default_hash_key\n categorize : bool, default True\n Whether to first categorize object arrays before hashing. This is more\n efficient when the array contains duplicate values.\n\n .. versionadded:: 0.20.0\n\n Returns\n -------\n 1d uint64 numpy array of hash values, same length as the vals\n \"\"\"\n\n if not hasattr(vals, 'dtype'):\n raise TypeError(\"must pass a ndarray-like\")\n dtype = vals.dtype\n\n if hash_key is None:\n hash_key = _default_hash_key\n\n # For categoricals, we hash the categories, then remap the codes to the\n # hash values. 
(This check is above the complex check so that we don't ask\n # numpy if categorical is a subdtype of complex, as it will choke).\n if is_categorical_dtype(dtype):\n return _hash_categorical(vals, encoding, hash_key)\n elif is_extension_array_dtype(dtype):\n vals, _ = vals._values_for_factorize()\n dtype = vals.dtype\n\n # we'll be working with everything as 64-bit values, so handle this\n # 128-bit value early\n if np.issubdtype(dtype, np.complex128):\n return hash_array(vals.real) + 23 * hash_array(vals.imag)\n\n # First, turn whatever array this is into unsigned 64-bit ints, if we can\n # manage it.\n elif isinstance(dtype, np.bool):\n vals = vals.astype('u8')\n elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):\n vals = vals.view('i8').astype('u8', copy=False)\n elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:\n vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')\n else:\n # With repeated values, its MUCH faster to categorize object dtypes,\n # then hash and rename categories. 
We allow skipping the categorization\n # when the values are known/likely to be unique.\n if categorize:\n from pandas import factorize, Categorical, Index\n codes, categories = factorize(vals, sort=False)\n cat = Categorical(codes, Index(categories),\n ordered=False, fastpath=True)\n return _hash_categorical(cat, encoding, hash_key)\n\n try:\n vals = hashing.hash_object_array(vals, hash_key, encoding)\n except TypeError:\n # we have mixed types\n vals = hashing.hash_object_array(vals.astype(str).astype(object),\n hash_key, encoding)\n\n # Then, redistribute these 64-bit ints within the space of 64-bit ints\n vals ^= vals >> 30\n vals *= np.uint64(0xbf58476d1ce4e5b9)\n vals ^= vals >> 27\n vals *= np.uint64(0x94d049bb133111eb)\n vals ^= vals >> 31\n return vals"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _hash_scalar(val, encoding='utf8', hash_key=None):\n\n if isna(val):\n # this is to be consistent with the _hash_categorical implementation\n return np.array([np.iinfo(np.uint64).max], dtype='u8')\n\n if getattr(val, 'tzinfo', None) is not None:\n # for tz-aware datetimes, we need the underlying naive UTC value and\n # not the tz aware object or pd extension type (as\n # infer_dtype_from_scalar would do)\n if not isinstance(val, tslibs.Timestamp):\n val = tslibs.Timestamp(val)\n val = val.tz_convert(None)\n\n dtype, val = infer_dtype_from_scalar(val)\n vals = np.array([val], dtype=dtype)\n\n return hash_array(vals, hash_key=hash_key, encoding=encoding,\n categorize=False)", "response": "Hash a scalar value."} {"instruction": "Given the following Python 3 function, write the documentation\ndef _process_single_doc(self, single_doc):\n base_name, extension = os.path.splitext(single_doc)\n if extension in ('.rst', '.ipynb'):\n if os.path.exists(os.path.join(SOURCE_PATH, single_doc)):\n return single_doc\n else:\n raise FileNotFoundError('File {} not found'.format(single_doc))\n\n elif 
single_doc.startswith('pandas.'):\n try:\n obj = pandas # noqa: F821\n for name in single_doc.split('.'):\n obj = getattr(obj, name)\n except AttributeError:\n raise ImportError('Could not import {}'.format(single_doc))\n else:\n return single_doc[len('pandas.'):]\n else:\n raise ValueError(('--single={} not understood. Value should be a '\n 'valid path to a .rst or .ipynb file, or a '\n 'valid pandas object (e.g. categorical.rst or '\n 'pandas.DataFrame.head)').format(single_doc))", "response": "Process a single document and return the corresponding file path."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _run_os(*args):\n subprocess.check_call(args, stdout=sys.stdout, stderr=sys.stderr)", "response": "Execute a command as a OS terminal."} {"instruction": "Implement a Python 3 function for\ncalling sphinx to build documentation.", "response": "def _sphinx_build(self, kind):\n \"\"\"\n Call sphinx to build documentation.\n\n Attribute `num_jobs` from the class is used.\n\n Parameters\n ----------\n kind : {'html', 'latex'}\n\n Examples\n --------\n >>> DocBuilder(num_jobs=4)._sphinx_build('html')\n \"\"\"\n if kind not in ('html', 'latex'):\n raise ValueError('kind must be html or latex, '\n 'not {}'.format(kind))\n\n cmd = ['sphinx-build', '-b', kind]\n if self.num_jobs:\n cmd += ['-j', str(self.num_jobs)]\n if self.warnings_are_errors:\n cmd += ['-W', '--keep-going']\n if self.verbosity:\n cmd.append('-{}'.format('v' * self.verbosity))\n cmd += ['-d', os.path.join(BUILD_PATH, 'doctrees'),\n SOURCE_PATH, os.path.join(BUILD_PATH, kind)]\n return subprocess.call(cmd)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _open_browser(self, single_doc_html):\n url = os.path.join('file://', DOC_PATH, 'build', 'html',\n single_doc_html)\n webbrowser.open(url, new=2)", "response": "Open a browser tab showing the single_doc_html"} {"instruction": "How would you explain what the 
following Python 3 function does\ndef _get_page_title(self, page):\n fname = os.path.join(SOURCE_PATH, '{}.rst'.format(page))\n option_parser = docutils.frontend.OptionParser(\n components=(docutils.parsers.rst.Parser,))\n doc = docutils.utils.new_document(\n '',\n option_parser.get_default_values())\n with open(fname) as f:\n data = f.read()\n\n parser = docutils.parsers.rst.Parser()\n # do not generate any warning when parsing the rst\n with open(os.devnull, 'a') as f:\n doc.reporter.stream = f\n parser.parse(data, doc)\n\n section = next(node for node in doc.children\n if isinstance(node, docutils.nodes.section))\n title = next(node for node in section.children\n if isinstance(node, docutils.nodes.title))\n\n return title.astext()", "response": "Open the rst file page and extract its title."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _add_redirects(self):\n html = '''\n \n \n \n \n \n

\n The page has been moved to {title}\n

\n \n \n '''\n with open(REDIRECTS_FILE) as mapping_fd:\n reader = csv.reader(mapping_fd)\n for row in reader:\n if not row or row[0].strip().startswith('#'):\n continue\n\n path = os.path.join(BUILD_PATH,\n 'html',\n *row[0].split('/')) + '.html'\n\n try:\n title = self._get_page_title(row[1])\n except Exception:\n # the file can be an ipynb and not an rst, or docutils\n # may not be able to read the rst because it has some\n # sphinx specific stuff\n title = 'this page'\n\n if os.path.exists(path):\n raise RuntimeError((\n 'Redirection would overwrite an existing file: '\n '{}').format(path))\n\n with open(path, 'w') as moved_page_fd:\n moved_page_fd.write(\n html.format(url='{}.html'.format(row[1]),\n title=title))", "response": "Add the redirects to the build directory."} {"instruction": "Write a Python 3 script to\nclean documentation generated files.", "response": "def clean():\n \"\"\"\n Clean documentation generated files.\n \"\"\"\n shutil.rmtree(BUILD_PATH, ignore_errors=True)\n shutil.rmtree(os.path.join(SOURCE_PATH, 'reference', 'api'),\n ignore_errors=True)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef zip_html(self):\n zip_fname = os.path.join(BUILD_PATH, 'html', 'pandas.zip')\n if os.path.exists(zip_fname):\n os.remove(zip_fname)\n dirname = os.path.join(BUILD_PATH, 'html')\n fnames = os.listdir(dirname)\n os.chdir(dirname)\n self._run_os('zip',\n zip_fname,\n '-r',\n '-q',\n *fnames)", "response": "Compress HTML documentation into a zip file."} {"instruction": "Create a Python 3 function for\nrendering a DataFrame to a LaTeX tabular or longtable environment output.", "response": "def write_result(self, buf):\n \"\"\"\n Render a DataFrame to a LaTeX tabular/longtable environment output.\n \"\"\"\n\n # string representation of the columns\n if len(self.frame.columns) == 0 or len(self.frame.index) == 0:\n info_line = ('Empty {name}\\nColumns: {col}\\nIndex: {idx}'\n .format(name=type(self.frame).__name__,\n 
col=self.frame.columns,\n idx=self.frame.index))\n strcols = [[info_line]]\n else:\n strcols = self.fmt._to_str_columns()\n\n def get_col_type(dtype):\n if issubclass(dtype.type, np.number):\n return 'r'\n else:\n return 'l'\n\n # reestablish the MultiIndex that has been joined by _to_str_column\n if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex):\n out = self.frame.index.format(\n adjoin=False, sparsify=self.fmt.sparsify,\n names=self.fmt.has_index_names, na_rep=self.fmt.na_rep\n )\n\n # index.format will sparsify repeated entries with empty strings\n # so pad these with some empty space\n def pad_empties(x):\n for pad in reversed(x):\n if pad:\n break\n return [x[0]] + [i if i else ' ' * len(pad) for i in x[1:]]\n out = (pad_empties(i) for i in out)\n\n # Add empty spaces for each column level\n clevels = self.frame.columns.nlevels\n out = [[' ' * len(i[-1])] * clevels + i for i in out]\n\n # Add the column names to the last index column\n cnames = self.frame.columns.names\n if any(cnames):\n new_names = [i if i else '{}' for i in cnames]\n out[self.frame.index.nlevels - 1][:clevels] = new_names\n\n # Get rid of old multiindex column and add new ones\n strcols = out + strcols[1:]\n\n column_format = self.column_format\n if column_format is None:\n dtypes = self.frame.dtypes._values\n column_format = ''.join(map(get_col_type, dtypes))\n if self.fmt.index:\n index_format = 'l' * self.frame.index.nlevels\n column_format = index_format + column_format\n elif not isinstance(column_format, str): # pragma: no cover\n raise AssertionError('column_format must be str or unicode, '\n 'not {typ}'.format(typ=type(column_format)))\n\n if not self.longtable:\n buf.write('\\\\begin{{tabular}}{{{fmt}}}\\n'\n .format(fmt=column_format))\n buf.write('\\\\toprule\\n')\n else:\n buf.write('\\\\begin{{longtable}}{{{fmt}}}\\n'\n .format(fmt=column_format))\n buf.write('\\\\toprule\\n')\n\n ilevels = self.frame.index.nlevels\n clevels = self.frame.columns.nlevels\n 
nlevels = clevels\n if self.fmt.has_index_names and self.fmt.show_index_names:\n nlevels += 1\n strrows = list(zip(*strcols))\n self.clinebuf = []\n\n for i, row in enumerate(strrows):\n if i == nlevels and self.fmt.header:\n buf.write('\\\\midrule\\n') # End of header\n if self.longtable:\n buf.write('\\\\endhead\\n')\n buf.write('\\\\midrule\\n')\n buf.write('\\\\multicolumn{{{n}}}{{r}}{{{{Continued on next '\n 'page}}}} \\\\\\\\\\n'.format(n=len(row)))\n buf.write('\\\\midrule\\n')\n buf.write('\\\\endfoot\\n\\n')\n buf.write('\\\\bottomrule\\n')\n buf.write('\\\\endlastfoot\\n')\n if self.fmt.kwds.get('escape', True):\n # escape backslashes first\n crow = [(x.replace('\\\\', '\\\\textbackslash ')\n .replace('_', '\\\\_')\n .replace('%', '\\\\%').replace('$', '\\\\$')\n .replace('#', '\\\\#').replace('{', '\\\\{')\n .replace('}', '\\\\}').replace('~', '\\\\textasciitilde ')\n .replace('^', '\\\\textasciicircum ')\n .replace('&', '\\\\&')\n if (x and x != '{}') else '{}') for x in row]\n else:\n crow = [x if x else '{}' for x in row]\n if self.bold_rows and self.fmt.index:\n # bold row labels\n crow = ['\\\\textbf{{{x}}}'.format(x=x)\n if j < ilevels and x.strip() not in ['', '{}'] else x\n for j, x in enumerate(crow)]\n if i < clevels and self.fmt.header and self.multicolumn:\n # sum up columns to multicolumns\n crow = self._format_multicolumn(crow, ilevels)\n if (i >= nlevels and self.fmt.index and self.multirow and\n ilevels > 1):\n # sum up rows to multirows\n crow = self._format_multirow(crow, ilevels, i, strrows)\n buf.write(' & '.join(crow))\n buf.write(' \\\\\\\\\\n')\n if self.multirow and i < len(strrows) - 1:\n self._print_cline(buf, i, len(strcols))\n\n if not self.longtable:\n buf.write('\\\\bottomrule\\n')\n buf.write('\\\\end{tabular}\\n')\n else:\n buf.write('\\\\end{longtable}\\n')"} {"instruction": "Make a summary of the following Python 3 code\ndef _format_multirow(self, row, ilevels, i, rows):\n for j in range(ilevels):\n if row[j].strip():\n 
nrow = 1\n for r in rows[i + 1:]:\n if not r[j].strip():\n nrow += 1\n else:\n break\n if nrow > 1:\n # overwrite non-multirow entry\n row[j] = '\\\\multirow{{{nrow:d}}}{{*}}{{{row:s}}}'.format(\n nrow=nrow, row=row[j].strip())\n # save when to end the current block with \\cline\n self.clinebuf.append([i + nrow - 1, j + 1])\n return row", "response": "Format a multirow entry."} {"instruction": "Write a Python 3 script for\nprinting clines after multirow - blocks are finished", "response": "def _print_cline(self, buf, i, icol):\n \"\"\"\n Print clines after multirow-blocks are finished\n \"\"\"\n for cl in self.clinebuf:\n if cl[0] == i:\n buf.write('\\\\cline{{{cl:d}-{icol:d}}}\\n'\n .format(cl=cl[1], icol=icol))\n # remove entries that have been written to buffer\n self.clinebuf = [x for x in self.clinebuf if x[0] != i]"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _validate_integer(name, val, min_val=0):\n msg = \"'{name:s}' must be an integer >={min_val:d}\".format(name=name,\n min_val=min_val)\n\n if val is not None:\n if is_float(val):\n if int(val) != val:\n raise ValueError(msg)\n val = int(val)\n elif not (is_integer(val) and val >= min_val):\n raise ValueError(msg)\n\n return val", "response": "Checks whether the value of the name parameter for parsing is an integer or float. Raises a ValueError if that is not the case."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _validate_names(names):\n\n if names is not None:\n if len(names) != len(set(names)):\n msg = (\"Duplicate names specified. 
This \"\n \"will raise an error in the future.\")\n warnings.warn(msg, UserWarning, stacklevel=3)\n\n return names", "response": "Validate the names parameter."} {"instruction": "Make a summary of the following Python 3 code\ndef read_fwf(filepath_or_buffer: FilePathOrBuffer,\n colspecs='infer',\n widths=None,\n infer_nrows=100,\n **kwds):\n\n r\"\"\"\n Read a table of fixed-width formatted lines into DataFrame.\n\n Also supports optionally iterating or breaking of the file\n into chunks.\n\n Additional help can be found in the `online docs for IO Tools\n `_.\n\n Parameters\n ----------\n filepath_or_buffer : str, path object, or file-like object\n Any valid string path is acceptable. The string could be a URL. Valid\n URL schemes include http, ftp, s3, and file. For file URLs, a host is\n expected. A local file could be: file://localhost/path/to/table.csv.\n\n If you want to pass in a path object, pandas accepts either\n ``pathlib.Path`` or ``py._path.local.LocalPath``.\n\n By file-like object, we refer to objects with a ``read()`` method,\n such as a file handler (e.g. via builtin ``open`` function)\n or ``StringIO``.\n colspecs : list of tuple (int, int) or 'infer'. optional\n A list of tuples giving the extents of the fixed-width\n fields of each line as half-open intervals (i.e., [from, to[ ).\n String value 'infer' can be used to instruct the parser to try\n detecting the column specifications from the first 100 rows of\n the data which are not being skipped via skiprows (default='infer').\n widths : list of int, optional\n A list of field widths which can be used instead of 'colspecs' if\n the intervals are contiguous.\n infer_nrows : int, default 100\n The number of rows to consider when letting the parser determine the\n `colspecs`.\n\n .. 
versionadded:: 0.24.0\n **kwds : optional\n Optional keyword arguments can be passed to ``TextFileReader``.\n\n Returns\n -------\n DataFrame or TextParser\n A comma-separated values (csv) file is returned as two-dimensional\n data structure with labeled axes.\n\n See Also\n --------\n to_csv : Write DataFrame to a comma-separated values (csv) file.\n read_csv : Read a comma-separated values (csv) file into DataFrame.\n\n Examples\n --------\n >>> pd.read_fwf('data.csv') # doctest: +SKIP\n \"\"\"\n\n # Check input arguments.\n if colspecs is None and widths is None:\n raise ValueError(\"Must specify either colspecs or widths\")\n elif colspecs not in (None, 'infer') and widths is not None:\n raise ValueError(\"You must specify only one of 'widths' and \"\n \"'colspecs'\")\n\n # Compute 'colspecs' from 'widths', if specified.\n if widths is not None:\n colspecs, col = [], 0\n for w in widths:\n colspecs.append((col, col + w))\n col += w\n\n kwds['colspecs'] = colspecs\n kwds['infer_nrows'] = infer_nrows\n kwds['engine'] = 'python-fwf'\n return _read(filepath_or_buffer, kwds)", "response": "r Reads a fixed - width formatted file into a DataFrame."} {"instruction": "How would you explain what the following Python 3 function does\ndef _is_potential_multi_index(columns):\n return (len(columns) and not isinstance(columns, MultiIndex) and\n all(isinstance(c, tuple) for c in columns))", "response": "Checks whether or not the columns parameter could be converted into a MultiIndex."} {"instruction": "Can you create a Python 3 function that\nevaluates the usecols parameter of the .", "response": "def _evaluate_usecols(usecols, names):\n \"\"\"\n Check whether or not the 'usecols' parameter\n is a callable. 
If so, enumerates the 'names'\n parameter and returns a set of indices for\n each entry in 'names' that evaluates to True.\n If not a callable, returns 'usecols'.\n \"\"\"\n if callable(usecols):\n return {i for i, name in enumerate(names) if usecols(name)}\n return usecols"} {"instruction": "Can you implement a function in Python 3 that\nvalidates that all usecols are present in a given list of names. If not raises a ValueError.", "response": "def _validate_usecols_names(usecols, names):\n \"\"\"\n Validates that all usecols are present in a given\n list of names. If not, raise a ValueError that\n shows what usecols are missing.\n\n Parameters\n ----------\n usecols : iterable of usecols\n The columns to validate are present in names.\n names : iterable of names\n The column names to check against.\n\n Returns\n -------\n usecols : iterable of usecols\n The `usecols` parameter if the validation succeeds.\n\n Raises\n ------\n ValueError : Columns were missing. Error message will list them.\n \"\"\"\n missing = [c for c in usecols if c not in names]\n if len(missing) > 0:\n raise ValueError(\n \"Usecols do not match columns, \"\n \"columns expected but not found: {missing}\".format(missing=missing)\n )\n\n return usecols"} {"instruction": "Implement a Python 3 function for\nvalidating the usecols parameter.", "response": "def _validate_usecols_arg(usecols):\n \"\"\"\n Validate the 'usecols' parameter.\n\n Checks whether or not the 'usecols' parameter contains all integers\n (column selection by index), strings (column by name) or is a callable.\n Raises a ValueError if that is not the case.\n\n Parameters\n ----------\n usecols : list-like, callable, or None\n List of columns to use when parsing or a callable that can be used\n to filter a list of table columns.\n\n Returns\n -------\n usecols_tuple : tuple\n A tuple of (verified_usecols, usecols_dtype).\n\n 'verified_usecols' is either a set if an array-like is passed in or\n 'usecols' if a callable or None is 
passed in.\n\n 'usecols_dtype` is the inferred dtype of 'usecols' if an array-like\n is passed in or None if a callable or None is passed in.\n \"\"\"\n msg = (\"'usecols' must either be list-like of all strings, all unicode, \"\n \"all integers or a callable.\")\n if usecols is not None:\n if callable(usecols):\n return usecols, None\n\n if not is_list_like(usecols):\n # see gh-20529\n #\n # Ensure it is iterable container but not string.\n raise ValueError(msg)\n\n usecols_dtype = lib.infer_dtype(usecols, skipna=False)\n\n if usecols_dtype not in (\"empty\", \"integer\",\n \"string\", \"unicode\"):\n raise ValueError(msg)\n\n usecols = set(usecols)\n\n return usecols, usecols_dtype\n return usecols, None"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _validate_parse_dates_arg(parse_dates):\n msg = (\"Only booleans, lists, and \"\n \"dictionaries are accepted \"\n \"for the 'parse_dates' parameter\")\n\n if parse_dates is not None:\n if is_scalar(parse_dates):\n if not lib.is_bool(parse_dates):\n raise TypeError(msg)\n\n elif not isinstance(parse_dates, (list, dict)):\n raise TypeError(msg)\n\n return parse_dates", "response": "Validate the parse_dates parameter."} {"instruction": "Given the following Python 3 function, write the documentation\ndef _stringify_na_values(na_values):\n result = []\n for x in na_values:\n result.append(str(x))\n result.append(x)\n try:\n v = float(x)\n\n # we are like 999 here\n if v == int(v):\n v = int(v)\n result.append(\"{value}.0\".format(value=v))\n result.append(str(v))\n\n result.append(v)\n except (TypeError, ValueError, OverflowError):\n pass\n try:\n result.append(int(x))\n except (TypeError, ValueError, OverflowError):\n pass\n return set(result)", "response": "return a stringified and numeric for these values"} {"instruction": "Make a summary of the following Python 3 code\ndef _get_na_values(col, na_values, na_fvalues, keep_default_na):\n\n if isinstance(na_values, dict):\n if col in 
na_values:\n return na_values[col], na_fvalues[col]\n else:\n if keep_default_na:\n return _NA_VALUES, set()\n\n return set(), set()\n else:\n return na_values, na_fvalues", "response": "Get the NaN values for a given column."} {"instruction": "Write a Python 3 script for\nextracting and return the names index_names col_names passed_names", "response": "def _extract_multi_indexer_columns(self, header, index_names, col_names,\n passed_names=False):\n \"\"\" extract and return the names, index_names, col_names\n header is a list-of-lists returned from the parsers \"\"\"\n if len(header) < 2:\n return header[0], index_names, col_names, passed_names\n\n # the names are the tuples of the header that are not the index cols\n # 0 is the name of the index, assuming index_col is a list of column\n # numbers\n ic = self.index_col\n if ic is None:\n ic = []\n\n if not isinstance(ic, (list, tuple, np.ndarray)):\n ic = [ic]\n sic = set(ic)\n\n # clean the index_names\n index_names = header.pop(-1)\n index_names, names, index_col = _clean_index_names(index_names,\n self.index_col,\n self.unnamed_cols)\n\n # extract the columns\n field_count = len(header[0])\n\n def extract(r):\n return tuple(r[i] for i in range(field_count) if i not in sic)\n\n columns = lzip(*[extract(r) for r in header])\n names = ic + columns\n\n # If we find unnamed columns all in a single\n # level, then our header was too long.\n for n in range(len(columns[0])):\n if all(compat.to_str(c[n]) in self.unnamed_cols for c in columns):\n raise ParserError(\n \"Passed header=[{header}] are too many rows for this \"\n \"multi_index of columns\"\n .format(header=','.join(str(x) for x in self.header))\n )\n\n # Clean the column names (if we have an index_col).\n if len(ic):\n col_names = [r[0] if (len(r[0]) and\n r[0] not in self.unnamed_cols) else None\n for r in header]\n else:\n col_names = [None] * len(header)\n\n passed_names = True\n\n return names, index_names, col_names, passed_names"} {"instruction": 
"Implement a Python 3 function for\ninferring types of values from the internal data structures.", "response": "def _infer_types(self, values, na_values, try_num_bool=True):\n \"\"\"\n Infer types of values, possibly casting\n\n Parameters\n ----------\n values : ndarray\n na_values : set\n try_num_bool : bool, default try\n try to cast values to numeric (first preference) or boolean\n\n Returns:\n --------\n converted : ndarray\n na_count : int\n \"\"\"\n na_count = 0\n if issubclass(values.dtype.type, (np.number, np.bool_)):\n mask = algorithms.isin(values, list(na_values))\n na_count = mask.sum()\n if na_count > 0:\n if is_integer_dtype(values):\n values = values.astype(np.float64)\n np.putmask(values, mask, np.nan)\n return values, na_count\n\n if try_num_bool:\n try:\n result = lib.maybe_convert_numeric(values, na_values, False)\n na_count = isna(result).sum()\n except Exception:\n result = values\n if values.dtype == np.object_:\n na_count = parsers.sanitize_objects(result,\n na_values, False)\n else:\n result = values\n if values.dtype == np.object_:\n na_count = parsers.sanitize_objects(values, na_values, False)\n\n if result.dtype == np.object_ and try_num_bool:\n result = libops.maybe_convert_bool(np.asarray(values),\n true_values=self.true_values,\n false_values=self.false_values)\n\n return result, na_count"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _cast_types(self, values, cast_type, column):\n\n if is_categorical_dtype(cast_type):\n known_cats = (isinstance(cast_type, CategoricalDtype) and\n cast_type.categories is not None)\n\n if not is_object_dtype(values) and not known_cats:\n # XXX this is for consistency with\n # c-parser which parses all categories\n # as strings\n values = astype_nansafe(values, str)\n\n cats = Index(values).unique().dropna()\n values = Categorical._from_inferred_categories(\n cats, cats.get_indexer(values), cast_type,\n true_values=self.true_values)\n\n # use the EA's implementation of 
casting\n elif is_extension_array_dtype(cast_type):\n # ensure cast_type is an actual dtype and not a string\n cast_type = pandas_dtype(cast_type)\n array_type = cast_type.construct_array_type()\n try:\n return array_type._from_sequence_of_strings(values,\n dtype=cast_type)\n except NotImplementedError:\n raise NotImplementedError(\n \"Extension Array: {ea} must implement \"\n \"_from_sequence_of_strings in order \"\n \"to be used in parser methods\".format(ea=array_type))\n\n else:\n try:\n values = astype_nansafe(values, cast_type,\n copy=True, skipna=True)\n except ValueError:\n raise ValueError(\n \"Unable to convert column {column} to type \"\n \"{cast_type}\".format(\n column=column, cast_type=cast_type))\n return values", "response": "Casts the values to the specified type."} {"instruction": "Explain what the following Python 3 code does\ndef _handle_usecols(self, columns, usecols_key):\n if self.usecols is not None:\n if callable(self.usecols):\n col_indices = _evaluate_usecols(self.usecols, usecols_key)\n elif any(isinstance(u, str) for u in self.usecols):\n if len(columns) > 1:\n raise ValueError(\"If using multiple headers, usecols must \"\n \"be integers.\")\n col_indices = []\n\n for col in self.usecols:\n if isinstance(col, str):\n try:\n col_indices.append(usecols_key.index(col))\n except ValueError:\n _validate_usecols_names(self.usecols, usecols_key)\n else:\n col_indices.append(col)\n else:\n col_indices = self.usecols\n\n columns = [[n for i, n in enumerate(column) if i in col_indices]\n for column in columns]\n self._col_indices = col_indices\n return columns", "response": "Handles the usecols of the current record."} {"instruction": "Write a Python 3 script for\nchecking whether the file starts with the BOM character and remove it if it does.", "response": "def _check_for_bom(self, first_row):\n \"\"\"\n Checks whether the file begins with the BOM character.\n If it does, remove it. 
In addition, if there is quoting\n in the field subsequent to the BOM, remove it as well\n because it technically takes place at the beginning of\n the name, not the middle of it.\n \"\"\"\n # first_row will be a list, so we need to check\n # that that list is not empty before proceeding.\n if not first_row:\n return first_row\n\n # The first element of this row is the one that could have the\n # BOM that we want to remove. Check that the first element is a\n # string before proceeding.\n if not isinstance(first_row[0], str):\n return first_row\n\n # Check that the string is not empty, as that would\n # obviously not have a BOM at the start of it.\n if not first_row[0]:\n return first_row\n\n # Since the string is non-empty, check that it does\n # in fact begin with a BOM.\n first_elt = first_row[0][0]\n if first_elt != _BOM:\n return first_row\n\n first_row = first_row[0]\n\n if len(first_row) > 1 and first_row[1] == self.quotechar:\n start = 2\n quote = first_row[1]\n end = first_row[2:].index(quote) + 2\n\n # Extract the data between the quotation marks\n new_row = first_row[start:end]\n\n # Extract any remaining data after the second\n # quotation mark.\n if len(first_row) > end + 1:\n new_row += first_row[end + 1:]\n return [new_row]\n elif len(first_row) > 1:\n return [first_row[1:]]\n else:\n # First row is just the BOM, so we\n # return an empty string.\n return [\"\"]"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _alert_malformed(self, msg, row_num):\n\n if self.error_bad_lines:\n raise ParserError(msg)\n elif self.warn_bad_lines:\n base = 'Skipping line {row_num}: '.format(row_num=row_num)\n sys.stderr.write(base + msg + '\\n')", "response": "A helper function to alert a user about a malformed row."} {"instruction": "Make a summary of the following Python 3 code\ndef _next_iter_line(self, row_num):\n\n try:\n return next(self.data)\n except csv.Error as e:\n if self.warn_bad_lines or self.error_bad_lines:\n msg = 
str(e)\n\n if 'NULL byte' in msg:\n msg = ('NULL byte detected. This byte '\n 'cannot be processed in Python\\'s '\n 'native csv library at the moment, '\n 'so please pass in engine=\\'c\\' instead')\n\n if self.skipfooter > 0:\n reason = ('Error could possibly be due to '\n 'parsing errors in the skipped footer rows '\n '(the skipfooter keyword is only applied '\n 'after Python\\'s csv library has parsed '\n 'all rows).')\n msg += '. ' + reason\n\n self._alert_malformed(msg, row_num)\n return None", "response": "Wrapper around iterating through self. data and checking for specific CSV errors."} {"instruction": "Given the following Python 3 function, write the documentation\ndef _remove_empty_lines(self, lines):\n\n ret = []\n for l in lines:\n # Remove empty lines and lines with only one whitespace value\n if (len(l) > 1 or len(l) == 1 and\n (not isinstance(l[0], str) or l[0].strip())):\n ret.append(l)\n return ret", "response": "Remove empty lines from the lines that are not empty and contain only one whitespace value."} {"instruction": "How would you explain what the following Python 3 function does\ndef _get_index_name(self, columns):\n orig_names = list(columns)\n columns = list(columns)\n\n try:\n line = self._next_line()\n except StopIteration:\n line = None\n\n try:\n next_line = self._next_line()\n except StopIteration:\n next_line = None\n\n # implicitly index_col=0 b/c 1 fewer column names\n implicit_first_cols = 0\n if line is not None:\n # leave it 0, #2442\n # Case 1\n if self.index_col is not False:\n implicit_first_cols = len(line) - self.num_original_columns\n\n # Case 0\n if next_line is not None:\n if len(next_line) == len(line) + self.num_original_columns:\n # column and index names on diff rows\n self.index_col = lrange(len(line))\n self.buf = self.buf[1:]\n\n for c in reversed(line):\n columns.insert(0, c)\n\n # Update list of original names to include all indices.\n orig_names = list(columns)\n self.num_original_columns = len(columns)\n 
return line, orig_names, columns\n\n if implicit_first_cols > 0:\n # Case 1\n self._implicit_index = True\n if self.index_col is None:\n self.index_col = lrange(implicit_first_cols)\n\n index_name = None\n\n else:\n # Case 2\n (index_name, columns_,\n self.index_col) = _clean_index_names(columns, self.index_col,\n self.unnamed_cols)\n\n return index_name, orig_names, columns", "response": "Get the index name of the current entry in the columns."} {"instruction": "Create a Python 3 function to\nread the rows from self. f and return them as a list of str.", "response": "def get_rows(self, infer_nrows, skiprows=None):\n \"\"\"\n Read rows from self.f, skipping as specified.\n\n We distinguish buffer_rows (the first <= infer_nrows\n lines) from the rows returned to detect_colspecs\n because it's simpler to leave the other locations\n with skiprows logic alone than to modify them to\n deal with the fact we skipped some rows here as\n well.\n\n Parameters\n ----------\n infer_nrows : int\n Number of rows to read from self.f, not counting\n rows that are skipped.\n skiprows: set, optional\n Indices of rows to skip.\n\n Returns\n -------\n detect_rows : list of str\n A list containing the rows to read.\n\n \"\"\"\n if skiprows is None:\n skiprows = set()\n buffer_rows = []\n detect_rows = []\n for i, row in enumerate(self.f):\n if i not in skiprows:\n detect_rows.append(row)\n buffer_rows.append(row)\n if len(detect_rows) >= infer_nrows:\n break\n self.buffer = iter(buffer_rows)\n return detect_rows"} {"instruction": "Can you write a function in Python 3 where it\ndetermines the URL corresponding to Python object", "response": "def linkcode_resolve(domain, info):\n \"\"\"\n Determine the URL corresponding to Python object\n \"\"\"\n if domain != 'py':\n return None\n\n modname = info['module']\n fullname = info['fullname']\n\n submod = sys.modules.get(modname)\n if submod is None:\n return None\n\n obj = submod\n for part in fullname.split('.'):\n try:\n obj = getattr(obj, 
part)\n except AttributeError:\n return None\n\n try:\n # inspect.unwrap() was added in Python version 3.4\n if sys.version_info >= (3, 5):\n fn = inspect.getsourcefile(inspect.unwrap(obj))\n else:\n fn = inspect.getsourcefile(obj)\n except TypeError:\n fn = None\n if not fn:\n return None\n\n try:\n source, lineno = inspect.getsourcelines(obj)\n except OSError:\n lineno = None\n\n if lineno:\n linespec = \"#L{:d}-L{:d}\".format(lineno, lineno + len(source) - 1)\n else:\n linespec = \"\"\n\n fn = os.path.relpath(fn, start=os.path.dirname(pandas.__file__))\n\n if '+' in pandas.__version__:\n return (\"http://github.com/pandas-dev/pandas/blob/master/pandas/\"\n \"{}{}\".format(fn, linespec))\n else:\n return (\"http://github.com/pandas-dev/pandas/blob/\"\n \"v{}/pandas/{}{}\".format(pandas.__version__, fn, linespec))"} {"instruction": "Write a Python 3 script to\nprocess the class docstring.", "response": "def process_class_docstrings(app, what, name, obj, options, lines):\n \"\"\"\n For those classes for which we use ::\n\n :template: autosummary/class_without_autosummary.rst\n\n the documented attributes/methods have to be listed in the class\n docstring. However, if one of those lists is empty, we use 'None',\n which then generates warnings in sphinx / ugly html output.\n This \"autodoc-process-docstring\" event connector removes that part\n from the processed docstring.\n\n \"\"\"\n if what == \"class\":\n joined = '\\n'.join(lines)\n\n templates = [\n \"\"\".. rubric:: Attributes\n\n.. autosummary::\n :toctree:\n\n None\n\"\"\",\n \"\"\".. rubric:: Methods\n\n.. 
autosummary::\n :toctree:\n\n None\n\"\"\"\n ]\n\n for template in templates:\n if template in joined:\n joined = joined.replace(template, '')\n lines[:] = joined.split('\\n')"} {"instruction": "Implement a function in Python 3 to\npack object o and write it to stream.", "response": "def pack(o, stream, **kwargs):\n \"\"\"\n Pack object `o` and write it to `stream`\n\n See :class:`Packer` for options.\n \"\"\"\n packer = Packer(**kwargs)\n stream.write(packer.pack(o))"} {"instruction": "Given the following Python 3 function, write the documentation\ndef get_mgr_concatenation_plan(mgr, indexers):\n # Calculate post-reindex shape , save for item axis which will be separate\n # for each block anyway.\n mgr_shape = list(mgr.shape)\n for ax, indexer in indexers.items():\n mgr_shape[ax] = len(indexer)\n mgr_shape = tuple(mgr_shape)\n\n if 0 in indexers:\n ax0_indexer = indexers.pop(0)\n blknos = algos.take_1d(mgr._blknos, ax0_indexer, fill_value=-1)\n blklocs = algos.take_1d(mgr._blklocs, ax0_indexer, fill_value=-1)\n else:\n\n if mgr._is_single_block:\n blk = mgr.blocks[0]\n return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))]\n\n ax0_indexer = None\n blknos = mgr._blknos\n blklocs = mgr._blklocs\n\n plan = []\n for blkno, placements in libinternals.get_blkno_placements(blknos,\n mgr.nblocks,\n group=False):\n\n assert placements.is_slice_like\n\n join_unit_indexers = indexers.copy()\n\n shape = list(mgr_shape)\n shape[0] = len(placements)\n shape = tuple(shape)\n\n if blkno == -1:\n unit = JoinUnit(None, shape)\n else:\n blk = mgr.blocks[blkno]\n ax0_blk_indexer = blklocs[placements.indexer]\n\n unit_no_ax0_reindexing = (len(placements) == len(blk.mgr_locs) and\n # Fastpath detection of join unit not\n # needing to reindex its block: no ax0\n # reindexing took place and block\n # placement was sequential before.\n ((ax0_indexer is None and\n blk.mgr_locs.is_slice_like and\n blk.mgr_locs.as_slice.step == 1) or\n # Slow-ish detection: all indexer locs\n # are 
sequential (and length match is\n # checked above).\n (np.diff(ax0_blk_indexer) == 1).all()))\n\n # Omit indexer if no item reindexing is required.\n if unit_no_ax0_reindexing:\n join_unit_indexers.pop(0, None)\n else:\n join_unit_indexers[0] = ax0_blk_indexer\n\n unit = JoinUnit(blk, shape, join_unit_indexers)\n\n plan.append((placements, unit))\n\n return plan", "response": "Construct a concatenation plan for given block manager and indexers."} {"instruction": "Make a summary of the following Python 3 code\ndef concatenate_join_units(join_units, concat_axis, copy):\n if concat_axis == 0 and len(join_units) > 1:\n # Concatenating join units along ax0 is handled in _merge_blocks.\n raise AssertionError(\"Concatenating join units along axis0\")\n\n empty_dtype, upcasted_na = get_empty_dtype_and_na(join_units)\n\n to_concat = [ju.get_reindexed_values(empty_dtype=empty_dtype,\n upcasted_na=upcasted_na)\n for ju in join_units]\n\n if len(to_concat) == 1:\n # Only one block, nothing to concatenate.\n concat_values = to_concat[0]\n if copy:\n if isinstance(concat_values, np.ndarray):\n # non-reindexed (=not yet copied) arrays are made into a view\n # in JoinUnit.get_reindexed_values\n if concat_values.base is not None:\n concat_values = concat_values.copy()\n else:\n concat_values = concat_values.copy()\n else:\n concat_values = _concat._concat_compat(to_concat, axis=concat_axis)\n\n return concat_values", "response": "Concatenate values from several join units along selected axis."} {"instruction": "Implement a Python 3 function for\nreturning the dtype and NA values to use when concatenating specified units.", "response": "def get_empty_dtype_and_na(join_units):\n \"\"\"\n Return dtype and N/A values to use when concatenating specified units.\n\n Returned N/A value may be None which means there was no casting involved.\n\n Returns\n -------\n dtype\n na\n \"\"\"\n if len(join_units) == 1:\n blk = join_units[0].block\n if blk is None:\n return np.float64, np.nan\n\n if 
is_uniform_reindex(join_units):\n # XXX: integrate property\n empty_dtype = join_units[0].block.dtype\n upcasted_na = join_units[0].block.fill_value\n return empty_dtype, upcasted_na\n\n has_none_blocks = False\n dtypes = [None] * len(join_units)\n for i, unit in enumerate(join_units):\n if unit.block is None:\n has_none_blocks = True\n else:\n dtypes[i] = unit.dtype\n\n upcast_classes = defaultdict(list)\n null_upcast_classes = defaultdict(list)\n for dtype, unit in zip(dtypes, join_units):\n if dtype is None:\n continue\n\n if is_categorical_dtype(dtype):\n upcast_cls = 'category'\n elif is_datetime64tz_dtype(dtype):\n upcast_cls = 'datetimetz'\n elif issubclass(dtype.type, np.bool_):\n upcast_cls = 'bool'\n elif issubclass(dtype.type, np.object_):\n upcast_cls = 'object'\n elif is_datetime64_dtype(dtype):\n upcast_cls = 'datetime'\n elif is_timedelta64_dtype(dtype):\n upcast_cls = 'timedelta'\n elif is_sparse(dtype):\n upcast_cls = dtype.subtype.name\n elif is_extension_array_dtype(dtype):\n upcast_cls = 'object'\n elif is_float_dtype(dtype) or is_numeric_dtype(dtype):\n upcast_cls = dtype.name\n else:\n upcast_cls = 'float'\n\n # Null blocks should not influence upcast class selection, unless there\n # are only null blocks, when same upcasting rules must be applied to\n # null upcast classes.\n if unit.is_na:\n null_upcast_classes[upcast_cls].append(dtype)\n else:\n upcast_classes[upcast_cls].append(dtype)\n\n if not upcast_classes:\n upcast_classes = null_upcast_classes\n\n # create the result\n if 'object' in upcast_classes:\n return np.dtype(np.object_), np.nan\n elif 'bool' in upcast_classes:\n if has_none_blocks:\n return np.dtype(np.object_), np.nan\n else:\n return np.dtype(np.bool_), None\n elif 'category' in upcast_classes:\n return np.dtype(np.object_), np.nan\n elif 'datetimetz' in upcast_classes:\n # GH-25014. 
We use NaT instead of iNaT, since this eventually\n # ends up in DatetimeArray.take, which does not allow iNaT.\n dtype = upcast_classes['datetimetz']\n return dtype[0], tslibs.NaT\n elif 'datetime' in upcast_classes:\n return np.dtype('M8[ns]'), tslibs.iNaT\n elif 'timedelta' in upcast_classes:\n return np.dtype('m8[ns]'), tslibs.iNaT\n else: # pragma\n try:\n g = np.find_common_type(upcast_classes, [])\n except TypeError:\n # At least one is an ExtensionArray\n return np.dtype(np.object_), np.nan\n else:\n if is_float_dtype(g):\n return g, g.type(np.nan)\n elif is_numeric_dtype(g):\n if has_none_blocks:\n return np.float64, np.nan\n else:\n return g, None\n\n msg = \"invalid dtype determination in get_concat_dtype\"\n raise AssertionError(msg)"} {"instruction": "Explain what the following Python 3 code does\ndef is_uniform_join_units(join_units):\n return (\n # all blocks need to have the same type\n all(type(ju.block) is type(join_units[0].block) for ju in join_units) and # noqa\n # no blocks that would get missing values (can lead to type upcasts)\n # unless we're an extension dtype.\n all(not ju.is_na or ju.block.is_extension for ju in join_units) and\n # no blocks with indexers (as then the dimensions do not fit)\n all(not ju.indexers for ju in join_units) and\n # disregard Panels\n all(ju.block.ndim <= 2 for ju in join_units) and\n # only use this path when there is something to concatenate\n len(join_units) > 1)", "response": "Check if the join units consist of blocks of uniform type that can be concatenated using Block. concat_same_type instead of Block. 
concat_compat."} {"instruction": "Implement a Python 3 function for\ntrimming a join unit to a given length.", "response": "def trim_join_unit(join_unit, length):\n \"\"\"\n Reduce join_unit's shape along item axis to length.\n\n Extra items that didn't fit are returned as a separate block.\n \"\"\"\n\n if 0 not in join_unit.indexers:\n extra_indexers = join_unit.indexers\n\n if join_unit.block is None:\n extra_block = None\n else:\n extra_block = join_unit.block.getitem_block(slice(length, None))\n join_unit.block = join_unit.block.getitem_block(slice(length))\n else:\n extra_block = join_unit.block\n\n extra_indexers = copy.copy(join_unit.indexers)\n extra_indexers[0] = extra_indexers[0][length:]\n join_unit.indexers[0] = join_unit.indexers[0][:length]\n\n extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:]\n join_unit.shape = (length,) + join_unit.shape[1:]\n\n return JoinUnit(block=extra_block, indexers=extra_indexers,\n shape=extra_shape)"} {"instruction": "Make a summary of the following Python 3 code\ndef combine_concat_plans(plans, concat_axis):\n if len(plans) == 1:\n for p in plans[0]:\n yield p[0], [p[1]]\n\n elif concat_axis == 0:\n offset = 0\n for plan in plans:\n last_plc = None\n\n for plc, unit in plan:\n yield plc.add(offset), [unit]\n last_plc = plc\n\n if last_plc is not None:\n offset += last_plc.as_slice.stop\n\n else:\n num_ended = [0]\n\n def _next_or_none(seq):\n retval = next(seq, None)\n if retval is None:\n num_ended[0] += 1\n return retval\n\n plans = list(map(iter, plans))\n next_items = list(map(_next_or_none, plans))\n\n while num_ended[0] != len(next_items):\n if num_ended[0] > 0:\n raise ValueError(\"Plan shapes are not aligned\")\n\n placements, units = zip(*next_items)\n\n lengths = list(map(len, placements))\n min_len, max_len = min(lengths), max(lengths)\n\n if min_len == max_len:\n yield placements[0], units\n next_items[:] = map(_next_or_none, plans)\n else:\n yielded_placement = None\n yielded_units = [None] * 
len(next_items)\n for i, (plc, unit) in enumerate(next_items):\n yielded_units[i] = unit\n if len(plc) > min_len:\n # trim_join_unit updates unit in place, so only\n # placement needs to be sliced to skip min_len.\n next_items[i] = (plc[min_len:],\n trim_join_unit(unit, min_len))\n else:\n yielded_placement = plc\n next_items[i] = _next_or_none(plans[i])\n\n yield yielded_placement, yielded_units", "response": "Combine multiple concatenation plans into one."} {"instruction": "How would you explain what the following Python 3 function does\ndef use(self, key, value):\n old_value = self[key]\n try:\n self[key] = value\n yield self\n finally:\n self[key] = old_value", "response": "Temporarily set a parameter value using the with statement."} {"instruction": "Make a summary of the following Python 3 code\ndef _stata_elapsed_date_to_datetime_vec(dates, fmt):\n MIN_YEAR, MAX_YEAR = Timestamp.min.year, Timestamp.max.year\n MAX_DAY_DELTA = (Timestamp.max - datetime.datetime(1960, 1, 1)).days\n MIN_DAY_DELTA = (Timestamp.min - datetime.datetime(1960, 1, 1)).days\n MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000\n MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000\n\n def convert_year_month_safe(year, month):\n \"\"\"\n Convert year and month to datetimes, using pandas vectorized versions\n when the date range falls within the range supported by pandas.\n Otherwise it falls back to a slower but more robust method\n using datetime.\n \"\"\"\n if year.max() < MAX_YEAR and year.min() > MIN_YEAR:\n return to_datetime(100 * year + month, format='%Y%m')\n else:\n index = getattr(year, 'index', None)\n return Series(\n [datetime.datetime(y, m, 1) for y, m in zip(year, month)],\n index=index)\n\n def convert_year_days_safe(year, days):\n \"\"\"\n Converts year (e.g. 
1999) and days since the start of the year to a\n datetime or datetime64 Series\n \"\"\"\n if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR:\n return (to_datetime(year, format='%Y') +\n to_timedelta(days, unit='d'))\n else:\n index = getattr(year, 'index', None)\n value = [datetime.datetime(y, 1, 1) + relativedelta(days=int(d))\n for y, d in zip(year, days)]\n return Series(value, index=index)\n\n def convert_delta_safe(base, deltas, unit):\n \"\"\"\n Convert base dates and deltas to datetimes, using pandas vectorized\n versions if the deltas satisfy restrictions required to be expressed\n as dates in pandas.\n \"\"\"\n index = getattr(deltas, 'index', None)\n if unit == 'd':\n if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA:\n values = [base + relativedelta(days=int(d)) for d in deltas]\n return Series(values, index=index)\n elif unit == 'ms':\n if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA:\n values = [base + relativedelta(microseconds=(int(d) * 1000))\n for d in deltas]\n return Series(values, index=index)\n else:\n raise ValueError('format not understood')\n base = to_datetime(base)\n deltas = to_timedelta(deltas, unit=unit)\n return base + deltas\n\n # TODO: If/when pandas supports more than datetime64[ns], this should be\n # improved to use correct range, e.g. datetime[Y] for yearly\n bad_locs = np.isnan(dates)\n has_bad_values = False\n if bad_locs.any():\n has_bad_values = True\n data_col = Series(dates)\n data_col[bad_locs] = 1.0 # Replace with NaT\n dates = dates.astype(np.int64)\n\n if fmt.startswith((\"%tc\", \"tc\")): # Delta ms relative to base\n base = stata_epoch\n ms = dates\n conv_dates = convert_delta_safe(base, ms, 'ms')\n elif fmt.startswith((\"%tC\", \"tC\")):\n\n warnings.warn(\"Encountered %tC format. 
Leaving in Stata \"\n \"Internal Format.\")\n conv_dates = Series(dates, dtype=np.object)\n if has_bad_values:\n conv_dates[bad_locs] = NaT\n return conv_dates\n # Delta days relative to base\n elif fmt.startswith((\"%td\", \"td\", \"%d\", \"d\")):\n base = stata_epoch\n days = dates\n conv_dates = convert_delta_safe(base, days, 'd')\n # does not count leap days - 7 days is a week.\n # 52nd week may have more than 7 days\n elif fmt.startswith((\"%tw\", \"tw\")):\n year = stata_epoch.year + dates // 52\n days = (dates % 52) * 7\n conv_dates = convert_year_days_safe(year, days)\n elif fmt.startswith((\"%tm\", \"tm\")): # Delta months relative to base\n year = stata_epoch.year + dates // 12\n month = (dates % 12) + 1\n conv_dates = convert_year_month_safe(year, month)\n elif fmt.startswith((\"%tq\", \"tq\")): # Delta quarters relative to base\n year = stata_epoch.year + dates // 4\n month = (dates % 4) * 3 + 1\n conv_dates = convert_year_month_safe(year, month)\n elif fmt.startswith((\"%th\", \"th\")): # Delta half-years relative to base\n year = stata_epoch.year + dates // 2\n month = (dates % 2) * 6 + 1\n conv_dates = convert_year_month_safe(year, month)\n elif fmt.startswith((\"%ty\", \"ty\")): # Years -- not delta\n year = dates\n month = np.ones_like(dates)\n conv_dates = convert_year_month_safe(year, month)\n else:\n raise ValueError(\"Date fmt {fmt} not understood\".format(fmt=fmt))\n\n if has_bad_values: # Restore NaT for bad values\n conv_dates[bad_locs] = NaT\n\n return conv_dates", "response": "Convert from SIF to datetime."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _datetime_to_stata_elapsed_vec(dates, fmt):\n index = dates.index\n NS_PER_DAY = 24 * 3600 * 1000 * 1000 * 1000\n US_PER_DAY = NS_PER_DAY / 1000\n\n def parse_dates_safe(dates, delta=False, year=False, days=False):\n d = {}\n if is_datetime64_dtype(dates.values):\n if delta:\n delta = dates - stata_epoch\n d['delta'] = delta.values.astype(\n np.int64) // 
1000 # microseconds\n if days or year:\n dates = DatetimeIndex(dates)\n d['year'], d['month'] = dates.year, dates.month\n if days:\n days = (dates.astype(np.int64) -\n to_datetime(d['year'], format='%Y').astype(np.int64))\n d['days'] = days // NS_PER_DAY\n\n elif infer_dtype(dates, skipna=False) == 'datetime':\n if delta:\n delta = dates.values - stata_epoch\n f = lambda x: \\\n US_PER_DAY * x.days + 1000000 * x.seconds + x.microseconds\n v = np.vectorize(f)\n d['delta'] = v(delta)\n if year:\n year_month = dates.apply(lambda x: 100 * x.year + x.month)\n d['year'] = year_month.values // 100\n d['month'] = (year_month.values - d['year'] * 100)\n if days:\n f = lambda x: (x - datetime.datetime(x.year, 1, 1)).days\n v = np.vectorize(f)\n d['days'] = v(dates)\n else:\n raise ValueError('Columns containing dates must contain either '\n 'datetime64, datetime.datetime or null values.')\n\n return DataFrame(d, index=index)\n\n bad_loc = isna(dates)\n index = dates.index\n if bad_loc.any():\n dates = Series(dates)\n if is_datetime64_dtype(dates):\n dates[bad_loc] = to_datetime(stata_epoch)\n else:\n dates[bad_loc] = stata_epoch\n\n if fmt in [\"%tc\", \"tc\"]:\n d = parse_dates_safe(dates, delta=True)\n conv_dates = d.delta / 1000\n elif fmt in [\"%tC\", \"tC\"]:\n warnings.warn(\"Stata Internal Format tC not supported.\")\n conv_dates = dates\n elif fmt in [\"%td\", \"td\"]:\n d = parse_dates_safe(dates, delta=True)\n conv_dates = d.delta // US_PER_DAY\n elif fmt in [\"%tw\", \"tw\"]:\n d = parse_dates_safe(dates, year=True, days=True)\n conv_dates = (52 * (d.year - stata_epoch.year) + d.days // 7)\n elif fmt in [\"%tm\", \"tm\"]:\n d = parse_dates_safe(dates, year=True)\n conv_dates = (12 * (d.year - stata_epoch.year) + d.month - 1)\n elif fmt in [\"%tq\", \"tq\"]:\n d = parse_dates_safe(dates, year=True)\n conv_dates = 4 * (d.year - stata_epoch.year) + (d.month - 1) // 3\n elif fmt in [\"%th\", \"th\"]:\n d = parse_dates_safe(dates, year=True)\n conv_dates = (2 * (d.year 
- stata_epoch.year) +\n (d.month > 6).astype(np.int))\n elif fmt in [\"%ty\", \"ty\"]:\n d = parse_dates_safe(dates, year=True)\n conv_dates = d.year\n else:\n raise ValueError(\n \"Format {fmt} is not a known Stata date format\".format(fmt=fmt))\n\n conv_dates = Series(conv_dates, dtype=np.float64)\n missing_value = struct.unpack('= 2 ** 53:\n ws = precision_loss_doc % ('uint64', 'float64')\n\n data[col] = data[col].astype(dtype)\n\n # Check values and upcast if necessary\n if dtype == np.int8:\n if data[col].max() > 100 or data[col].min() < -127:\n data[col] = data[col].astype(np.int16)\n elif dtype == np.int16:\n if data[col].max() > 32740 or data[col].min() < -32767:\n data[col] = data[col].astype(np.int32)\n elif dtype == np.int64:\n if (data[col].max() <= 2147483620 and\n data[col].min() >= -2147483647):\n data[col] = data[col].astype(np.int32)\n else:\n data[col] = data[col].astype(np.float64)\n if data[col].max() >= 2 ** 53 or data[col].min() <= -2 ** 53:\n ws = precision_loss_doc % ('int64', 'float64')\n elif dtype in (np.float32, np.float64):\n value = data[col].max()\n if np.isinf(value):\n raise ValueError('Column {col} has a maximum value of '\n 'infinity which is outside the range '\n 'supported by Stata.'.format(col=col))\n if dtype == np.float32 and value > float32_max:\n data[col] = data[col].astype(np.float64)\n elif dtype == np.float64:\n if value > float64_max:\n raise ValueError('Column {col} has a maximum value '\n '({val}) outside the range supported by '\n 'Stata ({float64_max})'\n .format(col=col, val=value,\n float64_max=float64_max))\n\n if ws:\n warnings.warn(ws, PossiblePrecisionLoss)\n\n return data", "response": "Checks the dtypes of the columns of a pandas DataFrame for compatibility with the data types and ranges supported by Stata and converts them to the appropriate types."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _dtype_to_stata_type(dtype, column):\n # TODO: expand to handle 
datetime to integer conversion\n if dtype.type == np.object_: # try to coerce it to the biggest string\n # not memory efficient, what else could we\n # do?\n itemsize = max_len_string_array(ensure_object(column.values))\n return max(itemsize, 1)\n elif dtype == np.float64:\n return 255\n elif dtype == np.float32:\n return 254\n elif dtype == np.int32:\n return 253\n elif dtype == np.int16:\n return 252\n elif dtype == np.int8:\n return 251\n else: # pragma : no cover\n raise NotImplementedError(\n \"Data type {dtype} not supported.\".format(dtype=dtype))", "response": "Convert dtype types to stata types."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _dtype_to_default_stata_fmt(dtype, column, dta_version=114,\n force_strl=False):\n \"\"\"\n Map numpy dtype to stata's default format for this type. Not terribly\n important since users can change this in Stata. Semantics are\n\n object -> \"%DDs\" where DD is the length of the string. If not a string,\n raise ValueError\n float64 -> \"%10.0g\"\n float32 -> \"%9.0g\"\n int64 -> \"%9.0g\"\n int32 -> \"%12.0g\"\n int16 -> \"%8.0g\"\n int8 -> \"%8.0g\"\n strl -> \"%9s\"\n \"\"\"\n # TODO: Refactor to combine type with format\n # TODO: expand this to handle a default datetime format?\n if dta_version < 117:\n max_str_len = 244\n else:\n max_str_len = 2045\n if force_strl:\n return '%9s'\n if dtype.type == np.object_:\n inferred_dtype = infer_dtype(column, skipna=True)\n if not (inferred_dtype in ('string', 'unicode') or\n len(column) == 0):\n raise ValueError('Column `{col}` cannot be exported.\\n\\nOnly '\n 'string-like object arrays containing all '\n 'strings or a mix of strings and None can be '\n 'exported. Object arrays containing only null '\n 'values are prohibited. 
Other object types'\n 'cannot be exported and must first be converted '\n 'to one of the supported '\n 'types.'.format(col=column.name))\n itemsize = max_len_string_array(ensure_object(column.values))\n if itemsize > max_str_len:\n if dta_version >= 117:\n return '%9s'\n else:\n raise ValueError(excessive_string_length_error % column.name)\n return \"%\" + str(max(itemsize, 1)) + \"s\"\n elif dtype == np.float64:\n return \"%10.0g\"\n elif dtype == np.float32:\n return \"%9.0g\"\n elif dtype == np.int32:\n return \"%12.0g\"\n elif dtype == np.int8 or dtype == np.int16:\n return \"%8.0g\"\n else: # pragma : no cover\n raise NotImplementedError(\n \"Data type {dtype} not supported.\".format(dtype=dtype))", "response": "Convert numpy dtype to default stata format."} {"instruction": "Implement a function in Python 3 to\ntake a bytes instance and pads it with null bytes until it s length chars.", "response": "def _pad_bytes_new(name, length):\n \"\"\"\n Takes a bytes instance and pads it with null bytes until it's length chars.\n \"\"\"\n if isinstance(name, str):\n name = bytes(name, 'utf-8')\n return name + b'\\x00' * (length - len(name))"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef generate_value_label(self, byteorder, encoding):\n\n self._encoding = encoding\n bio = BytesIO()\n null_string = '\\x00'\n null_byte = b'\\x00'\n\n # len\n bio.write(struct.pack(byteorder + 'i', self.len))\n\n # labname\n labname = self._encode(_pad_bytes(self.labname[:32], 33))\n bio.write(labname)\n\n # padding - 3 bytes\n for i in range(3):\n bio.write(struct.pack('c', null_byte))\n\n # value_label_table\n # n - int32\n bio.write(struct.pack(byteorder + 'i', self.n))\n\n # textlen - int32\n bio.write(struct.pack(byteorder + 'i', self.text_len))\n\n # off - int32 array (n elements)\n for offset in self.off:\n bio.write(struct.pack(byteorder + 'i', offset))\n\n # val - int32 array (n elements)\n for value in self.val:\n bio.write(struct.pack(byteorder 
+ 'i', value))\n\n # txt - Text labels, null terminated\n for text in self.txt:\n bio.write(self._encode(text + null_string))\n\n bio.seek(0)\n return bio.read()", "response": "Generates a value label for a specific locale."} {"instruction": "Write a Python 3 function for\nmapping between numpy and state dtypes", "response": "def _setup_dtype(self):\n \"\"\"Map between numpy and state dtypes\"\"\"\n if self._dtype is not None:\n return self._dtype\n\n dtype = [] # Convert struct data types to numpy data type\n for i, typ in enumerate(self.typlist):\n if typ in self.NUMPY_TYPE_MAP:\n dtype.append(('s' + str(i), self.byteorder +\n self.NUMPY_TYPE_MAP[typ]))\n else:\n dtype.append(('s' + str(i), 'S' + str(typ)))\n dtype = np.dtype(dtype)\n self._dtype = dtype\n\n return self._dtype"} {"instruction": "Write a Python 3 script to\nconvert categorical columns to categorical columns.", "response": "def _do_convert_categoricals(self, data, value_label_dict, lbllist,\n order_categoricals):\n \"\"\"\n Converts categorical columns to Categorical type.\n \"\"\"\n value_labels = list(value_label_dict.keys())\n cat_converted_data = []\n for col, label in zip(data, lbllist):\n if label in value_labels:\n # Explicit call with ordered=True\n cat_data = Categorical(data[col], ordered=order_categoricals)\n categories = []\n for category in cat_data.categories:\n if category in value_label_dict[label]:\n categories.append(value_label_dict[label][category])\n else:\n categories.append(category) # Partially labeled\n try:\n cat_data.categories = categories\n except ValueError:\n vc = Series(categories).value_counts()\n repeats = list(vc.index[vc > 1])\n repeats = '-' * 80 + '\\n' + '\\n'.join(repeats)\n # GH 25772\n msg = \"\"\"\nValue labels for column {col} are not unique. 
These cannot be converted to\npandas categoricals.\n\nEither read the file with `convert_categoricals` set to False or use the\nlow level interface in `StataReader` to separately read the values and the\nvalue_labels.\n\nThe repeated labels are:\n{repeats}\n\"\"\"\n raise ValueError(msg.format(col=col, repeats=repeats))\n # TODO: is the next line needed above in the data(...) method?\n cat_data = Series(cat_data, index=data.index)\n cat_converted_data.append((col, cat_data))\n else:\n cat_converted_data.append((col, data[col]))\n data = DataFrame.from_dict(OrderedDict(cat_converted_data))\n return data"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _write(self, to_write):\n self._file.write(to_write.encode(self._encoding or\n self._default_encoding))", "response": "Helper to call encode before writing to file for Python 3 compat."} {"instruction": "Make a summary of the following Python 3 code\ndef _prepare_categoricals(self, data):\n\n is_cat = [is_categorical_dtype(data[col]) for col in data]\n self._is_col_cat = is_cat\n self._value_labels = []\n if not any(is_cat):\n return data\n\n get_base_missing_value = StataMissingValue.get_base_missing_value\n data_formatted = []\n for col, col_is_cat in zip(data, is_cat):\n if col_is_cat:\n self._value_labels.append(StataValueLabel(data[col]))\n dtype = data[col].cat.codes.dtype\n if dtype == np.int64:\n raise ValueError('It is not possible to export '\n 'int64-based categorical data to Stata.')\n values = data[col].cat.codes.values.copy()\n\n # Upcast if needed so that correct missing values can be set\n if values.max() >= get_base_missing_value(dtype):\n if dtype == np.int8:\n dtype = np.int16\n elif dtype == np.int16:\n dtype = np.int32\n else:\n dtype = np.float64\n values = np.array(values, dtype=dtype)\n\n # Replace missing values with Stata missing value for type\n values[values == -1] = get_base_missing_value(dtype)\n data_formatted.append((col, values))\n else:\n 
data_formatted.append((col, data[col]))\n return DataFrame.from_dict(OrderedDict(data_formatted))", "response": "Check for categorical columns retain categorical information for\n Stata file and convert categorical data to int"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _replace_nans(self, data):\n # return data\n \"\"\"Checks floating point data columns for nans, and replaces these with\n the generic Stata for missing value (.)\"\"\"\n for c in data:\n dtype = data[c].dtype\n if dtype in (np.float32, np.float64):\n if dtype == np.float32:\n replacement = self.MISSING_VALUES['f']\n else:\n replacement = self.MISSING_VALUES['d']\n data[c] = data[c].fillna(replacement)\n\n return data", "response": "Replaces missing values in the data columns with their respective values."} {"instruction": "Explain what the following Python 3 code does\ndef _check_column_names(self, data):\n converted_names = {}\n columns = list(data.columns)\n original_columns = columns[:]\n\n duplicate_var_id = 0\n for j, name in enumerate(columns):\n orig_name = name\n if not isinstance(name, str):\n name = str(name)\n\n for c in name:\n if ((c < 'A' or c > 'Z') and (c < 'a' or c > 'z') and\n (c < '0' or c > '9') and c != '_'):\n name = name.replace(c, '_')\n\n # Variable name must not be a reserved word\n if name in self.RESERVED_WORDS:\n name = '_' + name\n\n # Variable name may not start with a number\n if name[0] >= '0' and name[0] <= '9':\n name = '_' + name\n\n name = name[:min(len(name), 32)]\n\n if not name == orig_name:\n # check for duplicates\n while columns.count(name) > 0:\n # prepend ascending number to avoid duplicates\n name = '_' + str(duplicate_var_id) + name\n name = name[:min(len(name), 32)]\n duplicate_var_id += 1\n converted_names[orig_name] = name\n\n columns[j] = name\n\n data.columns = columns\n\n # Check date conversion, and fix key if needed\n if self._convert_dates:\n for c, o in zip(columns, original_columns):\n if c != 
o:\n self._convert_dates[c] = self._convert_dates[o]\n del self._convert_dates[o]\n\n if converted_names:\n conversion_warning = []\n for orig_name, name in converted_names.items():\n # need to possibly encode the orig name if its unicode\n try:\n orig_name = orig_name.encode('utf-8')\n except (UnicodeDecodeError, AttributeError):\n pass\n msg = '{0} -> {1}'.format(orig_name, name)\n conversion_warning.append(msg)\n\n ws = invalid_name_doc.format('\\n '.join(conversion_warning))\n warnings.warn(ws, InvalidColumnName)\n\n self._converted_names = converted_names\n self._update_strl_names()\n\n return data", "response": "Checks that the column names of the data dictionary are valid and that they are valid."} {"instruction": "Write a Python 3 script to\nclose the file if it was created by the writer.", "response": "def _close(self):\n \"\"\"\n Close the file if it was created by the writer.\n\n If a buffer or file-like object was passed in, for example a GzipFile,\n then leave this file open for the caller to close. In either case,\n attempt to flush the file contents to ensure they are written to disk\n (if supported)\n \"\"\"\n # Some file-like objects might not support flush\n try:\n self._file.flush()\n except AttributeError:\n pass\n if self._own_file:\n self._file.close()"} {"instruction": "Write a Python 3 function that can\ngenerate the GSO lookup table for the DataFRame", "response": "def generate_table(self):\n \"\"\"\n Generates the GSO lookup table for the DataFRame\n\n Returns\n -------\n gso_table : OrderedDict\n Ordered dictionary using the string found as keys\n and their lookup position (v,o) as values\n gso_df : DataFrame\n DataFrame where strl columns have been converted to\n (v,o) values\n\n Notes\n -----\n Modifies the DataFrame in-place.\n\n The DataFrame returned encodes the (v,o) values as uint64s. 
The\n encoding depends on teh dta version, and can be expressed as\n\n enc = v + o * 2 ** (o_size * 8)\n\n so that v is stored in the lower bits and o is in the upper\n bits. o_size is\n\n * 117: 4\n * 118: 6\n * 119: 5\n \"\"\"\n\n gso_table = self._gso_table\n gso_df = self.df\n columns = list(gso_df.columns)\n selected = gso_df[self.columns]\n col_index = [(col, columns.index(col)) for col in self.columns]\n keys = np.empty(selected.shape, dtype=np.uint64)\n for o, (idx, row) in enumerate(selected.iterrows()):\n for j, (col, v) in enumerate(col_index):\n val = row[col]\n # Allow columns with mixed str and None (GH 23633)\n val = '' if val is None else val\n key = gso_table.get(val, None)\n if key is None:\n # Stata prefers human numbers\n key = (v + 1, o + 1)\n gso_table[val] = key\n keys[o, j] = self._convert_key(key)\n for i, col in enumerate(self.columns):\n gso_df[col] = keys[:, i]\n\n return gso_table, gso_df"} {"instruction": "Implement a Python 3 function for\ngenerating the binary blob of GSOs that are written to the dta file.", "response": "def generate_blob(self, gso_table):\n \"\"\"\n Generates the binary blob of GSOs that is written to the dta file.\n\n Parameters\n ----------\n gso_table : OrderedDict\n Ordered dictionary (str, vo)\n\n Returns\n -------\n gso : bytes\n Binary content of dta file to be placed between strl tags\n\n Notes\n -----\n Output format depends on dta version. 
117 uses two uint32s to\n express v and o while 118+ uses a uint32 for v and a uint64 for o.\n \"\"\"\n # Format information\n # Length includes null term\n # 117\n # GSOvvvvooootllllxxxxxxxxxxxxxxx...x\n # 3 u4 u4 u1 u4 string + null term\n #\n # 118, 119\n # GSOvvvvooooooootllllxxxxxxxxxxxxxxx...x\n # 3 u4 u8 u1 u4 string + null term\n\n bio = BytesIO()\n gso = bytes('GSO', 'ascii')\n gso_type = struct.pack(self._byteorder + 'B', 130)\n null = struct.pack(self._byteorder + 'B', 0)\n v_type = self._byteorder + self._gso_v_type\n o_type = self._byteorder + self._gso_o_type\n len_type = self._byteorder + 'I'\n for strl, vo in gso_table.items():\n if vo == (0, 0):\n continue\n v, o = vo\n\n # GSO\n bio.write(gso)\n\n # vvvv\n bio.write(struct.pack(v_type, v))\n\n # oooo / oooooooo\n bio.write(struct.pack(o_type, o))\n\n # t\n bio.write(gso_type)\n\n # llll\n utf8_string = bytes(strl, 'utf-8')\n bio.write(struct.pack(len_type, len(utf8_string) + 1))\n\n # xxx...xxx\n bio.write(utf8_string)\n bio.write(null)\n\n bio.seek(0)\n return bio.read()"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _tag(val, tag):\n if isinstance(val, str):\n val = bytes(val, 'utf-8')\n return (bytes('<' + tag + '>', 'utf-8') + val +\n bytes('', 'utf-8'))", "response": "Surround val with tag"} {"instruction": "Explain what the following Python 3 code does\ndef _write_header(self, data_label=None, time_stamp=None):\n byteorder = self._byteorder\n self._file.write(bytes('', 'utf-8'))\n bio = BytesIO()\n # ds_format - 117\n bio.write(self._tag(bytes('117', 'utf-8'), 'release'))\n # byteorder\n bio.write(self._tag(byteorder == \">\" and \"MSF\" or \"LSF\", 'byteorder'))\n # number of vars, 2 bytes\n assert self.nvar < 2 ** 16\n bio.write(self._tag(struct.pack(byteorder + \"H\", self.nvar), 'K'))\n # number of obs, 4 bytes\n bio.write(self._tag(struct.pack(byteorder + \"I\", self.nobs), 'N'))\n # data label 81 bytes, char, null terminated\n label = 
data_label[:80] if data_label is not None else ''\n label_len = struct.pack(byteorder + \"B\", len(label))\n label = label_len + bytes(label, 'utf-8')\n bio.write(self._tag(label, 'label'))\n # time stamp, 18 bytes, char, null terminated\n # format dd Mon yyyy hh:mm\n if time_stamp is None:\n time_stamp = datetime.datetime.now()\n elif not isinstance(time_stamp, datetime.datetime):\n raise ValueError(\"time_stamp should be datetime type\")\n # Avoid locale-specific month conversion\n months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug',\n 'Sep', 'Oct', 'Nov', 'Dec']\n month_lookup = {i + 1: month for i, month in enumerate(months)}\n ts = (time_stamp.strftime(\"%d \") +\n month_lookup[time_stamp.month] +\n time_stamp.strftime(\" %Y %H:%M\"))\n # '\\x11' added due to inspection of Stata file\n ts = b'\\x11' + bytes(ts, 'utf8')\n bio.write(self._tag(ts, 'timestamp'))\n bio.seek(0)\n self._file.write(self._tag(bio.read(), 'header'))", "response": "Write the header of the Stata file."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _write_map(self):\n if self._map is None:\n self._map = OrderedDict((('stata_data', 0),\n ('map', self._file.tell()),\n ('variable_types', 0),\n ('varnames', 0),\n ('sortlist', 0),\n ('formats', 0),\n ('value_label_names', 0),\n ('variable_labels', 0),\n ('characteristics', 0),\n ('data', 0),\n ('strls', 0),\n ('value_labels', 0),\n ('stata_data_close', 0),\n ('end-of-file', 0)))\n # Move to start of map\n self._file.seek(self._map['map'])\n bio = BytesIO()\n for val in self._map.values():\n bio.write(struct.pack(self._byteorder + 'Q', val))\n bio.seek(0)\n self._file.write(self._tag(bio.read(), 'map'))", "response": "Called twice during file write."} {"instruction": "Create a Python 3 function to\nupdate column names for conversion to strl if they have been changed to comply with Stata naming rules", "response": "def _update_strl_names(self):\n \"\"\"Update column names for conversion 
to strl if they might have been\n changed to comply with Stata naming rules\"\"\"\n # Update convert_strl if names changed\n for orig, new in self._converted_names.items():\n if orig in self._convert_strl:\n idx = self._convert_strl.index(orig)\n self._convert_strl[idx] = new"} {"instruction": "Write a Python 3 script for\nconverting columns to StrLs if either very large or in the convert_strl variable", "response": "def _convert_strls(self, data):\n \"\"\"Convert columns to StrLs if either very large or in the\n convert_strl variable\"\"\"\n convert_cols = [\n col for i, col in enumerate(data)\n if self.typlist[i] == 32768 or col in self._convert_strl]\n\n if convert_cols:\n ssw = StataStrLWriter(data, convert_cols)\n tab, new_data = ssw.generate_table()\n data = new_data\n self._strl_blob = ssw.generate_blob(tab)\n return data"} {"instruction": "Create a Python 3 function to\nregister Pandas Formatters and Converters with matplotlib.", "response": "def register(explicit=True):\n \"\"\"\n Register Pandas Formatters and Converters with matplotlib\n\n This function modifies the global ``matplotlib.units.registry``\n dictionary. Pandas adds custom converters for\n\n * pd.Timestamp\n * pd.Period\n * np.datetime64\n * datetime.datetime\n * datetime.date\n * datetime.time\n\n See Also\n --------\n deregister_matplotlib_converter\n \"\"\"\n # Renamed in pandas.plotting.__init__\n global _WARN\n\n if explicit:\n _WARN = False\n\n pairs = get_pairs()\n for type_, cls in pairs:\n converter = cls()\n if type_ in units.registry:\n previous = units.registry[type_]\n _mpl_units[type_] = previous\n units.registry[type_] = converter"} {"instruction": "Can you write a function in Python 3 where it\nremoves pandas' formatters and converters Removes the custom converters added by :func:`register`. This attempts to set the state of the registry back to the state before pandas registered its own units. Converters for pandas' own types like Timestamp and Period are removed completely. 
Converters for types pandas overwrites, like ``datetime.datetime``, are restored to their original value. See Also -------- deregister_matplotlib_converters", "response": "def deregister():\n \"\"\"\n Remove pandas' formatters and converters\n\n Removes the custom converters added by :func:`register`. This\n attempts to set the state of the registry back to the state before\n pandas registered its own units. Converters for pandas' own types like\n Timestamp and Period are removed completely. Converters for types\n pandas overwrites, like ``datetime.datetime``, are restored to their\n original value.\n\n See Also\n --------\n deregister_matplotlib_converters\n \"\"\"\n # Renamed in pandas.plotting.__init__\n for type_, cls in get_pairs():\n # We use type to catch our classes directly, no inheritance\n if type(units.registry.get(type_)) is cls:\n units.registry.pop(type_)\n\n # restore the old keys\n for unit, formatter in _mpl_units.items():\n if type(formatter) not in {DatetimeConverter, PeriodConverter,\n TimeConverter}:\n # make it idempotent by excluding ours.\n units.registry[unit] = formatter"} {"instruction": "Implement a function in Python 3 to\nconvert a datetime to the Gregorian date as a float days nanos seconds and microseconds.", "response": "def _dt_to_float_ordinal(dt):\n \"\"\"\n Convert :mod:`datetime` to the Gregorian date as UTC float days,\n preserving hours, minutes, seconds and microseconds. 
Return value\n is a :func:`float`.\n \"\"\"\n if (isinstance(dt, (np.ndarray, Index, ABCSeries)\n ) and is_datetime64_ns_dtype(dt)):\n base = dates.epoch2num(dt.asi8 / 1.0E9)\n else:\n base = dates.date2num(dt)\n return base"} {"instruction": "Make a summary of the following Python 3 code\ndef _get_default_annual_spacing(nyears):\n if nyears < 11:\n (min_spacing, maj_spacing) = (1, 1)\n elif nyears < 20:\n (min_spacing, maj_spacing) = (1, 2)\n elif nyears < 50:\n (min_spacing, maj_spacing) = (1, 5)\n elif nyears < 100:\n (min_spacing, maj_spacing) = (5, 10)\n elif nyears < 200:\n (min_spacing, maj_spacing) = (5, 25)\n elif nyears < 600:\n (min_spacing, maj_spacing) = (10, 50)\n else:\n factor = nyears // 1000 + 1\n (min_spacing, maj_spacing) = (factor * 20, factor * 100)\n return (min_spacing, maj_spacing)", "response": "Returns a default spacing between consecutive ticks for annual data."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef period_break(dates, period):\n current = getattr(dates, period)\n previous = getattr(dates - 1 * dates.freq, period)\n return np.nonzero(current - previous)[0]", "response": "Returns the indices where the given period changes."} {"instruction": "Implement a Python 3 function for\nreturning True if the label_flags indicate there is at least one label for this level.", "response": "def has_level_label(label_flags, vmin):\n \"\"\"\n Returns true if the ``label_flags`` indicate there is at least one label\n for this level.\n\n if the minimum view limit is not an exact integer, then the first tick\n label won't be shown, so we must adjust for that.\n \"\"\"\n if label_flags.size == 0 or (label_flags.size == 1 and\n label_flags[0] == 0 and\n vmin % 1 > 0.0):\n return False\n else:\n return True"} {"instruction": "How would you explain what the following Python 3 function does\ndef axisinfo(unit, axis):\n tz = unit\n\n majloc = PandasAutoDateLocator(tz=tz)\n majfmt = 
PandasAutoDateFormatter(majloc, tz=tz)\n datemin = pydt.date(2000, 1, 1)\n datemax = pydt.date(2010, 1, 1)\n\n return units.AxisInfo(majloc=majloc, majfmt=majfmt, label='',\n default_limits=(datemin, datemax))", "response": "Return the axisinfo for the given unit."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef get_locator(self, dmin, dmax):\n 'Pick the best locator based on a distance.'\n _check_implicitly_registered()\n delta = relativedelta(dmax, dmin)\n\n num_days = (delta.years * 12.0 + delta.months) * 31.0 + delta.days\n num_sec = (delta.hours * 60.0 + delta.minutes) * 60.0 + delta.seconds\n tot_sec = num_days * 86400. + num_sec\n\n if abs(tot_sec) < self.minticks:\n self._freq = -1\n locator = MilliSecondLocator(self.tz)\n locator.set_axis(self.axis)\n\n locator.set_view_interval(*self.axis.get_view_interval())\n locator.set_data_interval(*self.axis.get_data_interval())\n return locator\n\n return dates.AutoDateLocator.get_locator(self, dmin, dmax)", "response": "Pick the best locator based on a distance."} {"instruction": "Can you implement a function in Python 3 that\nsets the view limits to include the data range.", "response": "def autoscale(self):\n \"\"\"\n Set the view limits to include the data range.\n \"\"\"\n dmin, dmax = self.datalim_to_dt()\n if dmin > dmax:\n dmax, dmin = dmin, dmax\n\n # We need to cap at the endpoints of valid datetime\n\n # TODO(wesm): unused?\n\n # delta = relativedelta(dmax, dmin)\n # try:\n # start = dmin - delta\n # except ValueError:\n # start = _from_ordinal(1.0)\n\n # try:\n # stop = dmax + delta\n # except ValueError:\n # # The magic number!\n # stop = _from_ordinal(3652059.9999999)\n\n dmin, dmax = self.datalim_to_dt()\n\n vmin = dates.date2num(dmin)\n vmax = dates.date2num(dmax)\n\n return self.nonsingular(vmin, vmax)"} {"instruction": "Make a summary of the following Python 3 code\ndef _get_default_locs(self, vmin, vmax):\n \"Returns the default locations of ticks.\"\n\n if 
self.plot_obj.date_axis_info is None:\n self.plot_obj.date_axis_info = self.finder(vmin, vmax, self.freq)\n\n locator = self.plot_obj.date_axis_info\n\n if self.isminor:\n return np.compress(locator['min'], locator['val'])\n return np.compress(locator['maj'], locator['val'])", "response": "Returns the default locations of ticks."} {"instruction": "Write a Python 3 function that can\nset the view limits to the nearest multiples of base that contain the data.", "response": "def autoscale(self):\n \"\"\"\n Sets the view limits to the nearest multiples of base that contain the\n data.\n \"\"\"\n # requires matplotlib >= 0.98.0\n (vmin, vmax) = self.axis.get_data_interval()\n\n locs = self._get_default_locs(vmin, vmax)\n (vmin, vmax) = locs[[0, -1]]\n if vmin == vmax:\n vmin -= 1\n vmax += 1\n return nonsingular(vmin, vmax)"} {"instruction": "Can you implement a function in Python 3 that\nreturns the default ticks spacing.", "response": "def _set_default_format(self, vmin, vmax):\n \"Returns the default ticks spacing.\"\n\n if self.plot_obj.date_axis_info is None:\n self.plot_obj.date_axis_info = self.finder(vmin, vmax, self.freq)\n info = self.plot_obj.date_axis_info\n\n if self.isminor:\n format = np.compress(info['min'] & np.logical_not(info['maj']),\n info)\n else:\n format = np.compress(info['maj'], info)\n self.formatdict = {x: f for (x, _, _, f) in format}\n return self.formatdict"} {"instruction": "Write a Python 3 script to\nset the locations of the ticks", "response": "def set_locs(self, locs):\n 'Sets the locations of the ticks'\n # don't actually use the locs. This is just needed to work with\n # matplotlib. 
Force to use vmin, vmax\n _check_implicitly_registered()\n\n self.locs = locs\n\n (vmin, vmax) = vi = tuple(self.axis.get_view_interval())\n if vi != self.plot_obj.view_interval:\n self.plot_obj.date_axis_info = None\n self.plot_obj.view_interval = vi\n if vmax < vmin:\n (vmin, vmax) = (vmax, vmin)\n self._set_default_format(vmin, vmax)"} {"instruction": "Create a Python 3 function to\nset index names to index or level_x for regular or multi index.", "response": "def set_default_names(data):\n \"\"\"Sets index names to 'index' for regular, or 'level_x' for Multi\"\"\"\n if com._all_not_none(*data.index.names):\n nms = data.index.names\n if len(nms) == 1 and data.index.name == 'index':\n warnings.warn(\"Index name of 'index' is not round-trippable\")\n elif len(nms) > 1 and any(x.startswith('level_') for x in nms):\n warnings.warn(\"Index names beginning with 'level_' are not \"\n \"round-trippable\")\n return data\n\n data = data.copy()\n if data.index.nlevels > 1:\n names = [name if name is not None else 'level_{}'.format(i)\n for i, name in enumerate(data.index.names)]\n data.index.names = names\n else:\n data.index.name = data.index.name or 'index'\n return data"} {"instruction": "Write a Python 3 function that can\nconvert a JSON field descriptor into its corresponding NumPy type.", "response": "def convert_json_field_to_pandas_type(field):\n \"\"\"\n Converts a JSON field descriptor into its corresponding NumPy / pandas type\n\n Parameters\n ----------\n field\n A JSON field descriptor\n\n Returns\n -------\n dtype\n\n Raises\n -----\n ValueError\n If the type of the provided field is unknown or currently unsupported\n\n Examples\n --------\n >>> convert_json_field_to_pandas_type({'name': 'an_int',\n 'type': 'integer'})\n 'int64'\n >>> convert_json_field_to_pandas_type({'name': 'a_categorical',\n 'type': 'any',\n 'contraints': {'enum': [\n 'a', 'b', 'c']},\n 'ordered': True})\n 'CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)'\n >>> 
convert_json_field_to_pandas_type({'name': 'a_datetime',\n 'type': 'datetime'})\n 'datetime64[ns]'\n >>> convert_json_field_to_pandas_type({'name': 'a_datetime_with_tz',\n 'type': 'datetime',\n 'tz': 'US/Central'})\n 'datetime64[ns, US/Central]'\n \"\"\"\n typ = field['type']\n if typ == 'string':\n return 'object'\n elif typ == 'integer':\n return 'int64'\n elif typ == 'number':\n return 'float64'\n elif typ == 'boolean':\n return 'bool'\n elif typ == 'duration':\n return 'timedelta64'\n elif typ == 'datetime':\n if field.get('tz'):\n return 'datetime64[ns, {tz}]'.format(tz=field['tz'])\n else:\n return 'datetime64[ns]'\n elif typ == 'any':\n if 'constraints' in field and 'ordered' in field:\n return CategoricalDtype(categories=field['constraints']['enum'],\n ordered=field['ordered'])\n else:\n return 'object'\n\n raise ValueError(\"Unsupported or invalid field type: {}\".format(typ))"} {"instruction": "Can you create a Python 3 function that\nbuilds a schema from a Series DataFrame.", "response": "def build_table_schema(data, index=True, primary_key=None, version=True):\n \"\"\"\n Create a Table schema from ``data``.\n\n Parameters\n ----------\n data : Series, DataFrame\n index : bool, default True\n Whether to include ``data.index`` in the schema.\n primary_key : bool or None, default True\n column names to designate as the primary key.\n The default `None` will set `'primaryKey'` to the index\n level or levels if the index is unique.\n version : bool, default True\n Whether to include a field `pandas_version` with the version\n of pandas that generated the schema.\n\n Returns\n -------\n schema : dict\n\n Notes\n -----\n See `_as_json_table_type` for conversion types.\n Timedeltas as converted to ISO8601 duration format with\n 9 decimal places after the seconds field for nanosecond precision.\n\n Categoricals are converted to the `any` dtype, and use the `enum` field\n constraint to list the allowed values. 
The `ordered` attribute is included\n in an `ordered` field.\n\n Examples\n --------\n >>> df = pd.DataFrame(\n ... {'A': [1, 2, 3],\n ... 'B': ['a', 'b', 'c'],\n ... 'C': pd.date_range('2016-01-01', freq='d', periods=3),\n ... }, index=pd.Index(range(3), name='idx'))\n >>> build_table_schema(df)\n {'fields': [{'name': 'idx', 'type': 'integer'},\n {'name': 'A', 'type': 'integer'},\n {'name': 'B', 'type': 'string'},\n {'name': 'C', 'type': 'datetime'}],\n 'pandas_version': '0.20.0',\n 'primaryKey': ['idx']}\n \"\"\"\n if index is True:\n data = set_default_names(data)\n\n schema = {}\n fields = []\n\n if index:\n if data.index.nlevels > 1:\n for level in data.index.levels:\n fields.append(convert_pandas_type_to_json_field(level))\n else:\n fields.append(convert_pandas_type_to_json_field(data.index))\n\n if data.ndim > 1:\n for column, s in data.iteritems():\n fields.append(convert_pandas_type_to_json_field(s))\n else:\n fields.append(convert_pandas_type_to_json_field(data))\n\n schema['fields'] = fields\n if index and data.index.is_unique and primary_key is None:\n if data.index.nlevels == 1:\n schema['primaryKey'] = [data.index.name]\n else:\n schema['primaryKey'] = data.index.names\n elif primary_key is not None:\n schema['primaryKey'] = primary_key\n\n if version:\n schema['pandas_version'] = '0.20.0'\n return schema"} {"instruction": "Write a Python 3 function that can\nparse a JSON table schema and returns a DataFrame.", "response": "def parse_table_schema(json, precise_float):\n \"\"\"\n Builds a DataFrame from a given schema\n\n Parameters\n ----------\n json :\n A JSON table schema\n precise_float : boolean\n Flag controlling precision when decoding string to double values, as\n dictated by ``read_json``\n\n Returns\n -------\n df : DataFrame\n\n Raises\n ------\n NotImplementedError\n If the JSON table schema contains either timezone or timedelta data\n\n Notes\n -----\n Because :func:`DataFrame.to_json` uses the string 'index' to denote a\n name-less 
:class:`Index`, this function sets the name of the returned\n :class:`DataFrame` to ``None`` when said string is encountered with a\n normal :class:`Index`. For a :class:`MultiIndex`, the same limitation\n applies to any strings beginning with 'level_'. Therefore, an\n :class:`Index` name of 'index' and :class:`MultiIndex` names starting\n with 'level_' are not supported.\n\n See Also\n --------\n build_table_schema : Inverse function.\n pandas.read_json\n \"\"\"\n table = loads(json, precise_float=precise_float)\n col_order = [field['name'] for field in table['schema']['fields']]\n df = DataFrame(table['data'], columns=col_order)[col_order]\n\n dtypes = {field['name']: convert_json_field_to_pandas_type(field)\n for field in table['schema']['fields']}\n\n # Cannot directly use as_type with timezone data on object; raise for now\n if any(str(x).startswith('datetime64[ns, ') for x in dtypes.values()):\n raise NotImplementedError('table=\"orient\" can not yet read timezone '\n 'data')\n\n # No ISO constructor for Timedelta as of yet, so need to raise\n if 'timedelta64' in dtypes.values():\n raise NotImplementedError('table=\"orient\" can not yet read '\n 'ISO-formatted Timedelta data')\n\n df = df.astype(dtypes)\n\n if 'primaryKey' in table['schema']:\n df = df.set_index(table['schema']['primaryKey'])\n if len(df.index.names) == 1:\n if df.index.name == 'index':\n df.index.name = None\n else:\n df.index.names = [None if x.startswith('level_') else x for x in\n df.index.names]\n\n return df"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef get_op_result_name(left, right):\n # `left` is always a pd.Series when called from within ops\n if isinstance(right, (ABCSeries, pd.Index)):\n name = _maybe_match_name(left, right)\n else:\n name = left.name\n return name", "response": "Returns the name of the operation result that is appropriate for the given two objects."} {"instruction": "Can you create a Python 3 function that\ntries to 
find a name to attach to the result of an operation between a and b. If no name is found return None.", "response": "def _maybe_match_name(a, b):\n \"\"\"\n Try to find a name to attach to the result of an operation between\n a and b. If only one of these has a `name` attribute, return that\n name. Otherwise return a consensus name if they match of None if\n they have different names.\n\n Parameters\n ----------\n a : object\n b : object\n\n Returns\n -------\n name : str or None\n\n See Also\n --------\n pandas.core.common.consensus_name_attr\n \"\"\"\n a_has = hasattr(a, 'name')\n b_has = hasattr(b, 'name')\n if a_has and b_has:\n if a.name == b.name:\n return a.name\n else:\n # TODO: what if they both have np.nan for their names?\n return None\n elif a_has:\n return a.name\n elif b_has:\n return b.name\n return None"} {"instruction": "Write a Python 3 function that can\ncast non - pandas objects to pandas types to unify behavior of arithmetic and comparison operations.", "response": "def maybe_upcast_for_op(obj):\n \"\"\"\n Cast non-pandas objects to pandas types to unify behavior of arithmetic\n and comparison operations.\n\n Parameters\n ----------\n obj: object\n\n Returns\n -------\n out : object\n\n Notes\n -----\n Be careful to call this *after* determining the `name` attribute to be\n attached to the result of the arithmetic operation.\n \"\"\"\n if type(obj) is datetime.timedelta:\n # GH#22390 cast up to Timedelta to rely on Timedelta\n # implementation; otherwise operation against numeric-dtype\n # raises TypeError\n return pd.Timedelta(obj)\n elif isinstance(obj, np.timedelta64) and not isna(obj):\n # In particular non-nanosecond timedelta64 needs to be cast to\n # nanoseconds, or else we get undesired behavior like\n # np.timedelta64(3, 'D') / 2 == np.timedelta64(1, 'D')\n # The isna check is to avoid casting timedelta64(\"NaT\"), which would\n # return NaT and incorrectly be treated as a datetime-NaT.\n return pd.Timedelta(obj)\n elif isinstance(obj, 
np.ndarray) and is_timedelta64_dtype(obj):\n # GH#22390 Unfortunately we need to special-case right-hand\n # timedelta64 dtypes because numpy casts integer dtypes to\n # timedelta64 when operating with timedelta64\n return pd.TimedeltaIndex(obj)\n return obj"} {"instruction": "Create a Python 3 function to\nreturn a binary method that always raises a TypeError.", "response": "def make_invalid_op(name):\n \"\"\"\n Return a binary method that always raises a TypeError.\n\n Parameters\n ----------\n name : str\n\n Returns\n -------\n invalid_op : function\n \"\"\"\n def invalid_op(self, other=None):\n raise TypeError(\"cannot perform {name} with this index type: \"\n \"{typ}\".format(name=name, typ=type(self).__name__))\n\n invalid_op.__name__ = name\n return invalid_op"} {"instruction": "Can you implement a function in Python 3 that\ngenerates keyword arguments to pass to numexpr for the given operation.", "response": "def _gen_eval_kwargs(name):\n \"\"\"\n Find the keyword arguments to pass to numexpr for the given operation.\n\n Parameters\n ----------\n name : str\n\n Returns\n -------\n eval_kwargs : dict\n\n Examples\n --------\n >>> _gen_eval_kwargs(\"__add__\")\n {}\n\n >>> _gen_eval_kwargs(\"rtruediv\")\n {'reversed': True, 'truediv': True}\n \"\"\"\n kwargs = {}\n\n # Series and Panel appear to only pass __add__, __radd__, ...\n # but DataFrame gets both these dunder names _and_ non-dunder names\n # add, radd, ...\n name = name.replace('__', '')\n\n if name.startswith('r'):\n if name not in ['radd', 'rand', 'ror', 'rxor']:\n # Exclude commutative operations\n kwargs['reversed'] = True\n\n if name in ['truediv', 'rtruediv']:\n kwargs['truediv'] = True\n\n if name in ['ne']:\n kwargs['masker'] = True\n\n return kwargs"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _gen_fill_zeros(name):\n name = name.strip('__')\n if 'div' in name:\n # truediv, floordiv, div, and reversed variants\n fill_value = np.inf\n elif 'mod' in 
name:\n # mod, rmod\n fill_value = np.nan\n else:\n fill_value = None\n return fill_value", "response": "Returns a fill value for undefined values in the results of the given operation caused by operating on the base object."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _get_opstr(op, cls):\n # numexpr is available for non-sparse classes\n subtyp = getattr(cls, '_subtyp', '')\n use_numexpr = 'sparse' not in subtyp\n\n if not use_numexpr:\n # if we're not using numexpr, then don't pass a str_rep\n return None\n\n return {operator.add: '+',\n radd: '+',\n operator.mul: '*',\n rmul: '*',\n operator.sub: '-',\n rsub: '-',\n operator.truediv: '/',\n rtruediv: '/',\n operator.floordiv: '//',\n rfloordiv: '//',\n operator.mod: None, # TODO: Why None for mod but '%' for rmod?\n rmod: '%',\n operator.pow: '**',\n rpow: '**',\n operator.eq: '==',\n operator.ne: '!=',\n operator.le: '<=',\n operator.lt: '<',\n operator.ge: '>=',\n operator.gt: '>',\n operator.and_: '&',\n rand_: '&',\n operator.or_: '|',\n ror_: '|',\n operator.xor: '^',\n rxor: '^',\n divmod: None,\n rdivmod: None}[op]", "response": "Returns the string representation of the given binary operator and class."} {"instruction": "Write a Python 3 function that can\nreturn the name of the operator that should be used for this method.", "response": "def _get_op_name(op, special):\n \"\"\"\n Find the name to attach to this method according to conventions\n for special and non-special methods.\n\n Parameters\n ----------\n op : binary operator\n special : bool\n\n Returns\n -------\n op_name : str\n \"\"\"\n opname = op.__name__.strip('_')\n if special:\n opname = '__{opname}__'.format(opname=opname)\n return opname"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _make_flex_doc(op_name, typ):\n op_name = op_name.replace('__', '')\n op_desc = _op_descriptions[op_name]\n\n if op_desc['reversed']:\n equiv = 'other ' + 
op_desc['op'] + ' ' + typ\n else:\n equiv = typ + ' ' + op_desc['op'] + ' other'\n\n if typ == 'series':\n base_doc = _flex_doc_SERIES\n doc_no_examples = base_doc.format(\n desc=op_desc['desc'],\n op_name=op_name,\n equiv=equiv,\n reverse=op_desc['reverse']\n )\n if op_desc['series_examples']:\n doc = doc_no_examples + op_desc['series_examples']\n else:\n doc = doc_no_examples\n elif typ == 'dataframe':\n base_doc = _flex_doc_FRAME\n doc = base_doc.format(\n desc=op_desc['desc'],\n op_name=op_name,\n equiv=equiv,\n reverse=op_desc['reverse']\n )\n elif typ == 'panel':\n base_doc = _flex_doc_PANEL\n doc = base_doc.format(\n desc=op_desc['desc'],\n op_name=op_name,\n equiv=equiv,\n reverse=op_desc['reverse']\n )\n else:\n raise AssertionError('Invalid typ argument.')\n return doc", "response": "Create a flexible docstring for the given operation and class - typ."} {"instruction": "How would you explain what the following Python 3 function does\ndef fill_binop(left, right, fill_value):\n # TODO: can we make a no-copy implementation?\n if fill_value is not None:\n left_mask = isna(left)\n right_mask = isna(right)\n left = left.copy()\n right = right.copy()\n\n # one but not both\n mask = left_mask ^ right_mask\n left[left_mask & mask] = fill_value\n right[right_mask & mask] = fill_value\n return left, right", "response": "Fill the values in left and right with fill_value."} {"instruction": "Explain what the following Python 3 code does\ndef mask_cmp_op(x, y, op, allowed_types):\n # TODO: Can we make the allowed_types arg unnecessary?\n xrav = x.ravel()\n result = np.empty(x.size, dtype=bool)\n if isinstance(y, allowed_types):\n yrav = y.ravel()\n mask = notna(xrav) & notna(yrav)\n result[mask] = op(np.array(list(xrav[mask])),\n np.array(list(yrav[mask])))\n else:\n mask = notna(xrav)\n result[mask] = op(np.array(list(xrav[mask])), y)\n\n if op == operator.ne: # pragma: no cover\n np.putmask(result, ~mask, True)\n else:\n np.putmask(result, ~mask, False)\n result = 
result.reshape(x.shape)\n return result", "response": "Masks the function op to only non - null points in x and y."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef masked_arith_op(x, y, op):\n # For Series `x` is 1D so ravel() is a no-op; calling it anyway makes\n # the logic valid for both Series and DataFrame ops.\n xrav = x.ravel()\n assert isinstance(x, (np.ndarray, ABCSeries)), type(x)\n if isinstance(y, (np.ndarray, ABCSeries, ABCIndexClass)):\n dtype = find_common_type([x.dtype, y.dtype])\n result = np.empty(x.size, dtype=dtype)\n\n # PeriodIndex.ravel() returns int64 dtype, so we have\n # to work around that case. See GH#19956\n yrav = y if is_period_dtype(y) else y.ravel()\n mask = notna(xrav) & notna(yrav)\n\n if yrav.shape != mask.shape:\n # FIXME: GH#5284, GH#5035, GH#19448\n # Without specifically raising here we get mismatched\n # errors in Py3 (TypeError) vs Py2 (ValueError)\n # Note: Only = an issue in DataFrame case\n raise ValueError('Cannot broadcast operands together.')\n\n if mask.any():\n with np.errstate(all='ignore'):\n result[mask] = op(xrav[mask],\n com.values_from_object(yrav[mask]))\n\n else:\n assert is_scalar(y), type(y)\n assert isinstance(x, np.ndarray), type(x)\n # mask is only meaningful for x\n result = np.empty(x.size, dtype=x.dtype)\n mask = notna(xrav)\n\n # 1 ** np.nan is 1. 
So we have to unmask those.\n if op == pow:\n mask = np.where(x == 1, False, mask)\n elif op == rpow:\n mask = np.where(y == 1, False, mask)\n\n if mask.any():\n with np.errstate(all='ignore'):\n result[mask] = op(xrav[mask], y)\n\n result, changed = maybe_upcast_putmask(result, ~mask, np.nan)\n result = result.reshape(x.shape) # 2D compat\n return result", "response": "Performs an arithmetic operation on the given array x and y."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef invalid_comparison(left, right, op):\n if op is operator.eq:\n res_values = np.zeros(left.shape, dtype=bool)\n elif op is operator.ne:\n res_values = np.ones(left.shape, dtype=bool)\n else:\n raise TypeError(\"Invalid comparison between dtype={dtype} and {typ}\"\n .format(dtype=left.dtype, typ=type(right).__name__))\n return res_values", "response": "Returns a array - like array - like object with the values of the elements in the array - like object left and right that are not necessarily meaningful."} {"instruction": "Write a Python 3 function that can\ndetermine if a DataFrame operation should dispatch to its Series counterpart.", "response": "def should_series_dispatch(left, right, op):\n \"\"\"\n Identify cases where a DataFrame operation should dispatch to its\n Series counterpart.\n\n Parameters\n ----------\n left : DataFrame\n right : DataFrame\n op : binary operator\n\n Returns\n -------\n override : bool\n \"\"\"\n if left._is_mixed_type or right._is_mixed_type:\n return True\n\n if not len(left.columns) or not len(right.columns):\n # ensure obj.dtypes[0] exists for each obj\n return False\n\n ldtype = left.dtypes.iloc[0]\n rdtype = right.dtypes.iloc[0]\n\n if ((is_timedelta64_dtype(ldtype) and is_integer_dtype(rdtype)) or\n (is_timedelta64_dtype(rdtype) and is_integer_dtype(ldtype))):\n # numpy integer dtypes as timedelta64 dtypes in this scenario\n return True\n\n if is_datetime64_dtype(ldtype) and is_object_dtype(rdtype):\n # in particular case 
where right is an array of DateOffsets\n return True\n\n return False"} {"instruction": "How would you implement a function in Python 3 that\ndispatches the given function to the Series implementation.", "response": "def dispatch_to_series(left, right, func, str_rep=None, axis=None):\n \"\"\"\n Evaluate the frame operation func(left, right) by evaluating\n column-by-column, dispatching to the Series implementation.\n\n Parameters\n ----------\n left : DataFrame\n right : scalar or DataFrame\n func : arithmetic or comparison operator\n str_rep : str or None, default None\n axis : {None, 0, 1, \"index\", \"columns\"}\n\n Returns\n -------\n DataFrame\n \"\"\"\n # Note: we use iloc to access columns for compat with cases\n # with non-unique columns.\n import pandas.core.computation.expressions as expressions\n\n right = lib.item_from_zerodim(right)\n if lib.is_scalar(right) or np.ndim(right) == 0:\n\n def column_op(a, b):\n return {i: func(a.iloc[:, i], b)\n for i in range(len(a.columns))}\n\n elif isinstance(right, ABCDataFrame):\n assert right._indexed_same(left)\n\n def column_op(a, b):\n return {i: func(a.iloc[:, i], b.iloc[:, i])\n for i in range(len(a.columns))}\n\n elif isinstance(right, ABCSeries) and axis == \"columns\":\n # We only get here if called via left._combine_match_columns,\n # in which case we specifically want to operate row-by-row\n assert right.index.equals(left.columns)\n\n def column_op(a, b):\n return {i: func(a.iloc[:, i], b.iloc[i])\n for i in range(len(a.columns))}\n\n elif isinstance(right, ABCSeries):\n assert right.index.equals(left.index) # Handle other cases later\n\n def column_op(a, b):\n return {i: func(a.iloc[:, i], b)\n for i in range(len(a.columns))}\n\n else:\n # Remaining cases have less-obvious dispatch rules\n raise NotImplementedError(right)\n\n new_data = expressions.evaluate(column_op, str_rep, left, right)\n\n result = left._constructor(new_data, index=left.index, copy=False)\n # Pin columns instead of passing to 
constructor for compat with\n # non-unique columns case\n result.columns = left.columns\n return result"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef dispatch_to_index_op(op, left, right, index_class):\n left_idx = index_class(left)\n\n # avoid accidentally allowing integer add/sub. For datetime64[tz] dtypes,\n # left_idx may inherit a freq from a cached DatetimeIndex.\n # See discussion in GH#19147.\n if getattr(left_idx, 'freq', None) is not None:\n left_idx = left_idx._shallow_copy(freq=None)\n try:\n result = op(left_idx, right)\n except NullFrequencyError:\n # DatetimeIndex and TimedeltaIndex with freq == None raise ValueError\n # on add/sub of integers (or int-like). We re-raise as a TypeError.\n raise TypeError('incompatible type for a datetime/timedelta '\n 'operation [{name}]'.format(name=op.__name__))\n return result", "response": "Dispatch an operation to the index implementation."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef dispatch_to_extension_op(op, left, right):\n\n # The op calls will raise TypeError if the op is not defined\n # on the ExtensionArray\n\n # unbox Series and Index to arrays\n if isinstance(left, (ABCSeries, ABCIndexClass)):\n new_left = left._values\n else:\n new_left = left\n\n if isinstance(right, (ABCSeries, ABCIndexClass)):\n new_right = right._values\n else:\n new_right = right\n\n res_values = op(new_left, new_right)\n res_name = get_op_result_name(left, right)\n\n if op.__name__ in ['divmod', 'rdivmod']:\n return _construct_divmod_result(\n left, res_values, left.index, res_name)\n\n return _construct_result(left, res_values, left.index, res_name)", "response": "Dispatch an operator to the Series or Index of an ExtensionArray."} {"instruction": "Can you write a function in Python 3 where it\nfinds the appropriate operation - wrapping functions to use for defining flex or special functions with the given class.", "response": "def 
_get_method_wrappers(cls):\n \"\"\"\n Find the appropriate operation-wrappers to use when defining flex/special\n arithmetic, boolean, and comparison operations with the given class.\n\n Parameters\n ----------\n cls : class\n\n Returns\n -------\n arith_flex : function or None\n comp_flex : function or None\n arith_special : function\n comp_special : function\n bool_special : function\n\n Notes\n -----\n None is only returned for SparseArray\n \"\"\"\n if issubclass(cls, ABCSparseSeries):\n # Be sure to catch this before ABCSeries and ABCSparseArray,\n # as they will both come see SparseSeries as a subclass\n arith_flex = _flex_method_SERIES\n comp_flex = _flex_method_SERIES\n arith_special = _arith_method_SPARSE_SERIES\n comp_special = _arith_method_SPARSE_SERIES\n bool_special = _bool_method_SERIES\n # TODO: I don't think the functions defined by bool_method are tested\n elif issubclass(cls, ABCSeries):\n # Just Series; SparseSeries is caught above\n arith_flex = _flex_method_SERIES\n comp_flex = _flex_method_SERIES\n arith_special = _arith_method_SERIES\n comp_special = _comp_method_SERIES\n bool_special = _bool_method_SERIES\n elif issubclass(cls, ABCSparseArray):\n arith_flex = None\n comp_flex = None\n arith_special = _arith_method_SPARSE_ARRAY\n comp_special = _arith_method_SPARSE_ARRAY\n bool_special = _arith_method_SPARSE_ARRAY\n elif issubclass(cls, ABCPanel):\n arith_flex = _flex_method_PANEL\n comp_flex = _comp_method_PANEL\n arith_special = _arith_method_PANEL\n comp_special = _comp_method_PANEL\n bool_special = _arith_method_PANEL\n elif issubclass(cls, ABCDataFrame):\n # Same for DataFrame and SparseDataFrame\n arith_flex = _arith_method_FRAME\n comp_flex = _flex_comp_method_FRAME\n arith_special = _arith_method_FRAME\n comp_special = _comp_method_FRAME\n bool_special = _arith_method_FRAME\n return arith_flex, comp_flex, arith_special, comp_special, bool_special"} {"instruction": "How would you code a function in Python 3 to\nadd the full suite of 
special arithmetic methods to the class.", "response": "def add_special_arithmetic_methods(cls):\n \"\"\"\n Adds the full suite of special arithmetic methods (``__add__``,\n ``__sub__``, etc.) to the class.\n\n Parameters\n ----------\n cls : class\n special methods will be defined and pinned to this class\n \"\"\"\n _, _, arith_method, comp_method, bool_method = _get_method_wrappers(cls)\n new_methods = _create_methods(cls, arith_method, comp_method, bool_method,\n special=True)\n # inplace operators (I feel like these should get passed an `inplace=True`\n # or just be removed\n\n def _wrap_inplace_method(method):\n \"\"\"\n return an inplace wrapper for this method\n \"\"\"\n\n def f(self, other):\n result = method(self, other)\n\n # this makes sure that we are aligned like the input\n # we are updating inplace so we want to ignore is_copy\n self._update_inplace(result.reindex_like(self, copy=False)._data,\n verify_is_copy=False)\n\n return self\n\n f.__name__ = \"__i{name}__\".format(name=method.__name__.strip(\"__\"))\n return f\n\n new_methods.update(\n dict(__iadd__=_wrap_inplace_method(new_methods[\"__add__\"]),\n __isub__=_wrap_inplace_method(new_methods[\"__sub__\"]),\n __imul__=_wrap_inplace_method(new_methods[\"__mul__\"]),\n __itruediv__=_wrap_inplace_method(new_methods[\"__truediv__\"]),\n __ifloordiv__=_wrap_inplace_method(new_methods[\"__floordiv__\"]),\n __imod__=_wrap_inplace_method(new_methods[\"__mod__\"]),\n __ipow__=_wrap_inplace_method(new_methods[\"__pow__\"])))\n\n new_methods.update(\n dict(__iand__=_wrap_inplace_method(new_methods[\"__and__\"]),\n __ior__=_wrap_inplace_method(new_methods[\"__or__\"]),\n __ixor__=_wrap_inplace_method(new_methods[\"__xor__\"])))\n\n add_methods(cls, new_methods=new_methods)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef add_flex_arithmetic_methods(cls):\n flex_arith_method, flex_comp_method, _, _, _ = _get_method_wrappers(cls)\n new_methods = _create_methods(cls, 
flex_arith_method,\n flex_comp_method, bool_method=None,\n special=False)\n new_methods.update(dict(multiply=new_methods['mul'],\n subtract=new_methods['sub'],\n divide=new_methods['div']))\n # opt out of bool flex methods for now\n assert not any(kname in new_methods for kname in ('ror_', 'rxor', 'rand_'))\n\n add_methods(cls, new_methods=new_methods)", "response": "Adds the full suite of flex arithmetic methods to the class."} {"instruction": "Create a Python 3 function for\naligning lhs and rhs Series", "response": "def _align_method_SERIES(left, right, align_asobject=False):\n \"\"\" align lhs and rhs Series \"\"\"\n\n # ToDo: Different from _align_method_FRAME, list, tuple and ndarray\n # are not coerced here\n # because Series has inconsistencies described in #13637\n\n if isinstance(right, ABCSeries):\n # avoid repeated alignment\n if not left.index.equals(right.index):\n\n if align_asobject:\n # to keep original value's dtype for bool ops\n left = left.astype(object)\n right = right.astype(object)\n\n left, right = left.align(right, copy=False)\n\n return left, right"} {"instruction": "How would you code a function in Python 3 to\nconstruct a raw op result object.", "response": "def _construct_result(left, result, index, name, dtype=None):\n \"\"\"\n If the raw op result has a non-None name (e.g. 
it is an Index object) and\n the name argument is None, then passing name to the constructor will\n not be enough; we still need to override the name attribute.\n \"\"\"\n out = left._constructor(result, index=index, dtype=dtype)\n out = out.__finalize__(left)\n out.name = name\n return out"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _arith_method_SERIES(cls, op, special):\n str_rep = _get_opstr(op, cls)\n op_name = _get_op_name(op, special)\n eval_kwargs = _gen_eval_kwargs(op_name)\n fill_zeros = _gen_fill_zeros(op_name)\n construct_result = (_construct_divmod_result\n if op in [divmod, rdivmod] else _construct_result)\n\n def na_op(x, y):\n import pandas.core.computation.expressions as expressions\n try:\n result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs)\n except TypeError:\n result = masked_arith_op(x, y, op)\n\n result = missing.fill_zeros(result, x, y, op_name, fill_zeros)\n return result\n\n def safe_na_op(lvalues, rvalues):\n \"\"\"\n return the result of evaluating na_op on the passed in values\n\n try coercion to object type if the native types are not compatible\n\n Parameters\n ----------\n lvalues : array-like\n rvalues : array-like\n\n Raises\n ------\n TypeError: invalid operation\n \"\"\"\n try:\n with np.errstate(all='ignore'):\n return na_op(lvalues, rvalues)\n except Exception:\n if is_object_dtype(lvalues):\n return libalgos.arrmap_object(lvalues,\n lambda x: op(x, rvalues))\n raise\n\n def wrapper(left, right):\n if isinstance(right, ABCDataFrame):\n return NotImplemented\n\n left, right = _align_method_SERIES(left, right)\n res_name = get_op_result_name(left, right)\n right = maybe_upcast_for_op(right)\n\n if is_categorical_dtype(left):\n raise TypeError(\"{typ} cannot perform the operation \"\n \"{op}\".format(typ=type(left).__name__, op=str_rep))\n\n elif is_datetime64_dtype(left) or is_datetime64tz_dtype(left):\n # Give dispatch_to_index_op a chance for tests like\n # 
test_dt64_series_add_intlike, which the index dispatching handles\n # specifically.\n result = dispatch_to_index_op(op, left, right, pd.DatetimeIndex)\n return construct_result(left, result,\n index=left.index, name=res_name,\n dtype=result.dtype)\n\n elif (is_extension_array_dtype(left) or\n (is_extension_array_dtype(right) and not is_scalar(right))):\n # GH#22378 disallow scalar to exclude e.g. \"category\", \"Int64\"\n return dispatch_to_extension_op(op, left, right)\n\n elif is_timedelta64_dtype(left):\n result = dispatch_to_index_op(op, left, right, pd.TimedeltaIndex)\n return construct_result(left, result,\n index=left.index, name=res_name)\n\n elif is_timedelta64_dtype(right):\n # We should only get here with non-scalar or timedelta64('NaT')\n # values for right\n # Note: we cannot use dispatch_to_index_op because\n # that may incorrectly raise TypeError when we\n # should get NullFrequencyError\n result = op(pd.Index(left), right)\n return construct_result(left, result,\n index=left.index, name=res_name,\n dtype=result.dtype)\n\n lvalues = left.values\n rvalues = right\n if isinstance(rvalues, ABCSeries):\n rvalues = rvalues.values\n\n result = safe_na_op(lvalues, rvalues)\n return construct_result(left, result,\n index=left.index, name=res_name, dtype=None)\n\n wrapper.__name__ = op_name\n return wrapper", "response": "Wrapper function for arithmetic operations for Series arithmetic operations."} {"instruction": "Create a Python 3 function for\nwrapping function for Series arithmetic operations to avoid code duplication.", "response": "def _comp_method_SERIES(cls, op, special):\n \"\"\"\n Wrapper function for Series arithmetic operations, to avoid\n code duplication.\n \"\"\"\n op_name = _get_op_name(op, special)\n masker = _gen_eval_kwargs(op_name).get('masker', False)\n\n def na_op(x, y):\n # TODO:\n # should have guarantess on what x, y can be type-wise\n # Extension Dtypes are not called here\n\n # Checking that cases that were once handled here are no 
longer\n # reachable.\n assert not (is_categorical_dtype(y) and not is_scalar(y))\n\n if is_object_dtype(x.dtype):\n result = _comp_method_OBJECT_ARRAY(op, x, y)\n\n elif is_datetimelike_v_numeric(x, y):\n return invalid_comparison(x, y, op)\n\n else:\n\n # we want to compare like types\n # we only want to convert to integer like if\n # we are not NotImplemented, otherwise\n # we would allow datetime64 (but viewed as i8) against\n # integer comparisons\n\n # we have a datetime/timedelta and may need to convert\n assert not needs_i8_conversion(x)\n mask = None\n if not is_scalar(y) and needs_i8_conversion(y):\n mask = isna(x) | isna(y)\n y = y.view('i8')\n x = x.view('i8')\n\n method = getattr(x, op_name, None)\n if method is not None:\n with np.errstate(all='ignore'):\n result = method(y)\n if result is NotImplemented:\n return invalid_comparison(x, y, op)\n else:\n result = op(x, y)\n\n if mask is not None and mask.any():\n result[mask] = masker\n\n return result\n\n def wrapper(self, other, axis=None):\n # Validate the axis parameter\n if axis is not None:\n self._get_axis_number(axis)\n\n res_name = get_op_result_name(self, other)\n\n if isinstance(other, list):\n # TODO: same for tuples?\n other = np.asarray(other)\n\n if isinstance(other, ABCDataFrame): # pragma: no cover\n # Defer to DataFrame implementation; fail early\n return NotImplemented\n\n elif isinstance(other, ABCSeries) and not self._indexed_same(other):\n raise ValueError(\"Can only compare identically-labeled \"\n \"Series objects\")\n\n elif is_categorical_dtype(self):\n # Dispatch to Categorical implementation; pd.CategoricalIndex\n # behavior is non-canonical GH#19513\n res_values = dispatch_to_index_op(op, self, other, pd.Categorical)\n return self._constructor(res_values, index=self.index,\n name=res_name)\n\n elif is_datetime64_dtype(self) or is_datetime64tz_dtype(self):\n # Dispatch to DatetimeIndex to ensure identical\n # Series/Index behavior\n if (isinstance(other, datetime.date) and\n 
not isinstance(other, datetime.datetime)):\n # https://github.com/pandas-dev/pandas/issues/21152\n # Compatibility for difference between Series comparison w/\n # datetime and date\n msg = (\n \"Comparing Series of datetimes with 'datetime.date'. \"\n \"Currently, the 'datetime.date' is coerced to a \"\n \"datetime. In the future pandas will not coerce, \"\n \"and {future}. \"\n \"To retain the current behavior, \"\n \"convert the 'datetime.date' to a datetime with \"\n \"'pd.Timestamp'.\"\n )\n\n if op in {operator.lt, operator.le, operator.gt, operator.ge}:\n future = \"a TypeError will be raised\"\n else:\n future = (\n \"'the values will not compare equal to the \"\n \"'datetime.date'\"\n )\n msg = '\\n'.join(textwrap.wrap(msg.format(future=future)))\n warnings.warn(msg, FutureWarning, stacklevel=2)\n other = pd.Timestamp(other)\n\n res_values = dispatch_to_index_op(op, self, other,\n pd.DatetimeIndex)\n\n return self._constructor(res_values, index=self.index,\n name=res_name)\n\n elif is_timedelta64_dtype(self):\n res_values = dispatch_to_index_op(op, self, other,\n pd.TimedeltaIndex)\n return self._constructor(res_values, index=self.index,\n name=res_name)\n\n elif (is_extension_array_dtype(self) or\n (is_extension_array_dtype(other) and not is_scalar(other))):\n # Note: the `not is_scalar(other)` condition rules out\n # e.g. 
other == \"category\"\n return dispatch_to_extension_op(op, self, other)\n\n elif isinstance(other, ABCSeries):\n # By this point we have checked that self._indexed_same(other)\n res_values = na_op(self.values, other.values)\n # rename is needed in case res_name is None and res_values.name\n # is not.\n return self._constructor(res_values, index=self.index,\n name=res_name).rename(res_name)\n\n elif isinstance(other, (np.ndarray, pd.Index)):\n # do not check length of zerodim array\n # as it will broadcast\n if other.ndim != 0 and len(self) != len(other):\n raise ValueError('Lengths must match to compare')\n\n res_values = na_op(self.values, np.asarray(other))\n result = self._constructor(res_values, index=self.index)\n # rename is needed in case res_name is None and self.name\n # is not.\n return result.__finalize__(self).rename(res_name)\n\n elif is_scalar(other) and isna(other):\n # numpy does not like comparisons vs None\n if op is operator.ne:\n res_values = np.ones(len(self), dtype=bool)\n else:\n res_values = np.zeros(len(self), dtype=bool)\n return self._constructor(res_values, index=self.index,\n name=res_name, dtype='bool')\n\n else:\n values = self.get_values()\n\n with np.errstate(all='ignore'):\n res = na_op(values, other)\n if is_scalar(res):\n raise TypeError('Could not compare {typ} type with Series'\n .format(typ=type(other)))\n\n # always return a full value series here\n res_values = com.values_from_object(res)\n return self._constructor(res_values, index=self.index,\n name=res_name, dtype='bool')\n\n wrapper.__name__ = op_name\n return wrapper"} {"instruction": "Create a Python 3 function to\nwrap function for Series arithmetic operations to avoid code duplication.", "response": "def _bool_method_SERIES(cls, op, special):\n \"\"\"\n Wrapper function for Series arithmetic operations, to avoid\n code duplication.\n \"\"\"\n op_name = _get_op_name(op, special)\n\n def na_op(x, y):\n try:\n result = op(x, y)\n except TypeError:\n assert not 
isinstance(y, (list, ABCSeries, ABCIndexClass))\n if isinstance(y, np.ndarray):\n # bool-bool dtype operations should be OK, should not get here\n assert not (is_bool_dtype(x) and is_bool_dtype(y))\n x = ensure_object(x)\n y = ensure_object(y)\n result = libops.vec_binop(x, y, op)\n else:\n # let null fall thru\n assert lib.is_scalar(y)\n if not isna(y):\n y = bool(y)\n try:\n result = libops.scalar_binop(x, y, op)\n except (TypeError, ValueError, AttributeError,\n OverflowError, NotImplementedError):\n raise TypeError(\"cannot compare a dtyped [{dtype}] array \"\n \"with a scalar of type [{typ}]\"\n .format(dtype=x.dtype,\n typ=type(y).__name__))\n\n return result\n\n fill_int = lambda x: x.fillna(0)\n fill_bool = lambda x: x.fillna(False).astype(bool)\n\n def wrapper(self, other):\n is_self_int_dtype = is_integer_dtype(self.dtype)\n\n self, other = _align_method_SERIES(self, other, align_asobject=True)\n res_name = get_op_result_name(self, other)\n\n if isinstance(other, ABCDataFrame):\n # Defer to DataFrame implementation; fail early\n return NotImplemented\n\n elif isinstance(other, (ABCSeries, ABCIndexClass)):\n is_other_int_dtype = is_integer_dtype(other.dtype)\n other = fill_int(other) if is_other_int_dtype else fill_bool(other)\n\n ovalues = other.values\n finalizer = lambda x: x\n\n else:\n # scalars, list, tuple, np.array\n is_other_int_dtype = is_integer_dtype(np.asarray(other))\n if is_list_like(other) and not isinstance(other, np.ndarray):\n # TODO: Can we do this before the is_integer_dtype check?\n # could the is_integer_dtype check be checking the wrong\n # thing? e.g. other = [[0, 1], [2, 3], [4, 5]]?\n other = construct_1d_object_array_from_listlike(other)\n\n ovalues = other\n finalizer = lambda x: x.__finalize__(self)\n\n # For int vs int `^`, `|`, `&` are bitwise operators and return\n # integer dtypes. 
Otherwise these are boolean ops\n filler = (fill_int if is_self_int_dtype and is_other_int_dtype\n else fill_bool)\n res_values = na_op(self.values, ovalues)\n unfilled = self._constructor(res_values,\n index=self.index, name=res_name)\n filled = filler(unfilled)\n return finalizer(filled)\n\n wrapper.__name__ = op_name\n return wrapper"} {"instruction": "Write a Python 3 function that can\ncombine two Series objects using a binary operator func to self and other using alignment and fill_value conventions determined by the fill_value axis and level kwargs.", "response": "def _combine_series_frame(self, other, func, fill_value=None, axis=None,\n level=None):\n \"\"\"\n Apply binary operator `func` to self, other using alignment and fill\n conventions determined by the fill_value, axis, and level kwargs.\n\n Parameters\n ----------\n self : DataFrame\n other : Series\n func : binary operator\n fill_value : object, default None\n axis : {0, 1, 'columns', 'index', None}, default None\n level : int or None, default None\n\n Returns\n -------\n result : DataFrame\n \"\"\"\n if fill_value is not None:\n raise NotImplementedError(\"fill_value {fill} not supported.\"\n .format(fill=fill_value))\n\n if axis is not None:\n axis = self._get_axis_number(axis)\n if axis == 0:\n return self._combine_match_index(other, func, level=level)\n else:\n return self._combine_match_columns(other, func, level=level)\n else:\n if not len(other):\n return self * np.nan\n\n if not len(self):\n # Ambiguous case, use _series so works with DataFrame\n return self._constructor(data=self._series, index=self.index,\n columns=self.columns)\n\n # default axis is columns\n return self._combine_match_columns(other, func, level=level)"} {"instruction": "Can you implement a function in Python 3 that\naligns the method frame to meet lhs dims", "response": "def _align_method_FRAME(left, right, axis):\n \"\"\" convert rhs to meet lhs dims if input is list, tuple or np.ndarray \"\"\"\n\n def 
to_series(right):\n msg = ('Unable to coerce to Series, length must be {req_len}: '\n 'given {given_len}')\n if axis is not None and left._get_axis_name(axis) == 'index':\n if len(left.index) != len(right):\n raise ValueError(msg.format(req_len=len(left.index),\n given_len=len(right)))\n right = left._constructor_sliced(right, index=left.index)\n else:\n if len(left.columns) != len(right):\n raise ValueError(msg.format(req_len=len(left.columns),\n given_len=len(right)))\n right = left._constructor_sliced(right, index=left.columns)\n return right\n\n if isinstance(right, np.ndarray):\n\n if right.ndim == 1:\n right = to_series(right)\n\n elif right.ndim == 2:\n if right.shape == left.shape:\n right = left._constructor(right, index=left.index,\n columns=left.columns)\n\n elif right.shape[0] == left.shape[0] and right.shape[1] == 1:\n # Broadcast across columns\n right = np.broadcast_to(right, left.shape)\n right = left._constructor(right,\n index=left.index,\n columns=left.columns)\n\n elif right.shape[1] == left.shape[1] and right.shape[0] == 1:\n # Broadcast along rows\n right = to_series(right[0, :])\n\n else:\n raise ValueError(\"Unable to coerce to DataFrame, shape \"\n \"must be {req_shape}: given {given_shape}\"\n .format(req_shape=left.shape,\n given_shape=right.shape))\n\n elif right.ndim > 2:\n raise ValueError('Unable to coerce to Series/DataFrame, dim '\n 'must be <= 2: {dim}'.format(dim=right.shape))\n\n elif (is_list_like(right) and\n not isinstance(right, (ABCSeries, ABCDataFrame))):\n # GH17901\n right = to_series(right)\n\n return right"} {"instruction": "How would you implement a function in Python 3 that\ncoerces left and right to float64 if result is expected to have NaN or inf values.", "response": "def _cast_sparse_series_op(left, right, opname):\n \"\"\"\n For SparseSeries operation, coerce to float64 if the result is expected\n to have NaN or inf values\n\n Parameters\n ----------\n left : SparseArray\n right : SparseArray\n opname : str\n\n 
Returns\n -------\n left : SparseArray\n right : SparseArray\n \"\"\"\n from pandas.core.sparse.api import SparseDtype\n\n opname = opname.strip('_')\n\n # TODO: This should be moved to the array?\n if is_integer_dtype(left) and is_integer_dtype(right):\n # series coerces to float64 if result should have NaN/inf\n if opname in ('floordiv', 'mod') and (right.values == 0).any():\n left = left.astype(SparseDtype(np.float64, left.fill_value))\n right = right.astype(SparseDtype(np.float64, right.fill_value))\n elif opname in ('rfloordiv', 'rmod') and (left.values == 0).any():\n left = left.astype(SparseDtype(np.float64, left.fill_value))\n right = right.astype(SparseDtype(np.float64, right.fill_value))\n\n return left, right"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _arith_method_SPARSE_SERIES(cls, op, special):\n op_name = _get_op_name(op, special)\n\n def wrapper(self, other):\n if isinstance(other, ABCDataFrame):\n return NotImplemented\n elif isinstance(other, ABCSeries):\n if not isinstance(other, ABCSparseSeries):\n other = other.to_sparse(fill_value=self.fill_value)\n return _sparse_series_op(self, other, op, op_name)\n elif is_scalar(other):\n with np.errstate(all='ignore'):\n new_values = op(self.values, other)\n return self._constructor(new_values,\n index=self.index,\n name=self.name)\n else: # pragma: no cover\n raise TypeError('operation with {other} not supported'\n .format(other=type(other)))\n\n wrapper.__name__ = op_name\n return wrapper", "response": "Wrapper function for arithmetic operations for Series arithmetic operations."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _arith_method_SPARSE_ARRAY(cls, op, special):\n op_name = _get_op_name(op, special)\n\n def wrapper(self, other):\n from pandas.core.arrays.sparse.array import (\n SparseArray, _sparse_array_op, _wrap_result, _get_fill)\n if isinstance(other, np.ndarray):\n if len(self) != len(other):\n raise 
AssertionError(\"length mismatch: {self} vs. {other}\"\n .format(self=len(self), other=len(other)))\n if not isinstance(other, SparseArray):\n dtype = getattr(other, 'dtype', None)\n other = SparseArray(other, fill_value=self.fill_value,\n dtype=dtype)\n return _sparse_array_op(self, other, op, op_name)\n elif is_scalar(other):\n with np.errstate(all='ignore'):\n fill = op(_get_fill(self), np.asarray(other))\n result = op(self.sp_values, other)\n\n return _wrap_result(op_name, result, self.sp_index, fill)\n else: # pragma: no cover\n raise TypeError('operation with {other} not supported'\n .format(other=type(other)))\n\n wrapper.__name__ = op_name\n return wrapper", "response": "Wrapper function for Series arithmetic operations."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef validate_periods(periods):\n if periods is not None:\n if lib.is_float(periods):\n periods = int(periods)\n elif not lib.is_integer(periods):\n raise TypeError('periods must be a number, got {periods}'\n .format(periods=periods))\n return periods", "response": "Validates that a periods argument is passed to the Datetime / Timedelta Array / Index\n constructor."} {"instruction": "Write a Python 3 script for\nvalidating that the given closed argument is among valid endpoints.", "response": "def validate_endpoints(closed):\n \"\"\"\n Check that the `closed` argument is among [None, \"left\", \"right\"]\n\n Parameters\n ----------\n closed : {None, \"left\", \"right\"}\n\n Returns\n -------\n left_closed : bool\n right_closed : bool\n\n Raises\n ------\n ValueError : if argument is not among valid values\n \"\"\"\n left_closed = False\n right_closed = False\n\n if closed is None:\n left_closed = True\n right_closed = True\n elif closed == \"left\":\n left_closed = True\n elif closed == \"right\":\n right_closed = True\n else:\n raise ValueError(\"Closed has to be either 'left', 'right' or None\")\n\n return left_closed, right_closed"} {"instruction": "Create a 
Python 3 function to\nvalidate that inferred frequency is valid.", "response": "def validate_inferred_freq(freq, inferred_freq, freq_infer):\n \"\"\"\n If the user passes a freq and another freq is inferred from passed data,\n require that they match.\n\n Parameters\n ----------\n freq : DateOffset or None\n inferred_freq : DateOffset or None\n freq_infer : bool\n\n Returns\n -------\n freq : DateOffset or None\n freq_infer : bool\n\n Notes\n -----\n We assume at this point that `maybe_infer_freq` has been called, so\n `freq` is either a DateOffset object or None.\n \"\"\"\n if inferred_freq is not None:\n if freq is not None and freq != inferred_freq:\n raise ValueError('Inferred frequency {inferred} from passed '\n 'values does not conform to passed frequency '\n '{passed}'\n .format(inferred=inferred_freq,\n passed=freq.freqstr))\n elif freq is None:\n freq = inferred_freq\n freq_infer = False\n\n return freq, freq_infer"} {"instruction": "Given the following Python 3 function, write the documentation\ndef maybe_infer_freq(freq):\n freq_infer = False\n if not isinstance(freq, DateOffset):\n # if a passed freq is None, don't infer automatically\n if freq != 'infer':\n freq = frequencies.to_offset(freq)\n else:\n freq_infer = True\n freq = None\n return freq, freq_infer", "response": "If a passed freq is None infer from the current object."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _ensure_datetimelike_to_i8(other, to_utc=False):\n from pandas import Index\n from pandas.core.arrays import PeriodArray\n\n if lib.is_scalar(other) and isna(other):\n return iNaT\n elif isinstance(other, (PeriodArray, ABCIndexClass,\n DatetimeLikeArrayMixin)):\n # convert tz if needed\n if getattr(other, 'tz', None) is not None:\n if to_utc:\n other = other.tz_convert('UTC')\n else:\n other = other.tz_localize(None)\n else:\n try:\n return np.array(other, copy=False).view('i8')\n except TypeError:\n # period array cannot be coerced to 
int\n other = Index(other)\n return other.asi8", "response": "Coerces a scalar or array to i8."} {"instruction": "Write a Python 3 script to\nconstruct a scalar type from a string.", "response": "def _scalar_from_string(\n self,\n value: str,\n ) -> Union[Period, Timestamp, Timedelta, NaTType]:\n \"\"\"\n Construct a scalar type from a string.\n\n Parameters\n ----------\n value : str\n\n Returns\n -------\n Period, Timestamp, or Timedelta, or NaT\n Whatever the type of ``self._scalar_type`` is.\n\n Notes\n -----\n This should call ``self._check_compatible_with`` before\n unboxing the result.\n \"\"\"\n raise AbstractMethodError(self)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _check_compatible_with(\n self,\n other: Union[Period, Timestamp, Timedelta, NaTType],\n ) -> None:\n \"\"\"\n Verify that `self` and `other` are compatible.\n\n * DatetimeArray verifies that the timezones (if any) match\n * PeriodArray verifies that the freq matches\n * Timedelta has no verification\n\n In each case, NaT is considered compatible.\n\n Parameters\n ----------\n other\n\n Raises\n ------\n Exception\n \"\"\"\n raise AbstractMethodError(self)", "response": "Checks that the two items are compatible with the current one."} {"instruction": "Create a Python 3 function to\nconvert to Index using specified date_format.", "response": "def strftime(self, date_format):\n \"\"\"\n Convert to Index using specified date_format.\n\n Return an Index of formatted strings specified by date_format, which\n supports the same string format as the python standard library. Details\n of the string format can be found in `python string format\n doc <%(URL)s>`__.\n\n Parameters\n ----------\n date_format : str\n Date format string (e.g. 
\"%%Y-%%m-%%d\").\n\n Returns\n -------\n Index\n Index of formatted strings.\n\n See Also\n --------\n to_datetime : Convert the given argument to datetime.\n DatetimeIndex.normalize : Return DatetimeIndex with times to midnight.\n DatetimeIndex.round : Round the DatetimeIndex to the specified freq.\n DatetimeIndex.floor : Floor the DatetimeIndex to the specified freq.\n\n Examples\n --------\n >>> rng = pd.date_range(pd.Timestamp(\"2018-03-10 09:00\"),\n ... periods=3, freq='s')\n >>> rng.strftime('%%B %%d, %%Y, %%r')\n Index(['March 10, 2018, 09:00:00 AM', 'March 10, 2018, 09:00:01 AM',\n 'March 10, 2018, 09:00:02 AM'],\n dtype='object')\n \"\"\"\n from pandas import Index\n return Index(self._format_native_types(date_format=date_format))"} {"instruction": "Write a Python 3 function that can\nfind indices where elements should be inserted to maintain order. Find the indices into a sorted array `self` such that, if the corresponding elements in `value` were inserted before the indices, the order of `self` would be preserved. Parameters ---------- value : array_like Values to insert into `self`. side : {'left', 'right'}, optional If 'left', the index of the first suitable location found is given. If 'right', return the last such index. If there is no suitable index, return either 0 or N (where N is the length of `self`). sorter : 1-D array_like, optional Optional array of integer indices that sort `self` into ascending order. They are typically the result of ``np.argsort``. 
Returns ------- indices : array of ints Array of insertion points with the same shape as `value`.", "response": "def searchsorted(self, value, side='left', sorter=None):\n \"\"\"\n Find indices where elements should be inserted to maintain order.\n\n Find the indices into a sorted array `self` such that, if the\n corresponding elements in `value` were inserted before the indices,\n the order of `self` would be preserved.\n\n Parameters\n ----------\n value : array_like\n Values to insert into `self`.\n side : {'left', 'right'}, optional\n If 'left', the index of the first suitable location found is given.\n If 'right', return the last such index. If there is no suitable\n index, return either 0 or N (where N is the length of `self`).\n sorter : 1-D array_like, optional\n Optional array of integer indices that sort `self` into ascending\n order. They are typically the result of ``np.argsort``.\n\n Returns\n -------\n indices : array of ints\n Array of insertion points with the same shape as `value`.\n \"\"\"\n if isinstance(value, str):\n value = self._scalar_from_string(value)\n\n if not (isinstance(value, (self._scalar_type, type(self)))\n or isna(value)):\n raise ValueError(\"Unexpected type for 'value': {valtype}\"\n .format(valtype=type(value)))\n\n self._check_compatible_with(value)\n if isinstance(value, type(self)):\n value = value.asi8\n else:\n value = self._unbox_scalar(value)\n\n return self.asi8.searchsorted(value, side=side, sorter=sorter)"} {"instruction": "Write a Python 3 script for\nrepeating elements of an array.", "response": "def repeat(self, repeats, *args, **kwargs):\n \"\"\"\n Repeat elements of an array.\n\n See Also\n --------\n numpy.ndarray.repeat\n \"\"\"\n nv.validate_repeat(args, kwargs)\n values = self._data.repeat(repeats)\n return type(self)(values.view('i8'), dtype=self.dtype)"} {"instruction": "Write a Python 3 script to\nreturn a Series containing counts of unique values.", "response": "def value_counts(self, dropna=False):\n 
\"\"\"\n Return a Series containing counts of unique values.\n\n Parameters\n ----------\n dropna : boolean, default True\n Don't include counts of NaT values.\n\n Returns\n -------\n Series\n \"\"\"\n from pandas import Series, Index\n\n if dropna:\n values = self[~self.isna()]._data\n else:\n values = self._data\n\n cls = type(self)\n\n result = value_counts(values, sort=False, dropna=dropna)\n index = Index(cls(result.index.view('i8'), dtype=self.dtype),\n name=result.index.name)\n return Series(result.values, index=index, name=result.name)"} {"instruction": "Can you write a function in Python 3 where it\nmasks the result array if needed.", "response": "def _maybe_mask_results(self, result, fill_value=iNaT, convert=None):\n \"\"\"\n Parameters\n ----------\n result : a ndarray\n fill_value : object, default iNaT\n convert : string/dtype or None\n\n Returns\n -------\n result : ndarray with values replace by the fill_value\n\n mask the result if needed, convert to the provided dtype if its not\n None\n\n This is an internal routine.\n \"\"\"\n\n if self._hasnans:\n if convert:\n result = result.astype(convert)\n if fill_value is None:\n fill_value = np.nan\n result[self._isnan] = fill_value\n return result"} {"instruction": "Can you write a function in Python 3 where it\nvalidates that a frequency of a given date is compatible with the values of a given index or a given TimedeltaIndex or TimedeltaIndex.", "response": "def _validate_frequency(cls, index, freq, **kwargs):\n \"\"\"\n Validate that a frequency is compatible with the values of a given\n Datetime Array/Index or Timedelta Array/Index\n\n Parameters\n ----------\n index : DatetimeIndex or TimedeltaIndex\n The index on which to determine if the given frequency is valid\n freq : DateOffset\n The frequency to validate\n \"\"\"\n if is_period_dtype(cls):\n # Frequency validation is not meaningful for Period Array/Index\n return None\n\n inferred = index.inferred_freq\n if index.size == 0 or inferred == 
freq.freqstr:\n return None\n\n try:\n on_freq = cls._generate_range(start=index[0], end=None,\n periods=len(index), freq=freq,\n **kwargs)\n if not np.array_equal(index.asi8, on_freq.asi8):\n raise ValueError\n except ValueError as e:\n if \"non-fixed\" in str(e):\n # non-fixed frequencies are not meaningful for timedelta64;\n # we retain that error message\n raise e\n # GH#11587 the main way this is reached is if the `np.array_equal`\n # check above is False. This can also be reached if index[0]\n # is `NaT`, in which case the call to `cls._generate_range` will\n # raise a ValueError, which we re-raise with a more targeted\n # message.\n raise ValueError('Inferred frequency {infer} from passed values '\n 'does not conform to passed frequency {passed}'\n .format(infer=inferred, passed=freq.freqstr))"} {"instruction": "How would you implement a function in Python 3 that\nadds a timedelta - like Tick or TimedeltaIndex - like object to self yielding an int64 numpy array containing the values of the delta.", "response": "def _add_delta(self, other):\n \"\"\"\n Add a timedelta-like, Tick or TimedeltaIndex-like object\n to self, yielding an int64 numpy array\n\n Parameters\n ----------\n delta : {timedelta, np.timedelta64, Tick,\n TimedeltaIndex, ndarray[timedelta64]}\n\n Returns\n -------\n result : ndarray[int64]\n\n Notes\n -----\n The result's name is set outside of _add_delta by the calling\n method (__add__ or __sub__), if necessary (i.e. 
for Indexes).\n \"\"\"\n if isinstance(other, (Tick, timedelta, np.timedelta64)):\n new_values = self._add_timedeltalike_scalar(other)\n elif is_timedelta64_dtype(other):\n # ndarray[timedelta64] or TimedeltaArray/index\n new_values = self._add_delta_tdi(other)\n\n return new_values"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _add_timedeltalike_scalar(self, other):\n if isna(other):\n # i.e np.timedelta64(\"NaT\"), not recognized by delta_to_nanoseconds\n new_values = np.empty(len(self), dtype='i8')\n new_values[:] = iNaT\n return new_values\n\n inc = delta_to_nanoseconds(other)\n new_values = checked_add_with_arr(self.asi8, inc,\n arr_mask=self._isnan).view('i8')\n new_values = self._maybe_mask_results(new_values)\n return new_values.view('i8')", "response": "Add a delta of a timedeltalike\n return the i8 result view\n "} {"instruction": "Write a Python 3 function that can\nadd a delta of a TimedeltaIndex return the i8 result view", "response": "def _add_delta_tdi(self, other):\n \"\"\"\n Add a delta of a TimedeltaIndex\n return the i8 result view\n \"\"\"\n if len(self) != len(other):\n raise ValueError(\"cannot add indices of unequal length\")\n\n if isinstance(other, np.ndarray):\n # ndarray[timedelta64]; wrap in TimedeltaIndex for op\n from pandas import TimedeltaIndex\n other = TimedeltaIndex(other)\n\n self_i8 = self.asi8\n other_i8 = other.asi8\n new_values = checked_add_with_arr(self_i8, other_i8,\n arr_mask=self._isnan,\n b_mask=other._isnan)\n if self._hasnans or other._hasnans:\n mask = (self._isnan) | (other._isnan)\n new_values[mask] = iNaT\n return new_values.view('i8')"} {"instruction": "Make a summary of the following Python 3 code\ndef _add_nat(self):\n if is_period_dtype(self):\n raise TypeError('Cannot add {cls} and {typ}'\n .format(cls=type(self).__name__,\n typ=type(NaT).__name__))\n\n # GH#19124 pd.NaT is treated like a timedelta for both timedelta\n # and datetime dtypes\n result = np.zeros(len(self), 
dtype=np.int64)\n result.fill(iNaT)\n return type(self)(result, dtype=self.dtype, freq=None)", "response": "Add pd. NaT to self\n "} {"instruction": "Can you create a Python 3 function that\nsubtracts pd. NaT from self", "response": "def _sub_nat(self):\n \"\"\"\n Subtract pd.NaT from self\n \"\"\"\n # GH#19124 Timedelta - datetime is not in general well-defined.\n # We make an exception for pd.NaT, which in this case quacks\n # like a timedelta.\n # For datetime64 dtypes by convention we treat NaT as a datetime, so\n # this subtraction returns a timedelta64 dtype.\n # For period dtype, timedelta64 is a close-enough return dtype.\n result = np.zeros(len(self), dtype=np.int64)\n result.fill(iNaT)\n return result.view('timedelta64[ns]')"} {"instruction": "Can you implement a function in Python 3 that\nsubtracts a Period Array or Index from self.", "response": "def _sub_period_array(self, other):\n \"\"\"\n Subtract a Period Array/Index from self. This is only valid if self\n is itself a Period Array/Index, raises otherwise. 
Both objects must\n have the same frequency.\n\n Parameters\n ----------\n other : PeriodIndex or PeriodArray\n\n Returns\n -------\n result : np.ndarray[object]\n Array of DateOffset objects; nulls represented by NaT.\n \"\"\"\n if not is_period_dtype(self):\n raise TypeError(\"cannot subtract {dtype}-dtype from {cls}\"\n .format(dtype=other.dtype,\n cls=type(self).__name__))\n\n if len(self) != len(other):\n raise ValueError(\"cannot subtract arrays/indices of \"\n \"unequal length\")\n if self.freq != other.freq:\n msg = DIFFERENT_FREQ.format(cls=type(self).__name__,\n own_freq=self.freqstr,\n other_freq=other.freqstr)\n raise IncompatibleFrequency(msg)\n\n new_values = checked_add_with_arr(self.asi8, -other.asi8,\n arr_mask=self._isnan,\n b_mask=other._isnan)\n\n new_values = np.array([self.freq.base * x for x in new_values])\n if self._hasnans or other._hasnans:\n mask = (self._isnan) | (other._isnan)\n new_values[mask] = NaT\n return new_values"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _addsub_int_array(self, other, op):\n # _addsub_int_array is overriden by PeriodArray\n assert not is_period_dtype(self)\n assert op in [operator.add, operator.sub]\n\n if self.freq is None:\n # GH#19123\n raise NullFrequencyError(\"Cannot shift with no freq\")\n\n elif isinstance(self.freq, Tick):\n # easy case where we can convert to timedelta64 operation\n td = Timedelta(self.freq)\n return op(self, td * other)\n\n # We should only get here with DatetimeIndex; dispatch\n # to _addsub_offset_array\n assert not is_timedelta64_dtype(self)\n return op(self, np.array(other) * self.freq)", "response": "Add or subtract array - like of integers equivalent to applying _time_shift pointwise."} {"instruction": "Write a Python 3 script to\nadd or subtract array-like of DateOffset objects Parameters ---------- other : Index, np.ndarray object-dtype containing pd.DateOffset objects op : {operator.add, operator.sub} Returns ------- result 
: same class as self", "response": "def _addsub_offset_array(self, other, op):\n \"\"\"\n Add or subtract array-like of DateOffset objects\n\n Parameters\n ----------\n other : Index, np.ndarray\n object-dtype containing pd.DateOffset objects\n op : {operator.add, operator.sub}\n\n Returns\n -------\n result : same class as self\n \"\"\"\n assert op in [operator.add, operator.sub]\n if len(other) == 1:\n return op(self, other[0])\n\n warnings.warn(\"Adding/subtracting array of DateOffsets to \"\n \"{cls} not vectorized\"\n .format(cls=type(self).__name__), PerformanceWarning)\n\n # For EA self.astype('O') returns a numpy array, not an Index\n left = lib.values_from_object(self.astype('O'))\n\n res_values = op(left, np.array(other))\n kwargs = {}\n if not is_period_dtype(self):\n kwargs['freq'] = 'infer'\n return self._from_sequence(res_values, **kwargs)"} {"instruction": "Make a summary of the following Python 3 code\ndef _time_shift(self, periods, freq=None):\n if freq is not None and freq != self.freq:\n if isinstance(freq, str):\n freq = frequencies.to_offset(freq)\n offset = periods * freq\n result = self + offset\n return result\n\n if periods == 0:\n # immutable so OK\n return self.copy()\n\n if self.freq is None:\n raise NullFrequencyError(\"Cannot shift with no freq\")\n\n start = self[0] + periods * self.freq\n end = self[-1] + periods * self.freq\n\n # Note: in the DatetimeTZ case, _generate_range will infer the\n # appropriate timezone from `start` and `end`, so tz does not need\n # to be passed explicitly.\n return self._generate_range(start=start, end=end, periods=None,\n freq=self.freq)", "response": "Shifts each value of the array by periods."} {"instruction": "Explain what the following Python 3 code does\ndef _ensure_localized(self, arg, ambiguous='raise', nonexistent='raise',\n from_utc=False):\n \"\"\"\n Ensure that we are re-localized.\n\n This is for compat as we can then call this on all datetimelike\n arrays generally (ignored for 
Period/Timedelta)\n\n Parameters\n ----------\n arg : Union[DatetimeLikeArray, DatetimeIndexOpsMixin, ndarray]\n ambiguous : str, bool, or bool-ndarray, default 'raise'\n nonexistent : str, default 'raise'\n from_utc : bool, default False\n If True, localize the i8 ndarray to UTC first before converting to\n the appropriate tz. If False, localize directly to the tz.\n\n Returns\n -------\n localized array\n \"\"\"\n\n # reconvert to local tz\n tz = getattr(self, 'tz', None)\n if tz is not None:\n if not isinstance(arg, type(self)):\n arg = self._simple_new(arg)\n if from_utc:\n arg = arg.tz_localize('UTC').tz_convert(self.tz)\n else:\n arg = arg.tz_localize(\n self.tz, ambiguous=ambiguous, nonexistent=nonexistent\n )\n return arg", "response": "Ensures that the array is localized to the appropriate tz."} {"instruction": "How would you explain what the following Python 3 function does\ndef min(self, axis=None, skipna=True, *args, **kwargs):\n nv.validate_min(args, kwargs)\n nv.validate_minmax_axis(axis)\n\n result = nanops.nanmin(self.asi8, skipna=skipna, mask=self.isna())\n if isna(result):\n # Period._from_ordinal does not handle np.nan gracefully\n return NaT\n return self._box_func(result)", "response": "Return the minimum value of the array or minimum along an axis."} {"instruction": "Write a Python 3 script for\nreturning the maximum value of the Array or maximum along an axis.", "response": "def max(self, axis=None, skipna=True, *args, **kwargs):\n \"\"\"\n Return the maximum value of the Array or maximum along\n an axis.\n\n See Also\n --------\n numpy.ndarray.max\n Index.max : Return the maximum value in an Index.\n Series.max : Return the maximum value in a Series.\n \"\"\"\n # TODO: skipna is broken with max.\n # See https://github.com/pandas-dev/pandas/issues/24265\n nv.validate_max(args, kwargs)\n nv.validate_minmax_axis(axis)\n\n mask = self.isna()\n if skipna:\n values = self[~mask].asi8\n elif mask.any():\n return NaT\n else:\n values = self.asi8\n\n 
if not len(values):\n # short-circut for empty max / min\n return NaT\n\n result = nanops.nanmax(values, skipna=skipna)\n # Don't have to worry about NA `result`, since no NA went in.\n return self._box_func(result)"} {"instruction": "Make a summary of the following Python 3 code\ndef _period_array_cmp(cls, op):\n opname = '__{name}__'.format(name=op.__name__)\n nat_result = opname == '__ne__'\n\n def wrapper(self, other):\n op = getattr(self.asi8, opname)\n\n if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):\n return NotImplemented\n\n if is_list_like(other) and len(other) != len(self):\n raise ValueError(\"Lengths must match\")\n\n if isinstance(other, Period):\n self._check_compatible_with(other)\n\n result = op(other.ordinal)\n elif isinstance(other, cls):\n self._check_compatible_with(other)\n\n result = op(other.asi8)\n\n mask = self._isnan | other._isnan\n if mask.any():\n result[mask] = nat_result\n\n return result\n elif other is NaT:\n result = np.empty(len(self.asi8), dtype=bool)\n result.fill(nat_result)\n else:\n other = Period(other, freq=self.freq)\n result = op(other.ordinal)\n\n if self._hasnans:\n result[self._isnan] = nat_result\n\n return result\n\n return compat.set_function_name(wrapper, opname, cls)", "response": "Wrap comparison operations to convert Period - like to PeriodDtype\n "} {"instruction": "How would you explain what the following Python 3 function does\ndef _raise_on_incompatible(left, right):\n # GH#24283 error message format depends on whether right is scalar\n if isinstance(right, np.ndarray):\n other_freq = None\n elif isinstance(right, (ABCPeriodIndex, PeriodArray, Period, DateOffset)):\n other_freq = right.freqstr\n else:\n other_freq = _delta_to_tick(Timedelta(right)).freqstr\n\n msg = DIFFERENT_FREQ.format(cls=type(left).__name__,\n own_freq=left.freqstr,\n other_freq=other_freq)\n raise IncompatibleFrequency(msg)", "response": "Helper function to render a consistent error message when raising 
IncompatibleFrequency."} {"instruction": "Create a Python 3 function to\nconstruct a new PeriodArray from a sequence of Period scalars.", "response": "def period_array(\n data: Sequence[Optional[Period]],\n freq: Optional[Tick] = None,\n copy: bool = False,\n) -> PeriodArray:\n \"\"\"\n Construct a new PeriodArray from a sequence of Period scalars.\n\n Parameters\n ----------\n data : Sequence of Period objects\n A sequence of Period objects. These are required to all have\n the same ``freq.`` Missing values can be indicated by ``None``\n or ``pandas.NaT``.\n freq : str, Tick, or Offset\n The frequency of every element of the array. This can be specified\n to avoid inferring the `freq` from `data`.\n copy : bool, default False\n Whether to ensure a copy of the data is made.\n\n Returns\n -------\n PeriodArray\n\n See Also\n --------\n PeriodArray\n pandas.PeriodIndex\n\n Examples\n --------\n >>> period_array([pd.Period('2017', freq='A'),\n ... pd.Period('2018', freq='A')])\n \n ['2017', '2018']\n Length: 2, dtype: period[A-DEC]\n\n >>> period_array([pd.Period('2017', freq='A'),\n ... pd.Period('2018', freq='A'),\n ... 
pd.NaT])\n \n ['2017', '2018', 'NaT']\n Length: 3, dtype: period[A-DEC]\n\n Integers that look like years are handled\n\n >>> period_array([2000, 2001, 2002], freq='D')\n ['2000-01-01', '2001-01-01', '2002-01-01']\n Length: 3, dtype: period[D]\n\n Datetime-like strings may also be passed\n\n >>> period_array(['2000-Q1', '2000-Q2', '2000-Q3', '2000-Q4'], freq='Q')\n \n ['2000Q1', '2000Q2', '2000Q3', '2000Q4']\n Length: 4, dtype: period[Q-DEC]\n \"\"\"\n if is_datetime64_dtype(data):\n return PeriodArray._from_datetime64(data, freq)\n if isinstance(data, (ABCPeriodIndex, ABCSeries, PeriodArray)):\n return PeriodArray(data, freq)\n\n # other iterable of some kind\n if not isinstance(data, (np.ndarray, list, tuple)):\n data = list(data)\n\n data = np.asarray(data)\n\n if freq:\n dtype = PeriodDtype(freq)\n else:\n dtype = None\n\n if is_float_dtype(data) and len(data) > 0:\n raise TypeError(\"PeriodIndex does not allow \"\n \"floating point in construction\")\n\n data = ensure_object(data)\n\n return PeriodArray._from_sequence(data, dtype=dtype)"} {"instruction": "How would you explain what the following Python 3 function does\ndef validate_dtype_freq(dtype, freq):\n if freq is not None:\n freq = frequencies.to_offset(freq)\n\n if dtype is not None:\n dtype = pandas_dtype(dtype)\n if not is_period_dtype(dtype):\n raise ValueError('dtype must be PeriodDtype')\n if freq is None:\n freq = dtype.freq\n elif freq != dtype.freq:\n raise IncompatibleFrequency('specified freq and dtype '\n 'are different')\n return freq", "response": "Validate that the specified dtype and frequency are valid."} {"instruction": "Make a summary of the following Python 3 code\ndef dt64arr_to_periodarr(data, freq, tz=None):\n if data.dtype != np.dtype('M8[ns]'):\n raise ValueError('Wrong dtype: {dtype}'.format(dtype=data.dtype))\n\n if freq is None:\n if isinstance(data, ABCIndexClass):\n data, freq = data._values, data.freq\n elif isinstance(data, ABCSeries):\n data, freq = data._values, 
data.dt.freq\n\n freq = Period._maybe_convert_freq(freq)\n\n if isinstance(data, (ABCIndexClass, ABCSeries)):\n data = data._values\n\n base, mult = libfrequencies.get_freq_code(freq)\n return libperiod.dt64arr_to_periodarr(data.view('i8'), base, tz), freq", "response": "Convert an array of datetime - like objects to Period arrays."} {"instruction": "Write a Python 3 script for\nconstructing a PeriodArray from a datetime64 array.", "response": "def _from_datetime64(cls, data, freq, tz=None):\n \"\"\"\n Construct a PeriodArray from a datetime64 array\n\n Parameters\n ----------\n data : ndarray[datetime64[ns], datetime64[ns, tz]]\n freq : str or Tick\n tz : tzinfo, optional\n\n Returns\n -------\n PeriodArray[freq]\n \"\"\"\n data, freq = dt64arr_to_periodarr(data, freq, tz)\n return cls(data, freq=freq)"} {"instruction": "Explain what the following Python 3 code does\ndef to_timestamp(self, freq=None, how='start'):\n from pandas.core.arrays import DatetimeArray\n\n how = libperiod._validate_end_alias(how)\n\n end = how == 'E'\n if end:\n if freq == 'B':\n # roll forward to ensure we land on B date\n adjust = Timedelta(1, 'D') - Timedelta(1, 'ns')\n return self.to_timestamp(how='start') + adjust\n else:\n adjust = Timedelta(1, 'ns')\n return (self + self.freq).to_timestamp(how='start') - adjust\n\n if freq is None:\n base, mult = libfrequencies.get_freq_code(self.freq)\n freq = libfrequencies.get_to_timestamp_base(base)\n else:\n freq = Period._maybe_convert_freq(freq)\n\n base, mult = libfrequencies.get_freq_code(freq)\n new_data = self.asfreq(freq, how=how)\n\n new_data = libperiod.periodarr_to_dt64arr(new_data.asi8, base)\n return DatetimeArray._from_sequence(new_data, freq='infer')", "response": "Cast to datetime. 
datetime64"} {"instruction": "How would you implement a function in Python 3 that\nshifts each value by periods.", "response": "def _time_shift(self, periods, freq=None):\n \"\"\"\n Shift each value by `periods`.\n\n Note this is different from ExtensionArray.shift, which\n shifts the *position* of each element, padding the end with\n missing values.\n\n Parameters\n ----------\n periods : int\n Number of periods to shift by.\n freq : pandas.DateOffset, pandas.Timedelta, or string\n Frequency increment to shift by.\n \"\"\"\n if freq is not None:\n raise TypeError(\"`freq` argument is not supported for \"\n \"{cls}._time_shift\"\n .format(cls=type(self).__name__))\n values = self.asi8 + periods * self.freq.n\n if self._hasnans:\n values[self._isnan] = iNaT\n return type(self)(values, freq=self.freq)"} {"instruction": "How would you implement a function in Python 3 that\nconverts the Period Array/Index to the specified frequency.", "response": "def asfreq(self, freq=None, how='E'):\n \"\"\"\n Convert the Period Array/Index to the specified frequency `freq`.\n\n Parameters\n ----------\n freq : str\n a frequency\n how : str {'E', 'S'}\n 'E', 'END', or 'FINISH' for end,\n 'S', 'START', or 'BEGIN' for start.\n Whether the elements should be aligned to the end\n or start within pa period. 
January 31st ('END') vs.\n January 1st ('START') for example.\n\n Returns\n -------\n new : Period Array/Index with the new frequency\n\n Examples\n --------\n >>> pidx = pd.period_range('2010-01-01', '2015-01-01', freq='A')\n >>> pidx\n PeriodIndex(['2010', '2011', '2012', '2013', '2014', '2015'],\n dtype='period[A-DEC]', freq='A-DEC')\n\n >>> pidx.asfreq('M')\n PeriodIndex(['2010-12', '2011-12', '2012-12', '2013-12', '2014-12',\n '2015-12'], dtype='period[M]', freq='M')\n\n >>> pidx.asfreq('M', how='S')\n PeriodIndex(['2010-01', '2011-01', '2012-01', '2013-01', '2014-01',\n '2015-01'], dtype='period[M]', freq='M')\n \"\"\"\n how = libperiod._validate_end_alias(how)\n\n freq = Period._maybe_convert_freq(freq)\n\n base1, mult1 = libfrequencies.get_freq_code(self.freq)\n base2, mult2 = libfrequencies.get_freq_code(freq)\n\n asi8 = self.asi8\n # mult1 can't be negative or 0\n end = how == 'E'\n if end:\n ordinal = asi8 + mult1 - 1\n else:\n ordinal = asi8\n\n new_data = period_asfreq_arr(ordinal, base1, base2, end)\n\n if self._hasnans:\n new_data[self._isnan] = iNaT\n\n return type(self)(new_data, freq=freq)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _format_native_types(self, na_rep='NaT', date_format=None, **kwargs):\n values = self.astype(object)\n\n if date_format:\n formatter = lambda dt: dt.strftime(date_format)\n else:\n formatter = lambda dt: '%s' % dt\n\n if self._hasnans:\n mask = self._isnan\n values[mask] = na_rep\n imask = ~mask\n values[imask] = np.array([formatter(dt) for dt\n in values[imask]])\n else:\n values = np.array([formatter(dt) for dt in values])\n return values", "response": "format the native types of the object"} {"instruction": "How would you explain what the following Python 3 function does\ndef _add_timedeltalike_scalar(self, other):\n assert isinstance(self.freq, Tick) # checked by calling function\n assert isinstance(other, (timedelta, np.timedelta64, Tick))\n\n if notna(other):\n # special 
handling for np.timedelta64(\"NaT\"), avoid calling\n # _check_timedeltalike_freq_compat as that would raise TypeError\n other = self._check_timedeltalike_freq_compat(other)\n\n # Note: when calling parent class's _add_timedeltalike_scalar,\n # it will call delta_to_nanoseconds(delta). Because delta here\n # is an integer, delta_to_nanoseconds will return it unchanged.\n ordinals = super()._add_timedeltalike_scalar(other)\n return ordinals", "response": "Add a Timedeltalike scalar to the internal array."} {"instruction": "Create a Python 3 function to\nadd the delta of self to other.", "response": "def _add_delta_tdi(self, other):\n \"\"\"\n Parameters\n ----------\n other : TimedeltaArray or ndarray[timedelta64]\n\n Returns\n -------\n result : ndarray[int64]\n \"\"\"\n assert isinstance(self.freq, Tick) # checked by calling function\n\n delta = self._check_timedeltalike_freq_compat(other)\n return self._addsub_int_array(delta, operator.add).asi8"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _add_delta(self, other):\n if not isinstance(self.freq, Tick):\n # We cannot add timedelta-like to non-tick PeriodArray\n _raise_on_incompatible(self, other)\n\n new_ordinals = super()._add_delta(other)\n return type(self)(new_ordinals, freq=self.freq)", "response": "Add a timedelta - like Tick or TimedeltaIndex - like object\n to self yielding a new PeriodArray\n "} {"instruction": "How would you explain what the following Python 3 function does\ndef _check_timedeltalike_freq_compat(self, other):\n assert isinstance(self.freq, Tick) # checked by calling function\n own_offset = frequencies.to_offset(self.freq.rule_code)\n base_nanos = delta_to_nanoseconds(own_offset)\n\n if isinstance(other, (timedelta, np.timedelta64, Tick)):\n nanos = delta_to_nanoseconds(other)\n\n elif isinstance(other, np.ndarray):\n # numpy timedelta64 array; all entries must be compatible\n assert other.dtype.kind == 'm'\n if other.dtype != _TD_DTYPE:\n # 
i.e. non-nano unit\n # TODO: disallow unit-less timedelta64\n other = other.astype(_TD_DTYPE)\n nanos = other.view('i8')\n else:\n # TimedeltaArray/Index\n nanos = other.asi8\n\n if np.all(nanos % base_nanos == 0):\n # nanos being added is an integer multiple of the\n # base-frequency to self.freq\n delta = nanos // base_nanos\n # delta is the integer (or integer-array) number of periods\n # by which will be added to self.\n return delta\n\n _raise_on_incompatible(self, other)", "response": "Internal method to check that self. freq is compatible with other."} {"instruction": "Can you implement a function in Python 3 that\ndetects missing values. Treat NaN INF as null.", "response": "def _isna_old(obj):\n \"\"\"Detect missing values. Treat None, NaN, INF, -INF as null.\n\n Parameters\n ----------\n arr: ndarray or object value\n\n Returns\n -------\n boolean ndarray or boolean\n \"\"\"\n if is_scalar(obj):\n return libmissing.checknull_old(obj)\n # hack (for now) because MI registers as ndarray\n elif isinstance(obj, ABCMultiIndex):\n raise NotImplementedError(\"isna is not defined for MultiIndex\")\n elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass)):\n return _isna_ndarraylike_old(obj)\n elif isinstance(obj, ABCGeneric):\n return obj._constructor(obj._data.isna(func=_isna_old))\n elif isinstance(obj, list):\n return _isna_ndarraylike_old(np.asarray(obj, dtype=object))\n elif hasattr(obj, '__array__'):\n return _isna_ndarraylike_old(np.asarray(obj))\n else:\n return obj is None"} {"instruction": "Write a Python 3 script to\noption change callback for na / inf behaviour", "response": "def _use_inf_as_na(key):\n \"\"\"Option change callback for na/inf behaviour\n Choose which replacement for numpy.isnan / -numpy.isfinite is used.\n\n Parameters\n ----------\n flag: bool\n True means treat None, NaN, INF, -INF as null (old way),\n False means None and NaN are null, but INF, -INF are not null\n (new way).\n\n Notes\n -----\n This approach to setting global 
module values is discussed and\n approved here:\n\n * http://stackoverflow.com/questions/4859217/\n programmatically-creating-variables-in-python/4859312#4859312\n \"\"\"\n from pandas._config import get_option\n flag = get_option(key)\n if flag:\n globals()['_isna'] = _isna_old\n else:\n globals()['_isna'] = _isna_new"} {"instruction": "Create a Python 3 function for\nchecking if array is NaN - compatible", "response": "def _isna_compat(arr, fill_value=np.nan):\n \"\"\"\n Parameters\n ----------\n arr: a numpy array\n fill_value: fill value, default to np.nan\n\n Returns\n -------\n True if we can fill using this fill_value\n \"\"\"\n dtype = arr.dtype\n if isna(fill_value):\n return not (is_bool_dtype(dtype) or\n is_integer_dtype(dtype))\n return True"} {"instruction": "Write a Python 3 function for\ntruing if two arrays left and right are equivalent.", "response": "def array_equivalent(left, right, strict_nan=False):\n \"\"\"\n True if two arrays, left and right, have equal non-NaN elements, and NaNs\n in corresponding locations. False otherwise. It is assumed that left and\n right are NumPy arrays of the same dtype. The behavior of this function\n (particularly with respect to NaNs) is not defined if the dtypes are\n different.\n\n Parameters\n ----------\n left, right : ndarrays\n strict_nan : bool, default False\n If True, consider NaN and None to be different.\n\n Returns\n -------\n b : bool\n Returns True if the arrays are equivalent.\n\n Examples\n --------\n >>> array_equivalent(\n ... np.array([1, 2, np.nan]),\n ... np.array([1, 2, np.nan]))\n True\n >>> array_equivalent(\n ... np.array([1, np.nan, 2]),\n ... 
np.array([1, 2, np.nan]))\n False\n \"\"\"\n\n left, right = np.asarray(left), np.asarray(right)\n\n # shape compat\n if left.shape != right.shape:\n return False\n\n # Object arrays can contain None, NaN and NaT.\n # string dtypes must be come to this path for NumPy 1.7.1 compat\n if is_string_dtype(left) or is_string_dtype(right):\n\n if not strict_nan:\n # isna considers NaN and None to be equivalent.\n return lib.array_equivalent_object(\n ensure_object(left.ravel()), ensure_object(right.ravel()))\n\n for left_value, right_value in zip(left, right):\n if left_value is NaT and right_value is not NaT:\n return False\n\n elif isinstance(left_value, float) and np.isnan(left_value):\n if (not isinstance(right_value, float) or\n not np.isnan(right_value)):\n return False\n else:\n if left_value != right_value:\n return False\n return True\n\n # NaNs can occur in float and complex arrays.\n if is_float_dtype(left) or is_complex_dtype(left):\n\n # empty\n if not (np.prod(left.shape) and np.prod(right.shape)):\n return True\n return ((left == right) | (isna(left) & isna(right))).all()\n\n # numpy will will not allow this type of datetimelike vs integer comparison\n elif is_datetimelike_v_numeric(left, right):\n return False\n\n # M8/m8\n elif needs_i8_conversion(left) and needs_i8_conversion(right):\n if not is_dtype_equal(left.dtype, right.dtype):\n return False\n\n left = left.view('i8')\n right = right.view('i8')\n\n # if we have structured dtypes, compare first\n if (left.dtype.type is np.void or\n right.dtype.type is np.void):\n if left.dtype != right.dtype:\n return False\n\n return np.array_equal(left, right)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _infer_fill_value(val):\n\n if not is_list_like(val):\n val = [val]\n val = np.array(val, copy=False)\n if is_datetimelike(val):\n return np.array('NaT', dtype=val.dtype)\n elif is_object_dtype(val.dtype):\n dtype = lib.infer_dtype(ensure_object(val), skipna=False)\n if 
dtype in ['datetime', 'datetime64']:\n return np.array('NaT', dtype=_NS_DTYPE)\n elif dtype in ['timedelta', 'timedelta64']:\n return np.array('NaT', dtype=_TD_DTYPE)\n return np.nan", "response": "Infer the fill value for the nan or NaT from the provided scalar or list - like."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _maybe_fill(arr, fill_value=np.nan):\n if _isna_compat(arr, fill_value):\n arr.fill(fill_value)\n return arr", "response": "Fill the array with fill_value if arr is not a compatible fill_value."} {"instruction": "Write a Python 3 function for\nreturning a dtype compatible na value for the base object.", "response": "def na_value_for_dtype(dtype, compat=True):\n \"\"\"\n Return a dtype compat na value\n\n Parameters\n ----------\n dtype : string / dtype\n compat : boolean, default True\n\n Returns\n -------\n np.dtype or a pandas dtype\n\n Examples\n --------\n >>> na_value_for_dtype(np.dtype('int64'))\n 0\n >>> na_value_for_dtype(np.dtype('int64'), compat=False)\n nan\n >>> na_value_for_dtype(np.dtype('float64'))\n nan\n >>> na_value_for_dtype(np.dtype('bool'))\n False\n >>> na_value_for_dtype(np.dtype('datetime64[ns]'))\n NaT\n \"\"\"\n dtype = pandas_dtype(dtype)\n\n if is_extension_array_dtype(dtype):\n return dtype.na_value\n if (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype) or\n is_timedelta64_dtype(dtype) or is_period_dtype(dtype)):\n return NaT\n elif is_float_dtype(dtype):\n return np.nan\n elif is_integer_dtype(dtype):\n if compat:\n return 0\n return np.nan\n elif is_bool_dtype(dtype):\n return False\n return np.nan"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef remove_na_arraylike(arr):\n if is_extension_array_dtype(arr):\n return arr[notna(arr)]\n else:\n return arr[notna(lib.values_from_object(arr))]", "response": "Return array - like containing only true / non - NaN values possibly empty."} {"instruction": "Create a Python 3 function 
for\ncreating a set of subplots and return a new figure object.", "response": "def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True,\n subplot_kw=None, ax=None, layout=None, layout_type='box',\n **fig_kw):\n \"\"\"Create a figure with a set of subplots already made.\n\n This utility wrapper makes it convenient to create common layouts of\n subplots, including the enclosing figure object, in a single call.\n\n Keyword arguments:\n\n naxes : int\n Number of required axes. Exceeded axes are set invisible. Default is\n nrows * ncols.\n\n sharex : bool\n If True, the X axis will be shared amongst all subplots.\n\n sharey : bool\n If True, the Y axis will be shared amongst all subplots.\n\n squeeze : bool\n\n If True, extra dimensions are squeezed out from the returned axis object:\n - if only one subplot is constructed (nrows=ncols=1), the resulting\n single Axis object is returned as a scalar.\n - for Nx1 or 1xN subplots, the returned object is a 1-d numpy object\n array of Axis objects are returned as numpy 1-d arrays.\n - for NxM subplots with N>1 and M>1 are returned as a 2d array.\n\n If False, no squeezing is done: the returned axis object is always\n a 2-d array containing Axis instances, even if it ends up being 1x1.\n\n subplot_kw : dict\n Dict with keywords passed to the add_subplot() call used to create each\n subplots.\n\n ax : Matplotlib axis object, optional\n\n layout : tuple\n Number of rows and columns of the subplot grid.\n If not specified, calculated from naxes and layout_type\n\n layout_type : {'box', 'horziontal', 'vertical'}, default 'box'\n Specify how to layout the subplot grid.\n\n fig_kw : Other keyword arguments to be passed to the figure() call.\n Note that all keywords not recognized above will be\n automatically included here.\n\n Returns:\n\n fig, ax : tuple\n - fig is the Matplotlib Figure object\n - ax can be either a single axis object or an array of axis objects if\n more than one subplot was created. 
The dimensions of the resulting array\n can be controlled with the squeeze keyword, see above.\n\n **Examples:**\n\n x = np.linspace(0, 2*np.pi, 400)\n y = np.sin(x**2)\n\n # Just a figure and one subplot\n f, ax = plt.subplots()\n ax.plot(x, y)\n ax.set_title('Simple plot')\n\n # Two subplots, unpack the output array immediately\n f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)\n ax1.plot(x, y)\n ax1.set_title('Sharing Y axis')\n ax2.scatter(x, y)\n\n # Four polar axes\n plt.subplots(2, 2, subplot_kw=dict(polar=True))\n \"\"\"\n import matplotlib.pyplot as plt\n\n if subplot_kw is None:\n subplot_kw = {}\n\n if ax is None:\n fig = plt.figure(**fig_kw)\n else:\n if is_list_like(ax):\n ax = _flatten(ax)\n if layout is not None:\n warnings.warn(\"When passing multiple axes, layout keyword is \"\n \"ignored\", UserWarning)\n if sharex or sharey:\n warnings.warn(\"When passing multiple axes, sharex and sharey \"\n \"are ignored. These settings must be specified \"\n \"when creating axes\", UserWarning,\n stacklevel=4)\n if len(ax) == naxes:\n fig = ax[0].get_figure()\n return fig, ax\n else:\n raise ValueError(\"The number of passed axes must be {0}, the \"\n \"same as the output plot\".format(naxes))\n\n fig = ax.get_figure()\n # if ax is passed and a number of subplots is 1, return ax as it is\n if naxes == 1:\n if squeeze:\n return fig, ax\n else:\n return fig, _flatten(ax)\n else:\n warnings.warn(\"To output multiple subplots, the figure containing \"\n \"the passed axes is being cleared\", UserWarning,\n stacklevel=4)\n fig.clear()\n\n nrows, ncols = _get_layout(naxes, layout=layout, layout_type=layout_type)\n nplots = nrows * ncols\n\n # Create empty object array to hold all axes. 
It's easiest to make it 1-d\n # so we can just append subplots upon creation, and then\n axarr = np.empty(nplots, dtype=object)\n\n # Create first subplot separately, so we can share it if requested\n ax0 = fig.add_subplot(nrows, ncols, 1, **subplot_kw)\n\n if sharex:\n subplot_kw['sharex'] = ax0\n if sharey:\n subplot_kw['sharey'] = ax0\n axarr[0] = ax0\n\n # Note off-by-one counting because add_subplot uses the MATLAB 1-based\n # convention.\n for i in range(1, nplots):\n kwds = subplot_kw.copy()\n # Set sharex and sharey to None for blank/dummy axes, these can\n # interfere with proper axis limits on the visible axes if\n # they share axes e.g. issue #7528\n if i >= naxes:\n kwds['sharex'] = None\n kwds['sharey'] = None\n ax = fig.add_subplot(nrows, ncols, i + 1, **kwds)\n axarr[i] = ax\n\n if naxes != nplots:\n for ax in axarr[naxes:]:\n ax.set_visible(False)\n\n _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey)\n\n if squeeze:\n # Reshape the array to have the final desired dimension (nrow,ncol),\n # though discarding unneeded dimensions that equal 1. 
If we only have\n # one subplot, just return it instead of a 1-element array.\n if nplots == 1:\n axes = axarr[0]\n else:\n axes = axarr.reshape(nrows, ncols).squeeze()\n else:\n # returned axis array will be always 2-d, even if nrows=ncols=1\n axes = axarr.reshape(nrows, ncols)\n\n return fig, axes"} {"instruction": "Write a Python 3 script to\nrender tempita templates before calling cythonize", "response": "def maybe_cythonize(extensions, *args, **kwargs):\n \"\"\"\n Render tempita templates before calling cythonize\n \"\"\"\n if len(sys.argv) > 1 and 'clean' in sys.argv:\n # Avoid running cythonize on `python setup.py clean`\n # See https://github.com/cython/cython/issues/1495\n return extensions\n if not cython:\n # Avoid trying to look up numpy when installing from sdist\n # https://github.com/pandas-dev/pandas/issues/25193\n # TODO: See if this can be removed after pyproject.toml added.\n return extensions\n\n numpy_incl = pkg_resources.resource_filename('numpy', 'core/include')\n # TODO: Is this really necessary here?\n for ext in extensions:\n if (hasattr(ext, 'include_dirs') and\n numpy_incl not in ext.include_dirs):\n ext.include_dirs.append(numpy_incl)\n\n build_ext.render_templates(_pxifiles)\n return cythonize(extensions, *args, **kwargs)"} {"instruction": "Write a Python 3 function that can\nfast transform path for aggregations", "response": "def _transform_fast(self, result, obj, func_nm):\n \"\"\"\n Fast transform path for aggregations\n \"\"\"\n # if there were groups with no observations (Categorical only?)\n # try casting data to original dtype\n cast = self._transform_should_cast(func_nm)\n\n # for each col, reshape to to size of original frame\n # by take operation\n ids, _, ngroup = self.grouper.group_info\n output = []\n for i, _ in enumerate(result.columns):\n res = algorithms.take_1d(result.iloc[:, i].values, ids)\n if cast:\n res = self._try_cast(res, obj.iloc[:, i])\n output.append(res)\n\n return DataFrame._from_arrays(output, 
columns=result.columns,\n index=obj.index)"} {"instruction": "Write a Python 3 script for\nreturning a copy of the DataFrame excluding elements from groups that do not satisfy the boolean criterion specified by func.", "response": "def filter(self, func, dropna=True, *args, **kwargs): # noqa\n \"\"\"\n Return a copy of a DataFrame excluding elements from groups that\n do not satisfy the boolean criterion specified by func.\n\n Parameters\n ----------\n f : function\n Function to apply to each subframe. Should return True or False.\n dropna : Drop groups that do not pass the filter. True by default;\n if False, groups that evaluate False are filled with NaNs.\n\n Returns\n -------\n filtered : DataFrame\n\n Notes\n -----\n Each subframe is endowed the attribute 'name' in case you need to know\n which group you are working on.\n\n Examples\n --------\n >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',\n ... 'foo', 'bar'],\n ... 'B' : [1, 2, 3, 4, 5, 6],\n ... 'C' : [2.0, 5., 8., 1., 2., 9.]})\n >>> grouped = df.groupby('A')\n >>> grouped.filter(lambda x: x['B'].mean() > 3.)\n A B C\n 1 bar 2 5.0\n 3 bar 4 1.0\n 5 bar 6 9.0\n \"\"\"\n\n indices = []\n\n obj = self._selected_obj\n gen = self.grouper.get_iterator(obj, axis=self.axis)\n\n for name, group in gen:\n object.__setattr__(group, 'name', name)\n\n res = func(group, *args, **kwargs)\n\n try:\n res = res.squeeze()\n except AttributeError: # allow e.g., scalars and frames to pass\n pass\n\n # interpret the result of the filter\n if is_bool(res) or (is_scalar(res) and isna(res)):\n if res and notna(res):\n indices.append(self._get_index(name))\n else:\n # non scalars aren't allowed\n raise TypeError(\"filter function returned a %s, \"\n \"but expected a scalar bool\" %\n type(res).__name__)\n\n return self._apply_filter(indices, dropna)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _wrap_output(self, output, index, names=None):\n output = output[self._selection_name]\n\n 
if names is not None:\n return DataFrame(output, index=index, columns=names)\n else:\n name = self._selection_name\n if name is None:\n name = self._selected_obj.name\n return Series(output, index=index, name=name)", "response": "common agg / transform wrapping logic"} {"instruction": "Can you implement a function in Python 3 that\nfasts version of transform only applicable to builtin / cythonizable functions", "response": "def _transform_fast(self, func, func_nm):\n \"\"\"\n fast version of transform, only applicable to\n builtin/cythonizable functions\n \"\"\"\n if isinstance(func, str):\n func = getattr(self, func)\n\n ids, _, ngroup = self.grouper.group_info\n cast = self._transform_should_cast(func_nm)\n out = algorithms.take_1d(func()._values, ids)\n if cast:\n out = self._try_cast(out, self.obj)\n return Series(out, index=self.obj.index, name=self.obj.name)"} {"instruction": "Explain what the following Python 3 code does\ndef filter(self, func, dropna=True, *args, **kwargs): # noqa\n if isinstance(func, str):\n wrapper = lambda x: getattr(x, func)(*args, **kwargs)\n else:\n wrapper = lambda x: func(x, *args, **kwargs)\n\n # Interpret np.nan as False.\n def true_and_notna(x, *args, **kwargs):\n b = wrapper(x, *args, **kwargs)\n return b and notna(b)\n\n try:\n indices = [self._get_index(name) for name, group in self\n if true_and_notna(group)]\n except ValueError:\n raise TypeError(\"the filter must return a boolean result\")\n except TypeError:\n raise TypeError(\"the filter must return a boolean result\")\n\n filtered = self._apply_filter(indices, dropna)\n return filtered", "response": "Returns a Series containing only the elements from the groups that satisfy the boolean criterion func."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef nunique(self, dropna=True):\n ids, _, _ = self.grouper.group_info\n\n val = self.obj.get_values()\n\n try:\n sorter = np.lexsort((val, ids))\n except TypeError: # catches object dtypes\n msg 
= 'val.dtype must be object, got {}'.format(val.dtype)\n assert val.dtype == object, msg\n val, _ = algorithms.factorize(val, sort=False)\n sorter = np.lexsort((val, ids))\n _isna = lambda a: a == -1\n else:\n _isna = isna\n\n ids, val = ids[sorter], val[sorter]\n\n # group boundaries are where group ids change\n # unique observations are where sorted values change\n idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]\n inc = np.r_[1, val[1:] != val[:-1]]\n\n # 1st item of each group is a new unique observation\n mask = _isna(val)\n if dropna:\n inc[idx] = 1\n inc[mask] = 0\n else:\n inc[mask & np.r_[False, mask[:-1]]] = 0\n inc[idx] = 1\n\n out = np.add.reduceat(inc, idx).astype('int64', copy=False)\n if len(ids):\n # NaN/NaT group exists if the head of ids is -1,\n # so remove it from res and exclude its index from idx\n if ids[0] == -1:\n res = out[1:]\n idx = idx[np.flatnonzero(idx)]\n else:\n res = out\n else:\n res = out[1:]\n ri = self.grouper.result_index\n\n # we might have duplications among the bins\n if len(res) != len(ri):\n res, out = np.zeros(len(ri), dtype=out.dtype), res\n res[ids[idx]] = out\n\n return Series(res,\n index=ri,\n name=self._selection_name)", "response": "Return the number of unique elements in the group."} {"instruction": "Implement a function in Python 3 to\ncompute count of group entries excluding missing values", "response": "def count(self):\n \"\"\" Compute count of group, excluding missing values \"\"\"\n ids, _, ngroups = self.grouper.group_info\n val = self.obj.get_values()\n\n mask = (ids != -1) & ~isna(val)\n ids = ensure_platform_int(ids)\n minlength = ngroups or 0\n out = np.bincount(ids[mask], minlength=minlength)\n\n return Series(out,\n index=self.grouper.result_index,\n name=self._selection_name,\n dtype='int64')"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef pct_change(self, periods=1, fill_method='pad', limit=None, freq=None):\n # TODO: Remove this conditional when 
#23918 is fixed\n if freq:\n return self.apply(lambda x: x.pct_change(periods=periods,\n fill_method=fill_method,\n limit=limit, freq=freq))\n filled = getattr(self, fill_method)(limit=limit)\n fill_grp = filled.groupby(self.grouper.labels)\n shifted = fill_grp.shift(periods=periods, freq=freq)\n\n return (filled / shifted) - 1", "response": "Calcuate pct_change of each value to previous entry in group"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _gotitem(self, key, ndim, subset=None):\n\n if ndim == 2:\n if subset is None:\n subset = self.obj\n return DataFrameGroupBy(subset, self.grouper, selection=key,\n grouper=self.grouper,\n exclusions=self.exclusions,\n as_index=self.as_index,\n observed=self.observed)\n elif ndim == 1:\n if subset is None:\n subset = self.obj[key]\n return SeriesGroupBy(subset, selection=key,\n grouper=self.grouper)\n\n raise AssertionError(\"invalid ndim for _gotitem\")", "response": "Internal function to define the items in the object."} {"instruction": "Make a summary of the following Python 3 code\ndef _reindex_output(self, result):\n\n # we need to re-expand the output space to accomodate all values\n # whether observed or not in the cartesian product of our groupes\n groupings = self.grouper.groupings\n if groupings is None:\n return result\n elif len(groupings) == 1:\n return result\n\n # if we only care about the observed values\n # we are done\n elif self.observed:\n return result\n\n # reindexing only applies to a Categorical grouper\n elif not any(isinstance(ping.grouper, (Categorical, CategoricalIndex))\n for ping in groupings):\n return result\n\n levels_list = [ping.group_index for ping in groupings]\n index, _ = MultiIndex.from_product(\n levels_list, names=self.grouper.names).sortlevel()\n\n if self.as_index:\n d = {self.obj._get_axis_name(self.axis): index, 'copy': False}\n return result.reindex(**d)\n\n # GH 13204\n # Here, the categorical in-axis groupers, which need to be fully\n # 
expanded, are columns in `result`. An idea is to do:\n # result = result.set_index(self.grouper.names)\n # .reindex(index).reset_index()\n # but special care has to be taken because of possible not-in-axis\n # groupers.\n # So, we manually select and drop the in-axis grouper columns,\n # reindex `result`, and then reset the in-axis grouper columns.\n\n # Select in-axis groupers\n in_axis_grps = ((i, ping.name) for (i, ping)\n in enumerate(groupings) if ping.in_axis)\n g_nums, g_names = zip(*in_axis_grps)\n\n result = result.drop(labels=list(g_names), axis=1)\n\n # Set a temp index and reindex (possibly expanding)\n result = result.set_index(self.grouper.result_index\n ).reindex(index, copy=False)\n\n # Reset in-axis grouper columns\n # (using level numbers `g_nums` because level names may not be unique)\n result = result.reset_index(level=g_nums)\n\n return result.reset_index(drop=True)", "response": "Reindex the output of the internal functions."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _fill(self, direction, limit=None):\n res = super()._fill(direction, limit=limit)\n output = OrderedDict(\n (grp.name, grp.grouper) for grp in self.grouper.groupings)\n\n from pandas import concat\n return concat((self._wrap_transformed_output(output), res), axis=1)", "response": "Overridden method to join grouped columns in output"} {"instruction": "How would you implement a function in Python 3 that\ncomputes count of group entries excluding missing values", "response": "def count(self):\n \"\"\" Compute count of group, excluding missing values \"\"\"\n from pandas.core.dtypes.missing import _isna_ndarraylike as _isna\n\n data, _ = self._get_data_to_aggregate()\n ids, _, ngroups = self.grouper.group_info\n mask = ids != -1\n\n val = ((mask & ~_isna(np.atleast_2d(blk.get_values())))\n for blk in data.blocks)\n loc = (blk.mgr_locs for blk in data.blocks)\n\n counter = partial(\n lib.count_level_2d, labels=ids, max_bin=ngroups, 
axis=1)\n blk = map(make_block, map(counter, val), loc)\n\n return self._wrap_agged_blocks(data.items, list(blk))"} {"instruction": "How would you explain what the following Python 3 function does\ndef nunique(self, dropna=True):\n\n obj = self._selected_obj\n\n def groupby_series(obj, col=None):\n return SeriesGroupBy(obj,\n selection=col,\n grouper=self.grouper).nunique(dropna=dropna)\n\n if isinstance(obj, Series):\n results = groupby_series(obj)\n else:\n from pandas.core.reshape.concat import concat\n results = [groupby_series(obj[col], col) for col in obj.columns]\n results = concat(results, axis=1)\n results.columns.names = obj.columns.names\n\n if not self.as_index:\n results.index = ibase.default_index(len(results))\n return results", "response": "Return DataFrame with number of distinct observations per group for the current entry."} {"instruction": "Explain what the following Python 3 code does\ndef extract_array(obj, extract_numpy=False):\n if isinstance(obj, (ABCIndexClass, ABCSeries)):\n obj = obj.array\n\n if extract_numpy and isinstance(obj, ABCPandasArray):\n obj = obj.to_numpy()\n\n return obj", "response": "Extract the ndarray or ExtensionArray from an object."} {"instruction": "Explain what the following Python 3 code does\ndef flatten(l):\n for el in l:\n if _iterable_not_string(el):\n for s in flatten(el):\n yield s\n else:\n yield el", "response": "Flatten an arbitrarily nested sequence."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef is_bool_indexer(key: Any) -> bool:\n na_msg = 'cannot index with vector containing NA / NaN values'\n if (isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or\n (is_array_like(key) and is_extension_array_dtype(key.dtype))):\n if key.dtype == np.object_:\n key = np.asarray(values_from_object(key))\n\n if not lib.is_bool_array(key):\n if isna(key).any():\n raise ValueError(na_msg)\n return False\n return True\n elif is_bool_dtype(key.dtype):\n # an ndarray with bool-dtype by 
definition has no missing values.\n # So we only need to check for NAs in ExtensionArrays\n if is_extension_array_dtype(key.dtype):\n if np.any(key.isna()):\n raise ValueError(na_msg)\n return True\n elif isinstance(key, list):\n try:\n arr = np.asarray(key)\n return arr.dtype == np.bool_ and len(arr) == len(key)\n except TypeError: # pragma: no cover\n return False\n\n return False", "response": "Checks whether the given key is a valid boolean indexer."} {"instruction": "Write a Python 3 script for\ncasting scalar to integer where valid.", "response": "def cast_scalar_indexer(val):\n \"\"\"\n To avoid numpy DeprecationWarnings, cast float to integer where valid.\n\n Parameters\n ----------\n val : scalar\n\n Returns\n -------\n outval : scalar\n \"\"\"\n # assumes lib.is_scalar(val)\n if lib.is_float(val) and val == int(val):\n return int(val)\n return val"} {"instruction": "Given the following Python 3 function, write the documentation\ndef index_labels_to_array(labels, dtype=None):\n if isinstance(labels, (str, tuple)):\n labels = [labels]\n\n if not isinstance(labels, (list, np.ndarray)):\n try:\n labels = list(labels)\n except TypeError: # non-iterable\n labels = [labels]\n\n labels = asarray_tuplesafe(labels, dtype=dtype)\n\n return labels", "response": "Transform label or iterable of labels to array for use in Index."} {"instruction": "Can you create a Python 3 function that\nchecks if a slice is null.", "response": "def is_null_slice(obj):\n \"\"\"\n We have a null slice.\n \"\"\"\n return (isinstance(obj, slice) and obj.start is None and\n obj.stop is None and obj.step is None)"} {"instruction": "Make a summary of the following Python 3 code\ndef is_full_slice(obj, l):\n return (isinstance(obj, slice) and obj.start == 0 and obj.stop == l and\n obj.step is None)", "response": "Check if a slice is full length."} {"instruction": "Given the following Python 3 function, write the documentation\ndef apply_if_callable(maybe_callable, obj, **kwargs):\n\n if 
callable(maybe_callable):\n return maybe_callable(obj, **kwargs)\n\n return maybe_callable", "response": "Evaluate possibly callable input using obj and kwargs and return as it is."} {"instruction": "Implement a function in Python 3 to\nreturn a standardized version of the supplied Mapping.", "response": "def standardize_mapping(into):\n \"\"\"\n Helper function to standardize a supplied mapping.\n\n .. versionadded:: 0.21.0\n\n Parameters\n ----------\n into : instance or subclass of collections.abc.Mapping\n Must be a class, an initialized collections.defaultdict,\n or an instance of a collections.abc.Mapping subclass.\n\n Returns\n -------\n mapping : a collections.abc.Mapping subclass or other constructor\n a callable object that can accept an iterator to create\n the desired Mapping.\n\n See Also\n --------\n DataFrame.to_dict\n Series.to_dict\n \"\"\"\n if not inspect.isclass(into):\n if isinstance(into, collections.defaultdict):\n return partial(\n collections.defaultdict, into.default_factory)\n into = type(into)\n if not issubclass(into, abc.Mapping):\n raise TypeError('unsupported type: {into}'.format(into=into))\n elif into == collections.defaultdict:\n raise TypeError(\n 'to_dict() only accepts initialized defaultdicts')\n return into"} {"instruction": "Make a summary of the following Python 3 code\ndef random_state(state=None):\n\n if is_integer(state):\n return np.random.RandomState(state)\n elif isinstance(state, np.random.RandomState):\n return state\n elif state is None:\n return np.random\n else:\n raise ValueError(\"random_state must be an integer, a numpy \"\n \"RandomState, or None\")", "response": "Returns a random_state object for the current set of random_states."} {"instruction": "Create a Python 3 function to\napply a function ``func`` to object ``obj`` either by passing obj as the first argument to the function or, in the case that the func is a tuple, interpret the first element of the tuple as a function and pass the obj to that 
function as a keyword argument whose key is the value of the second element of the tuple. Parameters ---------- func : callable or tuple of (callable, string) Function to apply to this object or, alternatively, a ``(callable, data_keyword)`` tuple where ``data_keyword`` is a string indicating the keyword of `callable`` that expects the object. args : iterable, optional positional arguments passed into ``func``. kwargs : dict, optional a dictionary of keyword arguments passed into ``func``. Returns ------- object : the return type of ``func``.", "response": "def _pipe(obj, func, *args, **kwargs):\n \"\"\"\n Apply a function ``func`` to object ``obj`` either by passing obj as the\n first argument to the function or, in the case that the func is a tuple,\n interpret the first element of the tuple as a function and pass the obj to\n that function as a keyword argument whose key is the value of the second\n element of the tuple.\n\n Parameters\n ----------\n func : callable or tuple of (callable, string)\n Function to apply to this object or, alternatively, a\n ``(callable, data_keyword)`` tuple where ``data_keyword`` is a\n string indicating the keyword of `callable`` that expects the\n object.\n args : iterable, optional\n positional arguments passed into ``func``.\n kwargs : dict, optional\n a dictionary of keyword arguments passed into ``func``.\n\n Returns\n -------\n object : the return type of ``func``.\n \"\"\"\n if isinstance(func, tuple):\n func, target = func\n if target in kwargs:\n msg = '%s is both the pipe target and a keyword argument' % target\n raise ValueError(msg)\n kwargs[target] = obj\n return func(*args, **kwargs)\n else:\n return func(obj, *args, **kwargs)"} {"instruction": "Write a Python 3 function that can\nreturn a function that will map names and labels dependent if mapper is a dict or Series.", "response": "def _get_rename_function(mapper):\n \"\"\"\n Returns a function that will map names/labels, dependent if mapper\n is a dict, Series or 
just a function.\n \"\"\"\n if isinstance(mapper, (abc.Mapping, ABCSeries)):\n\n def f(x):\n if x in mapper:\n return mapper[x]\n else:\n return x\n else:\n f = mapper\n\n return f"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _get_fill_value(dtype, fill_value=None, fill_value_typ=None):\n if fill_value is not None:\n return fill_value\n if _na_ok_dtype(dtype):\n if fill_value_typ is None:\n return np.nan\n else:\n if fill_value_typ == '+inf':\n return np.inf\n else:\n return -np.inf\n else:\n if fill_value_typ is None:\n return tslibs.iNaT\n else:\n if fill_value_typ == '+inf':\n # need the max int here\n return _int64_max\n else:\n return tslibs.iNaT", "response": "get the correct fill value for the dtype of the values"} {"instruction": "How would you implement a function in Python 3 that\nwraps our results if needed", "response": "def _wrap_results(result, dtype, fill_value=None):\n \"\"\" wrap our results if needed \"\"\"\n\n if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):\n if fill_value is None:\n # GH#24293\n fill_value = iNaT\n if not isinstance(result, np.ndarray):\n tz = getattr(dtype, 'tz', None)\n assert not isna(fill_value), \"Expected non-null fill_value\"\n if result == fill_value:\n result = np.nan\n result = tslibs.Timestamp(result, tz=tz)\n else:\n result = result.view(dtype)\n elif is_timedelta64_dtype(dtype):\n if not isinstance(result, np.ndarray):\n if result == fill_value:\n result = np.nan\n\n # raise if we have a timedelta64[ns] which is too large\n if np.fabs(result) > _int64_max:\n raise ValueError(\"overflow in timedelta operation\")\n\n result = tslibs.Timedelta(result, unit='ns')\n else:\n result = result.astype('i8').view(dtype)\n\n return result"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _na_for_min_count(values, axis):\n # we either return np.nan or pd.NaT\n if is_numeric_dtype(values):\n values = values.astype('float64')\n 
fill_value = na_value_for_dtype(values.dtype)\n\n if values.ndim == 1:\n return fill_value\n else:\n result_shape = (values.shape[:axis] +\n values.shape[axis + 1:])\n result = np.empty(result_shape, dtype=values.dtype)\n result.fill(fill_value)\n return result", "response": "Return the missing value for values along the specified axis."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef nanany(values, axis=None, skipna=True, mask=None):\n values, mask, dtype, _, _ = _get_values(values, skipna, False, copy=skipna,\n mask=mask)\n return values.any(axis)", "response": "Check if any elements along an axis evaluate to True."} {"instruction": "Create a Python 3 function for\nchecking if all elements along an axis evaluate to True.", "response": "def nanall(values, axis=None, skipna=True, mask=None):\n \"\"\"\n Check if all elements along an axis evaluate to True.\n\n Parameters\n ----------\n values : ndarray\n axis: int, optional\n skipna : bool, default True\n mask : ndarray[bool], optional\n nan-mask if known\n\n Returns\n -------\n result : bool\n\n Examples\n --------\n >>> import pandas.core.nanops as nanops\n >>> s = pd.Series([1, 2, np.nan])\n >>> nanops.nanall(s)\n True\n\n >>> import pandas.core.nanops as nanops\n >>> s = pd.Series([1, 0])\n >>> nanops.nanall(s)\n False\n \"\"\"\n values, mask, dtype, _, _ = _get_values(values, skipna, True, copy=skipna,\n mask=mask)\n return values.all(axis)"} {"instruction": "Explain what the following Python 3 code does\ndef nansum(values, axis=None, skipna=True, min_count=0, mask=None):\n values, mask, dtype, dtype_max, _ = _get_values(values,\n skipna, 0, mask=mask)\n dtype_sum = dtype_max\n if is_float_dtype(dtype):\n dtype_sum = dtype\n elif is_timedelta64_dtype(dtype):\n dtype_sum = np.float64\n the_sum = values.sum(axis, dtype=dtype_sum)\n the_sum = _maybe_null_out(the_sum, axis, mask, min_count=min_count)\n\n return _wrap_results(the_sum, dtype)", "response": "Sum the elements 
along an axis ignoring NaNs"} {"instruction": "Write a Python 3 function for\ncomputing the mean of the element along an axis ignoring NaNs", "response": "def nanmean(values, axis=None, skipna=True, mask=None):\n \"\"\"\n Compute the mean of the element along an axis ignoring NaNs\n\n Parameters\n ----------\n values : ndarray\n axis: int, optional\n skipna : bool, default True\n mask : ndarray[bool], optional\n nan-mask if known\n\n Returns\n -------\n result : float\n Unless input is a float array, in which case use the same\n precision as the input array.\n\n Examples\n --------\n >>> import pandas.core.nanops as nanops\n >>> s = pd.Series([1, 2, np.nan])\n >>> nanops.nanmean(s)\n 1.5\n \"\"\"\n values, mask, dtype, dtype_max, _ = _get_values(\n values, skipna, 0, mask=mask)\n dtype_sum = dtype_max\n dtype_count = np.float64\n if (is_integer_dtype(dtype) or is_timedelta64_dtype(dtype) or\n is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype)):\n dtype_sum = np.float64\n elif is_float_dtype(dtype):\n dtype_sum = dtype\n dtype_count = dtype\n count = _get_counts(mask, axis, dtype=dtype_count)\n the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))\n\n if axis is not None and getattr(the_sum, 'ndim', False):\n with np.errstate(all=\"ignore\"):\n # suppress division by zero warnings\n the_mean = the_sum / count\n ct_mask = count == 0\n if ct_mask.any():\n the_mean[ct_mask] = np.nan\n else:\n the_mean = the_sum / count if count > 0 else np.nan\n\n return _wrap_results(the_mean, dtype)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef nanmedian(values, axis=None, skipna=True, mask=None):\n def get_median(x):\n mask = notna(x)\n if not skipna and not mask.all():\n return np.nan\n return np.nanmedian(x[mask])\n\n values, mask, dtype, dtype_max, _ = _get_values(values, skipna, mask=mask)\n if not is_float_dtype(values):\n values = values.astype('f8')\n values[mask] = np.nan\n\n if axis is None:\n values = values.ravel()\n\n 
notempty = values.size\n\n # an array from a frame\n if values.ndim > 1:\n\n # there's a non-empty array to apply over otherwise numpy raises\n if notempty:\n if not skipna:\n return _wrap_results(\n np.apply_along_axis(get_median, axis, values), dtype)\n\n # fastpath for the skipna case\n return _wrap_results(np.nanmedian(values, axis), dtype)\n\n # must return the correct shape, but median is not defined for the\n # empty set so return nans of shape \"everything but the passed axis\"\n # since \"axis\" is where the reduction would occur if we had a nonempty\n # array\n shp = np.array(values.shape)\n dims = np.arange(values.ndim)\n ret = np.empty(shp[dims != axis])\n ret.fill(np.nan)\n return _wrap_results(ret, dtype)\n\n # otherwise return a scalar value\n return _wrap_results(get_median(values) if notempty else np.nan, dtype)", "response": "nanmedian - like function that takes a 1D array and returns the median of the known ones."} {"instruction": "Write a Python 3 function that can\ncompute the standard deviation along given axis while ignoring NaNs.", "response": "def nanstd(values, axis=None, skipna=True, ddof=1, mask=None):\n \"\"\"\n Compute the standard deviation along given axis while ignoring NaNs\n\n Parameters\n ----------\n values : ndarray\n axis: int, optional\n skipna : bool, default True\n ddof : int, default 1\n Delta Degrees of Freedom. 
The divisor used in calculations is N - ddof,\n where N represents the number of elements.\n mask : ndarray[bool], optional\n nan-mask if known\n\n Returns\n -------\n result : float\n Unless input is a float array, in which case use the same\n precision as the input array.\n\n Examples\n --------\n >>> import pandas.core.nanops as nanops\n >>> s = pd.Series([1, np.nan, 2, 3])\n >>> nanops.nanstd(s)\n 1.0\n \"\"\"\n result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof,\n mask=mask))\n return _wrap_results(result, values.dtype)"} {"instruction": "How would you explain what the following Python 3 function does\ndef nanvar(values, axis=None, skipna=True, ddof=1, mask=None):\n values = com.values_from_object(values)\n dtype = values.dtype\n if mask is None:\n mask = isna(values)\n if is_any_int_dtype(values):\n values = values.astype('f8')\n values[mask] = np.nan\n\n if is_float_dtype(values):\n count, d = _get_counts_nanvar(mask, axis, ddof, values.dtype)\n else:\n count, d = _get_counts_nanvar(mask, axis, ddof)\n\n if skipna:\n values = values.copy()\n np.putmask(values, mask, 0)\n\n # xref GH10242\n # Compute variance via two-pass algorithm, which is stable against\n # cancellation errors and relatively accurate for small numbers of\n # observations.\n #\n # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance\n avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count\n if axis is not None:\n avg = np.expand_dims(avg, axis)\n sqr = _ensure_numeric((avg - values) ** 2)\n np.putmask(sqr, mask, 0)\n result = sqr.sum(axis=axis, dtype=np.float64) / d\n\n # Return variance as np.float64 (the datatype used in the accumulator),\n # unless we were dealing with a float array, in which case use the same\n # precision as the original values array.\n if is_float_dtype(dtype):\n result = result.astype(dtype)\n return _wrap_results(result, values.dtype)", "response": "Compute the variance along given axis ignoring NaNs."} 
{"instruction": "Implement a Python 3 function for\ncomputing the standard error in the mean along given axis ignoring NaNs.", "response": "def nansem(values, axis=None, skipna=True, ddof=1, mask=None):\n \"\"\"\n Compute the standard error in the mean along given axis while ignoring NaNs\n\n Parameters\n ----------\n values : ndarray\n axis: int, optional\n skipna : bool, default True\n ddof : int, default 1\n Delta Degrees of Freedom. The divisor used in calculations is N - ddof,\n where N represents the number of elements.\n mask : ndarray[bool], optional\n nan-mask if known\n\n Returns\n -------\n result : float64\n Unless input is a float array, in which case use the same\n precision as the input array.\n\n Examples\n --------\n >>> import pandas.core.nanops as nanops\n >>> s = pd.Series([1, np.nan, 2, 3])\n >>> nanops.nansem(s)\n 0.5773502691896258\n \"\"\"\n\n # This checks if non-numeric-like data is passed with numeric_only=False\n # and raises a TypeError otherwise\n nanvar(values, axis, skipna, ddof=ddof, mask=mask)\n\n if mask is None:\n mask = isna(values)\n if not is_float_dtype(values.dtype):\n values = values.astype('f8')\n count, _ = _get_counts_nanvar(mask, axis, ddof, values.dtype)\n var = nanvar(values, axis, skipna, ddof=ddof)\n\n return np.sqrt(var) / np.sqrt(count)"} {"instruction": "Create a Python 3 function for\nreturning the index of the maximum value in specified axis.", "response": "def nanargmax(values, axis=None, skipna=True, mask=None):\n \"\"\"\n Parameters\n ----------\n values : ndarray\n axis: int, optional\n skipna : bool, default True\n mask : ndarray[bool], optional\n nan-mask if known\n\n Returns\n --------\n result : int\n The index of max value in specified axis or -1 in the NA case\n\n Examples\n --------\n >>> import pandas.core.nanops as nanops\n >>> s = pd.Series([1, 2, 3, np.nan, 4])\n >>> nanops.nanargmax(s)\n 4\n \"\"\"\n values, mask, dtype, _, _ = _get_values(\n values, skipna, fill_value_typ='-inf', mask=mask)\n 
result = values.argmax(axis)\n result = _maybe_arg_null_out(result, axis, mask, skipna)\n return result"} {"instruction": "Make a summary of the following Python 3 code\ndef nanargmin(values, axis=None, skipna=True, mask=None):\n values, mask, dtype, _, _ = _get_values(\n values, skipna, fill_value_typ='+inf', mask=mask)\n result = values.argmin(axis)\n result = _maybe_arg_null_out(result, axis, mask, skipna)\n return result", "response": "Returns the index of the minimum value in the specified axis or - 1 if the input array is missing."} {"instruction": "How would you implement a function in Python 3 that\ncomputes the sample skewness of a set of values.", "response": "def nanskew(values, axis=None, skipna=True, mask=None):\n \"\"\" Compute the sample skewness.\n\n The statistic computed here is the adjusted Fisher-Pearson standardized\n moment coefficient G1. The algorithm computes this coefficient directly\n from the second and third central moment.\n\n Parameters\n ----------\n values : ndarray\n axis: int, optional\n skipna : bool, default True\n mask : ndarray[bool], optional\n nan-mask if known\n\n Returns\n -------\n result : float64\n Unless input is a float array, in which case use the same\n precision as the input array.\n\n Examples\n --------\n >>> import pandas.core.nanops as nanops\n >>> s = pd.Series([1,np.nan, 1, 2])\n >>> nanops.nanskew(s)\n 1.7320508075688787\n \"\"\"\n values = com.values_from_object(values)\n if mask is None:\n mask = isna(values)\n if not is_float_dtype(values.dtype):\n values = values.astype('f8')\n count = _get_counts(mask, axis)\n else:\n count = _get_counts(mask, axis, dtype=values.dtype)\n\n if skipna:\n values = values.copy()\n np.putmask(values, mask, 0)\n\n mean = values.sum(axis, dtype=np.float64) / count\n if axis is not None:\n mean = np.expand_dims(mean, axis)\n\n adjusted = values - mean\n if skipna:\n np.putmask(adjusted, mask, 0)\n adjusted2 = adjusted ** 2\n adjusted3 = adjusted2 * adjusted\n m2 = 
adjusted2.sum(axis, dtype=np.float64)\n m3 = adjusted3.sum(axis, dtype=np.float64)\n\n # floating point error\n #\n # #18044 in _libs/windows.pyx calc_skew follow this behavior\n # to fix the fperr to treat m2 <1e-14 as zero\n m2 = _zero_out_fperr(m2)\n m3 = _zero_out_fperr(m3)\n\n with np.errstate(invalid='ignore', divide='ignore'):\n result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2 ** 1.5)\n\n dtype = values.dtype\n if is_float_dtype(dtype):\n result = result.astype(dtype)\n\n if isinstance(result, np.ndarray):\n result = np.where(m2 == 0, 0, result)\n result[count < 3] = np.nan\n return result\n else:\n result = 0 if m2 == 0 else result\n if count < 3:\n return np.nan\n return result"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef nankurt(values, axis=None, skipna=True, mask=None):\n values = com.values_from_object(values)\n if mask is None:\n mask = isna(values)\n if not is_float_dtype(values.dtype):\n values = values.astype('f8')\n count = _get_counts(mask, axis)\n else:\n count = _get_counts(mask, axis, dtype=values.dtype)\n\n if skipna:\n values = values.copy()\n np.putmask(values, mask, 0)\n\n mean = values.sum(axis, dtype=np.float64) / count\n if axis is not None:\n mean = np.expand_dims(mean, axis)\n\n adjusted = values - mean\n if skipna:\n np.putmask(adjusted, mask, 0)\n adjusted2 = adjusted ** 2\n adjusted4 = adjusted2 ** 2\n m2 = adjusted2.sum(axis, dtype=np.float64)\n m4 = adjusted4.sum(axis, dtype=np.float64)\n\n with np.errstate(invalid='ignore', divide='ignore'):\n adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))\n numer = count * (count + 1) * (count - 1) * m4\n denom = (count - 2) * (count - 3) * m2 ** 2\n\n # floating point error\n #\n # #18044 in _libs/windows.pyx calc_kurt follow this behavior\n # to fix the fperr to treat denom <1e-14 as zero\n numer = _zero_out_fperr(numer)\n denom = _zero_out_fperr(denom)\n\n if not isinstance(denom, np.ndarray):\n # if ``denom`` is a scalar, check 
these corner cases first before\n # doing division\n if count < 4:\n return np.nan\n if denom == 0:\n return 0\n\n with np.errstate(invalid='ignore', divide='ignore'):\n result = numer / denom - adj\n\n dtype = values.dtype\n if is_float_dtype(dtype):\n result = result.astype(dtype)\n\n if isinstance(result, np.ndarray):\n result = np.where(denom == 0, 0, result)\n result[count < 4] = np.nan\n\n return result", "response": "Compute the sample excess kurtosis of a single object."} {"instruction": "Given the following Python 3 function, write the documentation\ndef nanprod(values, axis=None, skipna=True, min_count=0, mask=None):\n if mask is None:\n mask = isna(values)\n if skipna and not is_any_int_dtype(values):\n values = values.copy()\n values[mask] = 1\n result = values.prod(axis)\n return _maybe_null_out(result, axis, mask, min_count=min_count)", "response": "nanprod - like version of nanprod"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef nancorr(a, b, method='pearson', min_periods=None):\n if len(a) != len(b):\n raise AssertionError('Operands to nancorr must have same size')\n\n if min_periods is None:\n min_periods = 1\n\n valid = notna(a) & notna(b)\n if not valid.all():\n a = a[valid]\n b = b[valid]\n\n if len(a) < min_periods:\n return np.nan\n\n f = get_corr_func(method)\n return f(a, b)", "response": "Compute nan correlation between two arrays a and b."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef nanpercentile(values, q, axis, na_value, mask, ndim, interpolation):\n if not lib.is_scalar(mask) and mask.any():\n if ndim == 1:\n return _nanpercentile_1d(values, mask, q, na_value,\n interpolation=interpolation)\n else:\n # for nonconsolidatable blocks mask is 1D, but values 2D\n if mask.ndim < values.ndim:\n mask = mask.reshape(values.shape)\n if axis == 0:\n values = values.T\n mask = mask.T\n result = [_nanpercentile_1d(val, m, q, na_value,\n 
interpolation=interpolation)\n for (val, m) in zip(list(values), list(mask))]\n result = np.array(result, dtype=values.dtype, copy=False).T\n return result\n else:\n return np.percentile(values, q, axis=axis, interpolation=interpolation)", "response": "This function will take a numpy array of values and quantile values and return a new array of quantiles."} {"instruction": "Make a summary of the following Python 3 code\ndef write_th(self, s, header=False, indent=0, tags=None):\n if header and self.fmt.col_space is not None:\n tags = (tags or \"\")\n tags += ('style=\"min-width: {colspace};\"'\n .format(colspace=self.fmt.col_space))\n\n return self._write_cell(s, kind='th', indent=indent, tags=tags)", "response": "Method for writing a formatted th cell."} {"instruction": "Make a summary of the following Python 3 code\ndef read_clipboard(sep=r'\\s+', **kwargs): # pragma: no cover\n encoding = kwargs.pop('encoding', 'utf-8')\n\n # only utf-8 is valid for passed value because that's what clipboard\n # supports\n if encoding is not None and encoding.lower().replace('-', '') != 'utf8':\n raise NotImplementedError(\n 'reading from clipboard only supports utf-8 encoding')\n\n from pandas.io.clipboard import clipboard_get\n from pandas.io.parsers import read_csv\n text = clipboard_get()\n\n # Try to decode (if needed, as \"text\" might already be a string here).\n try:\n text = text.decode(kwargs.get('encoding')\n or get_option('display.encoding'))\n except AttributeError:\n pass\n\n # Excel copies into clipboard with \\t separation\n # inspect no more then the 10 first lines, if they\n # all contain an equal number (>0) of tabs, infer\n # that this came from excel and set 'sep' accordingly\n lines = text[:10000].split('\\n')[:-1][:10]\n\n # Need to remove leading white space, since read_csv\n # accepts:\n # a b\n # 0 1 2\n # 1 3 4\n\n counts = {x.lstrip().count('\\t') for x in lines}\n if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:\n sep = '\\t'\n\n # Edge 
case where sep is specified to be None, return to default\n if sep is None and kwargs.get('delim_whitespace') is None:\n sep = r'\\s+'\n\n # Regex separator currently only works with python engine.\n # Default to python if separator is multi-character (regex)\n if len(sep) > 1 and kwargs.get('engine') is None:\n kwargs['engine'] = 'python'\n elif len(sep) > 1 and kwargs.get('engine') == 'c':\n warnings.warn('read_clipboard with regex separator does not work'\n ' properly with c engine')\n\n return read_csv(StringIO(text), sep=sep, **kwargs)", "response": "r Reads text from clipboard and returns DataFrame containing the parsed data."} {"instruction": "Create a Python 3 function for\nwriting the object to the system clipboard.", "response": "def to_clipboard(obj, excel=True, sep=None, **kwargs): # pragma: no cover\n \"\"\"\n Attempt to write text representation of object to the system clipboard\n The clipboard can be then pasted into Excel for example.\n\n Parameters\n ----------\n obj : the object to write to the clipboard\n excel : boolean, defaults to True\n if True, use the provided separator, writing in a csv\n format for allowing easy pasting into excel.\n if False, write a string representation of the object\n to the clipboard\n sep : optional, defaults to tab\n other keywords are passed to to_csv\n\n Notes\n -----\n Requirements for your platform\n - Linux: xclip, or xsel (with gtk or PyQt4 modules)\n - Windows:\n - OS X:\n \"\"\"\n encoding = kwargs.pop('encoding', 'utf-8')\n\n # testing if an invalid encoding is passed to clipboard\n if encoding is not None and encoding.lower().replace('-', '') != 'utf8':\n raise ValueError('clipboard only supports utf-8 encoding')\n\n from pandas.io.clipboard import clipboard_set\n if excel is None:\n excel = True\n\n if excel:\n try:\n if sep is None:\n sep = '\\t'\n buf = StringIO()\n\n # clipboard_set (pyperclip) expects unicode\n obj.to_csv(buf, sep=sep, encoding='utf-8', **kwargs)\n text = buf.getvalue()\n\n 
clipboard_set(text)\n return\n except TypeError:\n warnings.warn('to_clipboard in excel mode requires a single '\n 'character separator.')\n elif sep is not None:\n warnings.warn('to_clipboard with excel=False ignores the sep argument')\n\n if isinstance(obj, ABCDataFrame):\n # str(df) has various unhelpful defaults, like truncation\n with option_context('display.max_colwidth', 999999):\n objstr = obj.to_string(**kwargs)\n else:\n objstr = str(obj)\n clipboard_set(objstr)"} {"instruction": "Can you implement a function in Python 3 that\nreturns an iterator given an integer slice or container.", "response": "def _get_skiprows(skiprows):\n \"\"\"Get an iterator given an integer, slice or container.\n\n Parameters\n ----------\n skiprows : int, slice, container\n The iterator to use to skip rows; can also be a slice.\n\n Raises\n ------\n TypeError\n * If `skiprows` is not a slice, integer, or Container\n\n Returns\n -------\n it : iterable\n A proper iterator to use to skip rows of a DataFrame.\n \"\"\"\n if isinstance(skiprows, slice):\n return lrange(skiprows.start or 0, skiprows.stop, skiprows.step or 1)\n elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows):\n return skiprows\n elif skiprows is None:\n return 0\n raise TypeError('%r is not a valid type for skipping rows' %\n type(skiprows).__name__)"} {"instruction": "Implement a Python 3 function for\ntrying to read from a file or url.", "response": "def _read(obj):\n \"\"\"Try to read from a url, file or string.\n\n Parameters\n ----------\n obj : str, unicode, or file-like\n\n Returns\n -------\n raw_text : str\n \"\"\"\n if _is_url(obj):\n with urlopen(obj) as url:\n text = url.read()\n elif hasattr(obj, 'read'):\n text = obj.read()\n elif isinstance(obj, (str, bytes)):\n text = obj\n try:\n if os.path.isfile(text):\n with open(text, 'rb') as f:\n return f.read()\n except (TypeError, ValueError):\n pass\n else:\n raise TypeError(\"Cannot read object of type %r\" % type(obj).__name__)\n return 
text"} {"instruction": "Make a summary of the following Python 3 code\ndef _build_xpath_expr(attrs):\n # give class attribute as class_ because class is a python keyword\n if 'class_' in attrs:\n attrs['class'] = attrs.pop('class_')\n\n s = [\"@{key}={val!r}\".format(key=k, val=v) for k, v in attrs.items()]\n return '[{expr}]'.format(expr=' and '.join(s))", "response": "Build an xpath expression to simulate bs4 s ability to pass in kwargs to\n search for attributes when using the lxml parser."} {"instruction": "How would you explain what the following Python 3 function does\ndef _parser_dispatch(flavor):\n valid_parsers = list(_valid_parsers.keys())\n if flavor not in valid_parsers:\n raise ValueError('{invalid!r} is not a valid flavor, valid flavors '\n 'are {valid}'\n .format(invalid=flavor, valid=valid_parsers))\n\n if flavor in ('bs4', 'html5lib'):\n if not _HAS_HTML5LIB:\n raise ImportError(\"html5lib not found, please install it\")\n if not _HAS_BS4:\n raise ImportError(\n \"BeautifulSoup4 (bs4) not found, please install it\")\n import bs4\n if LooseVersion(bs4.__version__) <= LooseVersion('4.2.0'):\n raise ValueError(\"A minimum version of BeautifulSoup 4.2.1 \"\n \"is required\")\n\n else:\n if not _HAS_LXML:\n raise ImportError(\"lxml not found, please install it\")\n return _valid_parsers[flavor]", "response": "Dispatches the input flavor to the appropriate parser class."} {"instruction": "Given the following Python 3 function, write the documentation\ndef read_html(io, match='.+', flavor=None, header=None, index_col=None,\n skiprows=None, attrs=None, parse_dates=False,\n tupleize_cols=None, thousands=',', encoding=None,\n decimal='.', converters=None, na_values=None,\n keep_default_na=True, displayed_only=True):\n r\"\"\"Read HTML tables into a ``list`` of ``DataFrame`` objects.\n\n Parameters\n ----------\n io : str or file-like\n A URL, a file-like object, or a raw string containing HTML. 
Note that\n lxml only accepts the http, ftp and file url protocols. If you have a\n URL that starts with ``'https'`` you might try removing the ``'s'``.\n\n match : str or compiled regular expression, optional\n The set of tables containing text matching this regex or string will be\n returned. Unless the HTML is extremely simple you will probably need to\n pass a non-empty string here. Defaults to '.+' (match any non-empty\n string). The default value will return all tables contained on a page.\n This value is converted to a regular expression so that there is\n consistent behavior between Beautiful Soup and lxml.\n\n flavor : str or None, container of strings\n The parsing engine to use. 'bs4' and 'html5lib' are synonymous with\n each other, they are both there for backwards compatibility. The\n default of ``None`` tries to use ``lxml`` to parse and if that fails it\n falls back on ``bs4`` + ``html5lib``.\n\n header : int or list-like or None, optional\n The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to\n make the columns headers.\n\n index_col : int or list-like or None, optional\n The column (or list of columns) to use to create the index.\n\n skiprows : int or list-like or slice or None, optional\n 0-based. Number of rows to skip after parsing the column integer. If a\n sequence of integers or a slice is given, will skip the rows indexed by\n that sequence. Note that a single element sequence means 'skip the nth\n row' whereas an integer means 'skip n rows'.\n\n attrs : dict or None, optional\n This is a dictionary of attributes that you can pass to use to identify\n the table in the HTML. These are not checked for validity before being\n passed to lxml or Beautiful Soup. However, these attributes must be\n valid HTML table attributes to work correctly. 
For example, ::\n\n attrs = {'id': 'table'}\n\n is a valid attribute dictionary because the 'id' HTML tag attribute is\n a valid HTML attribute for *any* HTML tag as per `this document\n `__. ::\n\n attrs = {'asdf': 'table'}\n\n is *not* a valid attribute dictionary because 'asdf' is not a valid\n HTML attribute even if it is a valid XML attribute. Valid HTML 4.01\n table attributes can be found `here\n `__. A\n working draft of the HTML 5 spec can be found `here\n `__. It contains the\n latest information on table attributes for the modern web.\n\n parse_dates : bool, optional\n See :func:`~read_csv` for more details.\n\n tupleize_cols : bool, optional\n If ``False`` try to parse multiple header rows into a\n :class:`~pandas.MultiIndex`, otherwise return raw tuples. Defaults to\n ``False``.\n\n .. deprecated:: 0.21.0\n This argument will be removed and will always convert to MultiIndex\n\n thousands : str, optional\n Separator to use to parse thousands. Defaults to ``','``.\n\n encoding : str or None, optional\n The encoding used to decode the web page. Defaults to ``None``.``None``\n preserves the previous encoding behavior, which depends on the\n underlying parser library (e.g., the parser library will try to use\n the encoding provided by the document).\n\n decimal : str, default '.'\n Character to recognize as decimal point (e.g. use ',' for European\n data).\n\n .. versionadded:: 0.19.0\n\n converters : dict, default None\n Dict of functions for converting values in certain columns. Keys can\n either be integers or column labels, values are functions that take one\n input argument, the cell (not column) content, and return the\n transformed content.\n\n .. versionadded:: 0.19.0\n\n na_values : iterable, default None\n Custom NA values\n\n .. versionadded:: 0.19.0\n\n keep_default_na : bool, default True\n If na_values are specified and keep_default_na is False the default NaN\n values are overridden, otherwise they're appended to\n\n .. 
versionadded:: 0.19.0\n\n displayed_only : bool, default True\n Whether elements with \"display: none\" should be parsed\n\n .. versionadded:: 0.23.0\n\n Returns\n -------\n dfs : list of DataFrames\n\n See Also\n --------\n read_csv\n\n Notes\n -----\n Before using this function you should read the :ref:`gotchas about the\n HTML parsing libraries `.\n\n Expect to do some cleanup after you call this function. For example, you\n might need to manually assign column names if the column names are\n converted to NaN when you pass the `header=0` argument. We try to assume as\n little as possible about the structure of the table and push the\n idiosyncrasies of the HTML contained in the table to the user.\n\n This function searches for ``
`` elements and only for ````\n and ```` or ```` argument, it is used to construct\n the header, otherwise the function attempts to find the header within\n the body (by putting rows with only ``, , elements to identify\n header, body, and footer, otherwise:\n - Put all rows into body\n - Move rows from top of body to header only if\n all elements inside row are . Move the top all- or \n while body_rows and row_is_all_th(body_rows[0]):\n header_rows.append(body_rows.pop(0))\n\n header = self._expand_colspan_rowspan(header_rows)\n body = self._expand_colspan_rowspan(body_rows)\n footer = self._expand_colspan_rowspan(footer_rows)\n\n return header, body, footer"} {"instruction": "Write a Python 3 script to\ngive a list of s return a list of text rows with rowspan and colspan.", "response": "def _expand_colspan_rowspan(self, rows):\n \"\"\"\n Given a list of s, return a list of text rows.\n\n Parameters\n ----------\n rows : list of node-like\n List of s\n\n Returns\n -------\n list of list\n Each returned row is a list of str text.\n\n Notes\n -----\n Any cell with ``rowspan`` or ``colspan`` will have its contents copied\n to subsequent cells.\n \"\"\"\n\n all_texts = [] # list of rows, each a list of str\n remainder = [] # list of (index, text, nrows)\n\n for tr in rows:\n texts = [] # the output for this row\n next_remainder = []\n\n index = 0\n tds = self._parse_td(tr)\n for td in tds:\n # Append texts from previous rows with rowspan>1 that come\n # before this
`` rows and ```` elements within each ``
``\n element in the table. ```` stands for \"table data\". This function\n attempts to properly handle ``colspan`` and ``rowspan`` attributes.\n If the function has a ``
`` elements into the header).\n\n .. versionadded:: 0.21.0\n\n Similar to :func:`~read_csv` the `header` argument is applied\n **after** `skiprows` is applied.\n\n This function will *always* return a list of :class:`DataFrame` *or*\n it will fail, e.g., it will *not* return an empty list.\n\n Examples\n --------\n See the :ref:`read_html documentation in the IO section of the docs\n ` for some examples of reading in HTML tables.\n \"\"\"\n _importers()\n\n # Type check here. We don't want to parse only to fail because of an\n # invalid value of an integer skiprows.\n if isinstance(skiprows, numbers.Integral) and skiprows < 0:\n raise ValueError('cannot skip rows starting from the end of the '\n 'data (you passed a negative value)')\n _validate_header_arg(header)\n return _parse(flavor=flavor, io=io, match=match, header=header,\n index_col=index_col, skiprows=skiprows,\n parse_dates=parse_dates, tupleize_cols=tupleize_cols,\n thousands=thousands, attrs=attrs, encoding=encoding,\n decimal=decimal, converters=converters, na_values=na_values,\n keep_default_na=keep_default_na,\n displayed_only=displayed_only)", "response": "r Reads the HTML tables into a list of DataFrame objects."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef parse_tables(self):\n tables = self._parse_tables(self._build_doc(), self.match, self.attrs)\n return (self._parse_thead_tbody_tfoot(table) for table in tables)", "response": "Parse and return all tables from the DOM."} {"instruction": "Write a Python 3 function for\ngiving a table return parsed header body and foot.", "response": "def _parse_thead_tbody_tfoot(self, table_html):\n \"\"\"\n Given a table, return parsed header, body, and foot.\n\n Parameters\n ----------\n table_html : node-like\n\n Returns\n -------\n tuple of (header, body, footer), each a list of list-of-text rows.\n\n Notes\n -----\n Header and body are lists-of-lists. Top level list is a list of\n rows. 
Each row is a list of str text.\n\n Logic: Use
\n - Move rows from bottom of body to footer only if\n all elements inside row are \n \"\"\"\n\n header_rows = self._parse_thead_tr(table_html)\n body_rows = self._parse_tbody_tr(table_html)\n footer_rows = self._parse_tfoot_tr(table_html)\n\n def row_is_all_th(row):\n return all(self._equals_tag(t, 'th') for t in\n self._parse_td(row))\n\n if not header_rows:\n # The table has no
rows from\n # body_rows to header_rows. (This is a common case because many\n # tables in the wild have no
\n while remainder and remainder[0][0] <= index:\n prev_i, prev_text, prev_rowspan = remainder.pop(0)\n texts.append(prev_text)\n if prev_rowspan > 1:\n next_remainder.append((prev_i, prev_text,\n prev_rowspan - 1))\n index += 1\n\n # Append the text from this , colspan times\n text = _remove_whitespace(self._text_getter(td))\n rowspan = int(self._attr_getter(td, 'rowspan') or 1)\n colspan = int(self._attr_getter(td, 'colspan') or 1)\n\n for _ in range(colspan):\n texts.append(text)\n if rowspan > 1:\n next_remainder.append((index, text, rowspan - 1))\n index += 1\n\n # Append texts from previous rows at the final position\n for prev_i, prev_text, prev_rowspan in remainder:\n texts.append(prev_text)\n if prev_rowspan > 1:\n next_remainder.append((prev_i, prev_text,\n prev_rowspan - 1))\n\n all_texts.append(texts)\n remainder = next_remainder\n\n # Append rows that only appear because the previous row had non-1\n # rowspan\n while remainder:\n next_remainder = []\n texts = []\n for prev_i, prev_text, prev_rowspan in remainder:\n texts.append(prev_text)\n if prev_rowspan > 1:\n next_remainder.append((prev_i, prev_text,\n prev_rowspan - 1))\n all_texts.append(texts)\n remainder = next_remainder\n\n return all_texts"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _handle_hidden_tables(self, tbl_list, attr_name):\n if not self.displayed_only:\n return tbl_list\n\n return [x for x in tbl_list if \"display:none\" not in\n getattr(x, attr_name).get('style', '').replace(\" \", \"\")]", "response": "Returns list of tables potentially removing hidden elements"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _build_doc(self):\n from lxml.html import parse, fromstring, HTMLParser\n from lxml.etree import XMLSyntaxError\n parser = HTMLParser(recover=True, encoding=self.encoding)\n\n try:\n if _is_url(self.io):\n with urlopen(self.io) as f:\n r = parse(f, parser=parser)\n else:\n # try to parse the 
input in the simplest way\n r = parse(self.io, parser=parser)\n try:\n r = r.getroot()\n except AttributeError:\n pass\n except (UnicodeDecodeError, IOError) as e:\n # if the input is a blob of html goop\n if not _is_url(self.io):\n r = fromstring(self.io, parser=parser)\n\n try:\n r = r.getroot()\n except AttributeError:\n pass\n else:\n raise e\n else:\n if not hasattr(r, 'text_content'):\n raise XMLSyntaxError(\"no text parsed from document\", 0, 0, 0)\n return r", "response": "Parses the input and returns the doc."} {"instruction": "Write a Python 3 script to\nreturn a set of kinds that exist in this list of arrays", "response": "def get_dtype_kinds(l):\n \"\"\"\n Parameters\n ----------\n l : list of arrays\n\n Returns\n -------\n a set of kinds that exist in this list of arrays\n \"\"\"\n\n typs = set()\n for arr in l:\n\n dtype = arr.dtype\n if is_categorical_dtype(dtype):\n typ = 'category'\n elif is_sparse(arr):\n typ = 'sparse'\n elif isinstance(arr, ABCRangeIndex):\n typ = 'range'\n elif is_datetime64tz_dtype(arr):\n # if to_concat contains different tz,\n # the result must be object dtype\n typ = str(arr.dtype)\n elif is_datetime64_dtype(dtype):\n typ = 'datetime'\n elif is_timedelta64_dtype(dtype):\n typ = 'timedelta'\n elif is_object_dtype(dtype):\n typ = 'object'\n elif is_bool_dtype(dtype):\n typ = 'bool'\n elif is_extension_array_dtype(dtype):\n typ = str(arr.dtype)\n else:\n typ = dtype.kind\n typs.add(typ)\n return typs"} {"instruction": "Implement a function in Python 3 to\nreturn appropriate class of Series concat input is either dict or array - like", "response": "def _get_series_result_type(result, objs=None):\n \"\"\"\n return appropriate class of Series concat\n input is either dict or array-like\n \"\"\"\n from pandas import SparseSeries, SparseDataFrame, DataFrame\n\n # concat Series with axis 1\n if isinstance(result, dict):\n # concat Series with axis 1\n if all(isinstance(c, (SparseSeries, SparseDataFrame))\n for c in 
result.values()):\n return SparseDataFrame\n else:\n return DataFrame\n\n # otherwise it is a SingleBlockManager (axis = 0)\n if result._block.is_sparse:\n return SparseSeries\n else:\n return objs[0]._constructor"} {"instruction": "Can you write a function in Python 3 where it\nreturns appropriate class of DataFrame - like concat if all blocks are sparse return SparseDataFrame otherwise return 1st obj", "response": "def _get_frame_result_type(result, objs):\n \"\"\"\n return appropriate class of DataFrame-like concat\n if all blocks are sparse, return SparseDataFrame\n otherwise, return 1st obj\n \"\"\"\n\n if (result.blocks and (\n any(isinstance(obj, ABCSparseDataFrame) for obj in objs))):\n from pandas.core.sparse.api import SparseDataFrame\n return SparseDataFrame\n else:\n return next(obj for obj in objs if not isinstance(obj,\n ABCSparseDataFrame))"} {"instruction": "How would you code a function in Python 3 to\nprovide concatenation of an array of arrays each of which is a single 'normalized' dtypes (in that for example, if it's object, then it is a non-datetimelike and provide a combined dtype for the resulting array that preserves the overall dtype if possible) Parameters ---------- to_concat : array of arrays axis : axis to provide concatenation Returns ------- a single array, preserving the combined dtypes", "response": "def _concat_compat(to_concat, axis=0):\n \"\"\"\n provide concatenation of an array of arrays each of which is a single\n 'normalized' dtypes (in that for example, if it's object, then it is a\n non-datetimelike and provide a combined dtype for the resulting array that\n preserves the overall dtype if possible)\n\n Parameters\n ----------\n to_concat : array of arrays\n axis : axis to provide concatenation\n\n Returns\n -------\n a single array, preserving the combined dtypes\n \"\"\"\n\n # filter empty arrays\n # 1-d dtypes always are included here\n def is_nonempty(x):\n try:\n return x.shape[axis] > 0\n except Exception:\n return 
True\n\n # If all arrays are empty, there's nothing to convert, just short-cut to\n # the concatenation, #3121.\n #\n # Creating an empty array directly is tempting, but the winnings would be\n # marginal given that it would still require shape & dtype calculation and\n # np.concatenate which has them both implemented is compiled.\n\n typs = get_dtype_kinds(to_concat)\n _contains_datetime = any(typ.startswith('datetime') for typ in typs)\n _contains_period = any(typ.startswith('period') for typ in typs)\n\n if 'category' in typs:\n # this must be priort to _concat_datetime,\n # to support Categorical + datetime-like\n return _concat_categorical(to_concat, axis=axis)\n\n elif _contains_datetime or 'timedelta' in typs or _contains_period:\n return _concat_datetime(to_concat, axis=axis, typs=typs)\n\n # these are mandated to handle empties as well\n elif 'sparse' in typs:\n return _concat_sparse(to_concat, axis=axis, typs=typs)\n\n all_empty = all(not is_nonempty(x) for x in to_concat)\n if any(is_extension_array_dtype(x) for x in to_concat) and axis == 1:\n to_concat = [np.atleast_2d(x.astype('object')) for x in to_concat]\n\n if all_empty:\n # we have all empties, but may need to coerce the result dtype to\n # object if we have non-numeric type operands (numpy would otherwise\n # cast this to float)\n typs = get_dtype_kinds(to_concat)\n if len(typs) != 1:\n\n if (not len(typs - {'i', 'u', 'f'}) or\n not len(typs - {'bool', 'i', 'u'})):\n # let numpy coerce\n pass\n else:\n # coerce to object\n to_concat = [x.astype('object') for x in to_concat]\n\n return np.concatenate(to_concat, axis=axis)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _concat_categorical(to_concat, axis=0):\n\n # we could have object blocks and categoricals here\n # if we only have a single categoricals then combine everything\n # else its a non-compat categorical\n categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]\n\n # validate the 
categories\n if len(categoricals) != len(to_concat):\n pass\n else:\n # when all categories are identical\n first = to_concat[0]\n if all(first.is_dtype_equal(other) for other in to_concat[1:]):\n return union_categoricals(categoricals)\n\n # extract the categoricals & coerce to object if needed\n to_concat = [x.get_values() if is_categorical_dtype(x.dtype)\n else np.asarray(x).ravel() if not is_datetime64tz_dtype(x)\n else np.asarray(x.astype(object)) for x in to_concat]\n result = _concat_compat(to_concat)\n if axis == 1:\n result = result.reshape(1, len(result))\n return result", "response": "Concatenate an object or categorical array of arrays each of which is a\n single dtype and preserving the combined dtypes"} {"instruction": "Write a Python 3 function that can\ncombine a list - like of Categorical - like and unioning categories.", "response": "def union_categoricals(to_union, sort_categories=False, ignore_order=False):\n \"\"\"\n Combine list-like of Categorical-like, unioning categories. All\n categories must have the same dtype.\n\n .. versionadded:: 0.19.0\n\n Parameters\n ----------\n to_union : list-like of Categorical, CategoricalIndex,\n or Series with dtype='category'\n sort_categories : boolean, default False\n If true, resulting categories will be lexsorted, otherwise\n they will be ordered as they appear in the data.\n ignore_order : boolean, default False\n If true, the ordered attribute of the Categoricals will be ignored.\n Results in an unordered categorical.\n\n .. 
versionadded:: 0.20.0\n\n Returns\n -------\n result : Categorical\n\n Raises\n ------\n TypeError\n - all inputs do not have the same dtype\n - all inputs do not have the same ordered property\n - all inputs are ordered and their categories are not identical\n - sort_categories=True and Categoricals are ordered\n ValueError\n Empty list of categoricals passed\n\n Notes\n -----\n\n To learn more about categories, see `link\n `__\n\n Examples\n --------\n\n >>> from pandas.api.types import union_categoricals\n\n If you want to combine categoricals that do not necessarily have\n the same categories, `union_categoricals` will combine a list-like\n of categoricals. The new categories will be the union of the\n categories being combined.\n\n >>> a = pd.Categorical([\"b\", \"c\"])\n >>> b = pd.Categorical([\"a\", \"b\"])\n >>> union_categoricals([a, b])\n [b, c, a, b]\n Categories (3, object): [b, c, a]\n\n By default, the resulting categories will be ordered as they appear\n in the `categories` of the data. If you want the categories to be\n lexsorted, use `sort_categories=True` argument.\n\n >>> union_categoricals([a, b], sort_categories=True)\n [b, c, a, b]\n Categories (3, object): [a, b, c]\n\n `union_categoricals` also works with the case of combining two\n categoricals of the same categories and order information (e.g. 
what\n you could also `append` for).\n\n >>> a = pd.Categorical([\"a\", \"b\"], ordered=True)\n >>> b = pd.Categorical([\"a\", \"b\", \"a\"], ordered=True)\n >>> union_categoricals([a, b])\n [a, b, a, b, a]\n Categories (2, object): [a < b]\n\n Raises `TypeError` because the categories are ordered and not identical.\n\n >>> a = pd.Categorical([\"a\", \"b\"], ordered=True)\n >>> b = pd.Categorical([\"a\", \"b\", \"c\"], ordered=True)\n >>> union_categoricals([a, b])\n TypeError: to union ordered Categoricals, all categories must be the same\n\n New in version 0.20.0\n\n Ordered categoricals with different categories or orderings can be\n combined by using the `ignore_ordered=True` argument.\n\n >>> a = pd.Categorical([\"a\", \"b\", \"c\"], ordered=True)\n >>> b = pd.Categorical([\"c\", \"b\", \"a\"], ordered=True)\n >>> union_categoricals([a, b], ignore_order=True)\n [a, b, c, c, b, a]\n Categories (3, object): [a, b, c]\n\n `union_categoricals` also works with a `CategoricalIndex`, or `Series`\n containing categorical data, but note that the resulting array will\n always be a plain `Categorical`\n\n >>> a = pd.Series([\"b\", \"c\"], dtype='category')\n >>> b = pd.Series([\"a\", \"b\"], dtype='category')\n >>> union_categoricals([a, b])\n [b, c, a, b]\n Categories (3, object): [b, c, a]\n \"\"\"\n from pandas import Index, Categorical, CategoricalIndex, Series\n from pandas.core.arrays.categorical import _recode_for_categories\n\n if len(to_union) == 0:\n raise ValueError('No Categoricals to union')\n\n def _maybe_unwrap(x):\n if isinstance(x, (CategoricalIndex, Series)):\n return x.values\n elif isinstance(x, Categorical):\n return x\n else:\n raise TypeError(\"all components to combine must be Categorical\")\n\n to_union = [_maybe_unwrap(x) for x in to_union]\n first = to_union[0]\n\n if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype)\n for other in to_union[1:]):\n raise TypeError(\"dtype of categories must be the same\")\n\n ordered = 
False\n if all(first.is_dtype_equal(other) for other in to_union[1:]):\n # identical categories - fastpath\n categories = first.categories\n ordered = first.ordered\n\n if all(first.categories.equals(other.categories)\n for other in to_union[1:]):\n new_codes = np.concatenate([c.codes for c in to_union])\n else:\n codes = [first.codes] + [_recode_for_categories(other.codes,\n other.categories,\n first.categories)\n for other in to_union[1:]]\n new_codes = np.concatenate(codes)\n\n if sort_categories and not ignore_order and ordered:\n raise TypeError(\"Cannot use sort_categories=True with \"\n \"ordered Categoricals\")\n\n if sort_categories and not categories.is_monotonic_increasing:\n categories = categories.sort_values()\n indexer = categories.get_indexer(first.categories)\n\n from pandas.core.algorithms import take_1d\n new_codes = take_1d(indexer, new_codes, fill_value=-1)\n elif ignore_order or all(not c.ordered for c in to_union):\n # different categories - union and recode\n cats = first.categories.append([c.categories for c in to_union[1:]])\n categories = Index(cats.unique())\n if sort_categories:\n categories = categories.sort_values()\n\n new_codes = [_recode_for_categories(c.codes, c.categories, categories)\n for c in to_union]\n new_codes = np.concatenate(new_codes)\n else:\n # ordered - to show a proper error message\n if all(c.ordered for c in to_union):\n msg = (\"to union ordered Categoricals, \"\n \"all categories must be the same\")\n raise TypeError(msg)\n else:\n raise TypeError('Categorical.ordered must be the same')\n\n if ignore_order:\n ordered = False\n\n return Categorical(new_codes, categories=categories, ordered=ordered,\n fastpath=True)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _concat_datetime(to_concat, axis=0, typs=None):\n\n if typs is None:\n typs = get_dtype_kinds(to_concat)\n\n # multiple types, need to coerce to object\n if len(typs) != 1:\n return 
_concatenate_2d([_convert_datetimelike_to_object(x)\n for x in to_concat],\n axis=axis)\n\n # must be single dtype\n if any(typ.startswith('datetime') for typ in typs):\n\n if 'datetime' in typs:\n to_concat = [x.astype(np.int64, copy=False) for x in to_concat]\n return _concatenate_2d(to_concat, axis=axis).view(_NS_DTYPE)\n else:\n # when to_concat has different tz, len(typs) > 1.\n # thus no need to care\n return _concat_datetimetz(to_concat)\n\n elif 'timedelta' in typs:\n return _concatenate_2d([x.view(np.int64) for x in to_concat],\n axis=axis).view(_TD_DTYPE)\n\n elif any(typ.startswith('period') for typ in typs):\n assert len(typs) == 1\n cls = to_concat[0]\n new_values = cls._concat_same_type(to_concat)\n return new_values", "response": "concatenate a datetimelike array of arrays each of which is a\n single M8 [ ns tz or m8 [ tz ] or m8 [ tz ] or m8 [ ns tz ] or m8 [ tz ] or m8 [ tz ]"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _concat_datetimetz(to_concat, name=None):\n # Right now, internals will pass a List[DatetimeArray] here\n # for reductions like quantile. 
I would like to disentangle\n # all this before we get here.\n sample = to_concat[0]\n\n if isinstance(sample, ABCIndexClass):\n return sample._concat_same_dtype(to_concat, name=name)\n elif isinstance(sample, ABCDatetimeArray):\n return sample._concat_same_type(to_concat)", "response": "concat DatetimeIndex with the same tz\n "} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _concat_sparse(to_concat, axis=0, typs=None):\n\n from pandas.core.arrays import SparseArray\n\n fill_values = [x.fill_value for x in to_concat\n if isinstance(x, SparseArray)]\n fill_value = fill_values[0]\n\n # TODO: Fix join unit generation so we aren't passed this.\n to_concat = [x if isinstance(x, SparseArray)\n else SparseArray(x.squeeze(), fill_value=fill_value)\n for x in to_concat]\n\n return SparseArray._concat_same_type(to_concat)", "response": "concat_sparse returns a sparse array of arrays each of which is a\n single dtype"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _concat_rangeindex_same_dtype(indexes):\n from pandas import Int64Index, RangeIndex\n\n start = step = next = None\n\n # Filter the empty indexes\n non_empty_indexes = [obj for obj in indexes if len(obj)]\n\n for obj in non_empty_indexes:\n\n if start is None:\n # This is set by the first non-empty index\n start = obj._start\n if step is None and len(obj) > 1:\n step = obj._step\n elif step is None:\n # First non-empty index had only one element\n if obj._start == start:\n return _concat_index_same_dtype(indexes, klass=Int64Index)\n step = obj._start - start\n\n non_consecutive = ((step != obj._step and len(obj) > 1) or\n (next is not None and obj._start != next))\n if non_consecutive:\n return _concat_index_same_dtype(indexes, klass=Int64Index)\n\n if step is not None:\n next = obj[-1] + step\n\n if non_empty_indexes:\n # Get the stop value from \"next\" or alternatively\n # from the last non-empty index\n stop = 
non_empty_indexes[-1]._stop if next is None else next\n return RangeIndex(start, stop, step)\n\n # Here all \"indexes\" had 0 length, i.e. were empty.\n # In this case return an empty range index.\n return RangeIndex(0, 0)", "response": "Concatenates multiple RangeIndex instances into a single Int64Index."} {"instruction": "Given the following Python 3 function, write the documentation\ndef rewrite_exception(old_name, new_name):\n try:\n yield\n except Exception as e:\n msg = e.args[0]\n msg = msg.replace(old_name, new_name)\n args = (msg,)\n if len(e.args) > 1:\n args = args + e.args[1:]\n e.args = args\n raise", "response": "Rewrite the message of an exception."} {"instruction": "Explain what the following Python 3 code does\ndef _get_level_lengths(index, hidden_elements=None):\n sentinel = object()\n levels = index.format(sparsify=sentinel, adjoin=False, names=False)\n\n if hidden_elements is None:\n hidden_elements = []\n\n lengths = {}\n if index.nlevels == 1:\n for i, value in enumerate(levels):\n if(i not in hidden_elements):\n lengths[(0, i)] = 1\n return lengths\n\n for i, lvl in enumerate(levels):\n for j, row in enumerate(lvl):\n if not get_option('display.multi_sparse'):\n lengths[(i, j)] = 1\n elif (row != sentinel) and (j not in hidden_elements):\n last_label = j\n lengths[(i, last_label)] = 1\n elif (row != sentinel):\n # even if its hidden, keep track of it in case\n # length >1 and later elements are visible\n last_label = j\n lengths[(i, last_label)] = 0\n elif(j not in hidden_elements):\n lengths[(i, last_label)] += 1\n\n non_zero_lengths = {\n element: length for element, length in lengths.items() if length >= 1}\n\n return non_zero_lengths", "response": "Given an index find the level lengths for each element."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _translate(self):\n table_styles = self.table_styles or []\n caption = self.caption\n ctx = self.ctx\n precision = self.precision\n hidden_index = 
self.hidden_index\n hidden_columns = self.hidden_columns\n uuid = self.uuid or str(uuid1()).replace(\"-\", \"_\")\n ROW_HEADING_CLASS = \"row_heading\"\n COL_HEADING_CLASS = \"col_heading\"\n INDEX_NAME_CLASS = \"index_name\"\n\n DATA_CLASS = \"data\"\n BLANK_CLASS = \"blank\"\n BLANK_VALUE = \"\"\n\n def format_attr(pair):\n return \"{key}={value}\".format(**pair)\n\n # for sparsifying a MultiIndex\n idx_lengths = _get_level_lengths(self.index)\n col_lengths = _get_level_lengths(self.columns, hidden_columns)\n\n cell_context = dict()\n\n n_rlvls = self.data.index.nlevels\n n_clvls = self.data.columns.nlevels\n rlabels = self.data.index.tolist()\n clabels = self.data.columns.tolist()\n\n if n_rlvls == 1:\n rlabels = [[x] for x in rlabels]\n if n_clvls == 1:\n clabels = [[x] for x in clabels]\n clabels = list(zip(*clabels))\n\n cellstyle = []\n head = []\n\n for r in range(n_clvls):\n # Blank for Index columns...\n row_es = [{\"type\": \"th\",\n \"value\": BLANK_VALUE,\n \"display_value\": BLANK_VALUE,\n \"is_visible\": not hidden_index,\n \"class\": \" \".join([BLANK_CLASS])}] * (n_rlvls - 1)\n\n # ... 
except maybe the last for columns.names\n name = self.data.columns.names[r]\n cs = [BLANK_CLASS if name is None else INDEX_NAME_CLASS,\n \"level{lvl}\".format(lvl=r)]\n name = BLANK_VALUE if name is None else name\n row_es.append({\"type\": \"th\",\n \"value\": name,\n \"display_value\": name,\n \"class\": \" \".join(cs),\n \"is_visible\": not hidden_index})\n\n if clabels:\n for c, value in enumerate(clabels[r]):\n cs = [COL_HEADING_CLASS, \"level{lvl}\".format(lvl=r),\n \"col{col}\".format(col=c)]\n cs.extend(cell_context.get(\n \"col_headings\", {}).get(r, {}).get(c, []))\n es = {\n \"type\": \"th\",\n \"value\": value,\n \"display_value\": value,\n \"class\": \" \".join(cs),\n \"is_visible\": _is_visible(c, r, col_lengths),\n }\n colspan = col_lengths.get((r, c), 0)\n if colspan > 1:\n es[\"attributes\"] = [\n format_attr({\"key\": \"colspan\", \"value\": colspan})\n ]\n row_es.append(es)\n head.append(row_es)\n\n if (self.data.index.names and\n com._any_not_none(*self.data.index.names) and\n not hidden_index):\n index_header_row = []\n\n for c, name in enumerate(self.data.index.names):\n cs = [INDEX_NAME_CLASS,\n \"level{lvl}\".format(lvl=c)]\n name = '' if name is None else name\n index_header_row.append({\"type\": \"th\", \"value\": name,\n \"class\": \" \".join(cs)})\n\n index_header_row.extend(\n [{\"type\": \"th\",\n \"value\": BLANK_VALUE,\n \"class\": \" \".join([BLANK_CLASS])\n }] * (len(clabels[0]) - len(hidden_columns)))\n\n head.append(index_header_row)\n\n body = []\n for r, idx in enumerate(self.data.index):\n row_es = []\n for c, value in enumerate(rlabels[r]):\n rid = [ROW_HEADING_CLASS, \"level{lvl}\".format(lvl=c),\n \"row{row}\".format(row=r)]\n es = {\n \"type\": \"th\",\n \"is_visible\": (_is_visible(r, c, idx_lengths) and\n not hidden_index),\n \"value\": value,\n \"display_value\": value,\n \"id\": \"_\".join(rid[1:]),\n \"class\": \" \".join(rid)\n }\n rowspan = idx_lengths.get((c, r), 0)\n if rowspan > 1:\n es[\"attributes\"] = [\n 
format_attr({\"key\": \"rowspan\", \"value\": rowspan})\n ]\n row_es.append(es)\n\n for c, col in enumerate(self.data.columns):\n cs = [DATA_CLASS, \"row{row}\".format(row=r),\n \"col{col}\".format(col=c)]\n cs.extend(cell_context.get(\"data\", {}).get(r, {}).get(c, []))\n formatter = self._display_funcs[(r, c)]\n value = self.data.iloc[r, c]\n row_dict = {\"type\": \"td\",\n \"value\": value,\n \"class\": \" \".join(cs),\n \"display_value\": formatter(value),\n \"is_visible\": (c not in hidden_columns)}\n # only add an id if the cell has a style\n if (self.cell_ids or\n not(len(ctx[r, c]) == 1 and ctx[r, c][0] == '')):\n row_dict[\"id\"] = \"_\".join(cs[1:])\n row_es.append(row_dict)\n props = []\n for x in ctx[r, c]:\n # have to handle empty styles like ['']\n if x.count(\":\"):\n props.append(x.split(\":\"))\n else:\n props.append(['', ''])\n cellstyle.append({'props': props,\n 'selector': \"row{row}_col{col}\"\n .format(row=r, col=c)})\n body.append(row_es)\n\n table_attr = self.table_attributes\n use_mathjax = get_option(\"display.html.use_mathjax\")\n if not use_mathjax:\n table_attr = table_attr or ''\n if 'class=\"' in table_attr:\n table_attr = table_attr.replace('class=\"',\n 'class=\"tex2jax_ignore ')\n else:\n table_attr += ' class=\"tex2jax_ignore\"'\n\n return dict(head=head, cellstyle=cellstyle, body=body, uuid=uuid,\n precision=precision, table_styles=table_styles,\n caption=caption, table_attributes=table_attr)", "response": "Translate the DataFrame in self. 
data into a dictionary of head body uuid cellstyle."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef format(self, formatter, subset=None):\n if subset is None:\n row_locs = range(len(self.data))\n col_locs = range(len(self.data.columns))\n else:\n subset = _non_reducing_slice(subset)\n if len(subset) == 1:\n subset = subset, self.data.columns\n\n sub_df = self.data.loc[subset]\n row_locs = self.data.index.get_indexer_for(sub_df.index)\n col_locs = self.data.columns.get_indexer_for(sub_df.columns)\n\n if is_dict_like(formatter):\n for col, col_formatter in formatter.items():\n # formatter must be callable, so '{}' are converted to lambdas\n col_formatter = _maybe_wrap_formatter(col_formatter)\n col_num = self.data.columns.get_indexer_for([col])[0]\n\n for row_num in row_locs:\n self._display_funcs[(row_num, col_num)] = col_formatter\n else:\n # single scalar to format all cells with\n locs = product(*(row_locs, col_locs))\n for i, j in locs:\n formatter = _maybe_wrap_formatter(formatter)\n self._display_funcs[(i, j)] = formatter\n return self", "response": "Formats the text display value of the entries of the set of entries in the set of entries in the set of entries in the set of entries."} {"instruction": "Can you write a function in Python 3 where it\nrenders the built up styles to HTML.", "response": "def render(self, **kwargs):\n \"\"\"\n Render the built up styles to HTML.\n\n Parameters\n ----------\n **kwargs\n Any additional keyword arguments are passed\n through to ``self.template.render``.\n This is useful when you need to provide\n additional variables for a custom template.\n\n .. versionadded:: 0.20\n\n Returns\n -------\n rendered : str\n The rendered HTML.\n\n Notes\n -----\n ``Styler`` objects have defined the ``_repr_html_`` method\n which automatically calls ``self.render()`` when it's the\n last item in a Notebook cell. 
When calling ``Styler.render()``\n directly, wrap the result in ``IPython.display.HTML`` to view\n the rendered HTML in the notebook.\n\n Pandas uses the following keys in render. Arguments passed\n in ``**kwargs`` take precedence, so think carefully if you want\n to override them:\n\n * head\n * cellstyle\n * body\n * uuid\n * precision\n * table_styles\n * caption\n * table_attributes\n \"\"\"\n self._compute()\n # TODO: namespace all the pandas keys\n d = self._translate()\n # filter out empty styles, every cell will have a class\n # but the list of props may just be [['', '']].\n # so we have the neested anys below\n trimmed = [x for x in d['cellstyle']\n if any(any(y) for y in x['props'])]\n d['cellstyle'] = trimmed\n d.update(kwargs)\n return self.template.render(**d)"} {"instruction": "Implement a function in Python 3 to\nupdate the state of the Styler.", "response": "def _update_ctx(self, attrs):\n \"\"\"\n Update the state of the Styler.\n\n Collects a mapping of {index_label: [': ']}.\n\n attrs : Series or DataFrame\n should contain strings of ': ;: '\n Whitespace shouldn't matter and the final trailing ';' shouldn't\n matter.\n \"\"\"\n for row_label, v in attrs.iterrows():\n for col_label, col in v.iteritems():\n i = self.index.get_indexer([row_label])[0]\n j = self.columns.get_indexer([col_label])[0]\n for pair in col.rstrip(\";\").split(\";\"):\n self.ctx[(i, j)].append(pair)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _compute(self):\n r = self\n for func, args, kwargs in self._todo:\n r = func(self)(*args, **kwargs)\n return r", "response": "Execute the style functions built up in self. 
_todo and return the result."} {"instruction": "Write a Python 3 script for\napplying a function column - wise row - wise or table - wise or entire DataFrame.", "response": "def apply(self, func, axis=0, subset=None, **kwargs):\n \"\"\"\n Apply a function column-wise, row-wise, or table-wise,\n updating the HTML representation with the result.\n\n Parameters\n ----------\n func : function\n ``func`` should take a Series or DataFrame (depending\n on ``axis``), and return an object with the same shape.\n Must return a DataFrame with identical index and\n column labels when ``axis=None``\n axis : {0 or 'index', 1 or 'columns', None}, default 0\n apply to each column (``axis=0`` or ``'index'``), to each row\n (``axis=1`` or ``'columns'``), or to the entire DataFrame at once\n with ``axis=None``.\n subset : IndexSlice\n a valid indexer to limit ``data`` to *before* applying the\n function. Consider using a pandas.IndexSlice\n kwargs : dict\n pass along to ``func``\n\n Returns\n -------\n self : Styler\n\n Notes\n -----\n The output shape of ``func`` should match the input, i.e. if\n ``x`` is the input row, column, or table (depending on ``axis``),\n then ``func(x).shape == x.shape`` should be true.\n\n This is similar to ``DataFrame.apply``, except that ``axis=None``\n applies the function to the entire DataFrame at once,\n rather than column-wise or row-wise.\n\n Examples\n --------\n >>> def highlight_max(x):\n ... 
return ['background-color: yellow' if v == x.max() else ''\n for v in x]\n ...\n >>> df = pd.DataFrame(np.random.randn(5, 2))\n >>> df.style.apply(highlight_max)\n \"\"\"\n self._todo.append((lambda instance: getattr(instance, '_apply'),\n (func, axis, subset), kwargs))\n return self"} {"instruction": "Explain what the following Python 3 code does\ndef applymap(self, func, subset=None, **kwargs):\n self._todo.append((lambda instance: getattr(instance, '_applymap'),\n (func, subset), kwargs))\n return self", "response": "Applies a function elementwise updating the HTML\n representation with the result."} {"instruction": "How would you code a function in Python 3 to\nreturn a new Styler with the values in value if cond evaluates to True otherwise other.", "response": "def where(self, cond, value, other=None, subset=None, **kwargs):\n \"\"\"\n Apply a function elementwise, updating the HTML\n representation with a style which is selected in\n accordance with the return value of a function.\n\n .. versionadded:: 0.21.0\n\n Parameters\n ----------\n cond : callable\n ``cond`` should take a scalar and return a boolean\n value : str\n applied when ``cond`` returns true\n other : str\n applied when ``cond`` returns false\n subset : IndexSlice\n a valid indexer to limit ``data`` to *before* applying the\n function. 
Consider using a pandas.IndexSlice\n kwargs : dict\n pass along to ``cond``\n\n Returns\n -------\n self : Styler\n\n See Also\n --------\n Styler.applymap\n \"\"\"\n\n if other is None:\n other = ''\n\n return self.applymap(lambda val: value if cond(val) else other,\n subset=subset, **kwargs)"} {"instruction": "Make a summary of the following Python 3 code\ndef hide_columns(self, subset):\n subset = _non_reducing_slice(subset)\n hidden_df = self.data.loc[subset]\n self.hidden_columns = self.columns.get_indexer_for(hidden_df.columns)\n return self", "response": "Hide columns from rendering."} {"instruction": "Create a Python 3 function for\nhighlighting the null color for missing values.", "response": "def highlight_null(self, null_color='red'):\n \"\"\"\n Shade the background ``null_color`` for missing values.\n\n Parameters\n ----------\n null_color : str\n\n Returns\n -------\n self : Styler\n \"\"\"\n self.applymap(self._highlight_null, null_color=null_color)\n return self"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef background_gradient(self, cmap='PuBu', low=0, high=0, axis=0,\n subset=None, text_color_threshold=0.408):\n \"\"\"\n Color the background in a gradient according to\n the data in each column (optionally row).\n\n Requires matplotlib.\n\n Parameters\n ----------\n cmap : str or colormap\n matplotlib colormap\n low, high : float\n compress the range by these values.\n axis : {0 or 'index', 1 or 'columns', None}, default 0\n apply to each column (``axis=0`` or ``'index'``), to each row\n (``axis=1`` or ``'columns'``), or to the entire DataFrame at once\n with ``axis=None``.\n subset : IndexSlice\n a valid slice for ``data`` to limit the style application to.\n text_color_threshold : float or int\n luminance threshold for determining text color. Facilitates text\n visibility across varying background colors. From 0 to 1.\n 0 = all text is dark colored, 1 = all text is light colored.\n\n .. 
versionadded:: 0.24.0\n\n Returns\n -------\n self : Styler\n\n Raises\n ------\n ValueError\n If ``text_color_threshold`` is not a value from 0 to 1.\n\n Notes\n -----\n Set ``text_color_threshold`` or tune ``low`` and ``high`` to keep the\n text legible by not using the entire range of the color map. The range\n of the data is extended by ``low * (x.max() - x.min())`` and ``high *\n (x.max() - x.min())`` before normalizing.\n \"\"\"\n subset = _maybe_numeric_slice(self.data, subset)\n subset = _non_reducing_slice(subset)\n self.apply(self._background_gradient, cmap=cmap, subset=subset,\n axis=axis, low=low, high=high,\n text_color_threshold=text_color_threshold)\n return self", "response": "Color the background in a gradient according to the given color map."} {"instruction": "Given the following Python 3 function, write the documentation\ndef _background_gradient(s, cmap='PuBu', low=0, high=0,\n text_color_threshold=0.408):\n \"\"\"\n Color background in a range according to the data.\n \"\"\"\n if (not isinstance(text_color_threshold, (float, int)) or\n not 0 <= text_color_threshold <= 1):\n msg = \"`text_color_threshold` must be a value from 0 to 1.\"\n raise ValueError(msg)\n\n with _mpl(Styler.background_gradient) as (plt, colors):\n smin = s.values.min()\n smax = s.values.max()\n rng = smax - smin\n # extend lower / upper bounds, compresses color range\n norm = colors.Normalize(smin - (rng * low), smax + (rng * high))\n # matplotlib colors.Normalize modifies inplace?\n # https://github.com/matplotlib/matplotlib/issues/5427\n rgbas = plt.cm.get_cmap(cmap)(norm(s.values))\n\n def relative_luminance(rgba):\n \"\"\"\n Calculate relative luminance of a color.\n\n The calculation adheres to the W3C standards\n (https://www.w3.org/WAI/GL/wiki/Relative_luminance)\n\n Parameters\n ----------\n color : rgb or rgba tuple\n\n Returns\n -------\n float\n The relative luminance as a value from 0 to 1\n \"\"\"\n r, g, b = (\n x / 12.92 if x <= 0.03928 else ((x + 0.055) / 
1.055 ** 2.4)\n for x in rgba[:3]\n )\n return 0.2126 * r + 0.7152 * g + 0.0722 * b\n\n def css(rgba):\n dark = relative_luminance(rgba) < text_color_threshold\n text_color = '#f1f1f1' if dark else '#000000'\n return 'background-color: {b};color: {c};'.format(\n b=colors.rgb2hex(rgba), c=text_color\n )\n\n if s.ndim == 1:\n return [css(rgba) for rgba in rgbas]\n else:\n return pd.DataFrame(\n [[css(rgba) for rgba in row] for row in rgbas],\n index=s.index, columns=s.columns\n )", "response": "Returns a string that can be used to render a background gradient for a specific color range."} {"instruction": "Can you implement a function in Python 3 that\ndraws a bar chart in dataframe cells.", "response": "def _bar(s, align, colors, width=100, vmin=None, vmax=None):\n \"\"\"\n Draw bar chart in dataframe cells.\n \"\"\"\n # Get input value range.\n smin = s.min() if vmin is None else vmin\n if isinstance(smin, ABCSeries):\n smin = smin.min()\n smax = s.max() if vmax is None else vmax\n if isinstance(smax, ABCSeries):\n smax = smax.max()\n if align == 'mid':\n smin = min(0, smin)\n smax = max(0, smax)\n elif align == 'zero':\n # For \"zero\" mode, we want the range to be symmetrical around zero.\n smax = max(abs(smin), abs(smax))\n smin = -smax\n # Transform to percent-range of linear-gradient\n normed = width * (s.values - smin) / (smax - smin + 1e-12)\n zero = -width * smin / (smax - smin + 1e-12)\n\n def css_bar(start, end, color):\n \"\"\"\n Generate CSS code to draw a bar from start to end.\n \"\"\"\n css = 'width: 10em; height: 80%;'\n if end > start:\n css += 'background: linear-gradient(90deg,'\n if start > 0:\n css += ' transparent {s:.1f}%, {c} {s:.1f}%, '.format(\n s=start, c=color\n )\n css += '{c} {e:.1f}%, transparent {e:.1f}%)'.format(\n e=min(end, width), c=color,\n )\n return css\n\n def css(x):\n if pd.isna(x):\n return ''\n\n # avoid deprecated indexing `colors[x > zero]`\n color = colors[1] if x > zero else colors[0]\n\n if align == 'left':\n return 
css_bar(0, x, color)\n else:\n return css_bar(min(x, zero), max(x, zero), color)\n\n if s.ndim == 1:\n return [css(x) for x in normed]\n else:\n return pd.DataFrame(\n [[css(x) for x in row] for row in normed],\n index=s.index, columns=s.columns\n )"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef bar(self, subset=None, axis=0, color='#d65f5f', width=100,\n align='left', vmin=None, vmax=None):\n \"\"\"\n Draw bar chart in the cell backgrounds.\n\n Parameters\n ----------\n subset : IndexSlice, optional\n A valid slice for `data` to limit the style application to.\n axis : {0 or 'index', 1 or 'columns', None}, default 0\n apply to each column (``axis=0`` or ``'index'``), to each row\n (``axis=1`` or ``'columns'``), or to the entire DataFrame at once\n with ``axis=None``.\n color : str or 2-tuple/list\n If a str is passed, the color is the same for both\n negative and positive numbers. If 2-tuple/list is used, the\n first element is the color_negative and the second is the\n color_positive (eg: ['#d65f5f', '#5fba7d']).\n width : float, default 100\n A number between 0 or 100. The largest value will cover `width`\n percent of the cell's width.\n align : {'left', 'zero',' mid'}, default 'left'\n How to align the bars with the cells.\n\n - 'left' : the min value starts at the left of the cell.\n - 'zero' : a value of zero is located at the center of the cell.\n - 'mid' : the center of the cell is at (max-min)/2, or\n if values are all negative (positive) the zero is aligned\n at the right (left) of the cell.\n\n .. versionadded:: 0.20.0\n\n vmin : float, optional\n Minimum bar value, defining the left hand limit\n of the bar drawing range, lower values are clipped to `vmin`.\n When None (default): the minimum value of the data will be used.\n\n .. 
versionadded:: 0.24.0\n\n vmax : float, optional\n Maximum bar value, defining the right hand limit\n of the bar drawing range, higher values are clipped to `vmax`.\n When None (default): the maximum value of the data will be used.\n\n .. versionadded:: 0.24.0\n\n Returns\n -------\n self : Styler\n \"\"\"\n if align not in ('left', 'zero', 'mid'):\n raise ValueError(\"`align` must be one of {'left', 'zero',' mid'}\")\n\n if not (is_list_like(color)):\n color = [color, color]\n elif len(color) == 1:\n color = [color[0], color[0]]\n elif len(color) > 2:\n raise ValueError(\"`color` must be string or a list-like\"\n \" of length 2: [`color_neg`, `color_pos`]\"\n \" (eg: color=['#d65f5f', '#5fba7d'])\")\n\n subset = _maybe_numeric_slice(self.data, subset)\n subset = _non_reducing_slice(subset)\n self.apply(self._bar, subset=subset, axis=axis,\n align=align, colors=color, width=width,\n vmin=vmin, vmax=vmax)\n\n return self", "response": "Draw a bar chart in the cell backgrounds."} {"instruction": "Implement a function in Python 3 to\nhighlight the maximum value of the log entry in the specified subset.", "response": "def highlight_max(self, subset=None, color='yellow', axis=0):\n \"\"\"\n Highlight the maximum by shading the background.\n\n Parameters\n ----------\n subset : IndexSlice, default None\n a valid slice for ``data`` to limit the style application to.\n color : str, default 'yellow'\n axis : {0 or 'index', 1 or 'columns', None}, default 0\n apply to each column (``axis=0`` or ``'index'``), to each row\n (``axis=1`` or ``'columns'``), or to the entire DataFrame at once\n with ``axis=None``.\n\n Returns\n -------\n self : Styler\n \"\"\"\n return self._highlight_handler(subset=subset, color=color, axis=axis,\n max_=True)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef highlight_min(self, subset=None, color='yellow', axis=0):\n return self._highlight_handler(subset=subset, color=color, axis=axis,\n max_=False)", 
"response": "Highlight the minimum by shading the background."} {"instruction": "Write a Python 3 function for\nhighlighting the min or max in a Series or DataFrame.", "response": "def _highlight_extrema(data, color='yellow', max_=True):\n \"\"\"\n Highlight the min or max in a Series or DataFrame.\n \"\"\"\n attr = 'background-color: {0}'.format(color)\n if data.ndim == 1: # Series from .apply\n if max_:\n extrema = data == data.max()\n else:\n extrema = data == data.min()\n return [attr if v else '' for v in extrema]\n else: # DataFrame from .tee\n if max_:\n extrema = data == data.max().max()\n else:\n extrema = data == data.min().min()\n return pd.DataFrame(np.where(extrema, attr, ''),\n index=data.index, columns=data.columns)"} {"instruction": "How would you explain what the following Python 3 function does\ndef register(self, dtype):\n if not issubclass(dtype, (PandasExtensionDtype, ExtensionDtype)):\n raise ValueError(\"can only register pandas extension dtypes\")\n\n self.dtypes.append(dtype)", "response": "Register a new dtype for this object."} {"instruction": "Can you write a function in Python 3 where it\nreturns the first matching dtype or None if no matching dtype is found.", "response": "def find(self, dtype):\n \"\"\"\n Parameters\n ----------\n dtype : PandasExtensionDtype or string\n\n Returns\n -------\n return the first matching dtype, otherwise return None\n \"\"\"\n if not isinstance(dtype, str):\n dtype_type = dtype\n if not isinstance(dtype, type):\n dtype_type = type(dtype)\n if issubclass(dtype_type, ExtensionDtype):\n return dtype\n\n return None\n\n for dtype_type in self.dtypes:\n try:\n return dtype_type.construct_from_string(dtype)\n except TypeError:\n pass\n\n return None"} {"instruction": "Make a summary of the following Python 3 code\ndef np_datetime64_compat(s, *args, **kwargs):\n s = tz_replacer(s)\n return np.datetime64(s, *args, **kwargs)", "response": "provide compat for construction of strings to numpy datetime64 s with\n tz 
- changes in 1. 11 that make it be a deprecation\n show a deprecation\n warning when need to pass a deprecation\n "} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef np_array_datetime64_compat(arr, *args, **kwargs):\n # is_list_like\n if (hasattr(arr, '__iter__') and not isinstance(arr, (str, bytes))):\n arr = [tz_replacer(s) for s in arr]\n else:\n arr = tz_replacer(arr)\n\n return np.array(arr, *args, **kwargs)", "response": "provide compat for construction of an array of strings to a\n np. array"} {"instruction": "Make a summary of the following Python 3 code\ndef _assert_safe_casting(cls, data, subarr):\n if not issubclass(data.dtype.type, np.signedinteger):\n if not np.array_equal(data, subarr):\n raise TypeError('Unsafe NumPy casting, you must '\n 'explicitly cast')", "response": "Ensure incoming data can be represented as ints."} {"instruction": "How would you code a function in Python 3 to\nget the value of a key from a series", "response": "def get_value(self, series, key):\n \"\"\" we always want to get an index value, never a value \"\"\"\n if not is_scalar(key):\n raise InvalidIndexError\n\n k = com.values_from_object(key)\n loc = self.get_loc(k)\n new_values = com.values_from_object(series)[loc]\n\n return new_values"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef equals(self, other):\n if self is other:\n return True\n\n if not isinstance(other, Index):\n return False\n\n # need to compare nans locations and make sure that they are the same\n # since nans don't compare equal this is a bit tricky\n try:\n if not isinstance(other, Float64Index):\n other = self._constructor(other)\n if (not is_dtype_equal(self.dtype, other.dtype) or\n self.shape != other.shape):\n return False\n left, right = self._ndarray_values, other._ndarray_values\n return ((left == right) | (self._isnan & other._isnan)).all()\n except (TypeError, ValueError):\n return False", "response": 
"Determines if two Index objects contain the same elements."} {"instruction": "Write a Python 3 script to\nensure that the given string is decoded.", "response": "def _ensure_decoded(s):\n \"\"\" if we have bytes, decode them to unicode \"\"\"\n if isinstance(s, np.bytes_):\n s = s.decode('UTF-8')\n return s"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _ensure_term(where, scope_level):\n\n # only consider list/tuple here as an ndarray is automatically a coordinate\n # list\n level = scope_level + 1\n if isinstance(where, (list, tuple)):\n wlist = []\n for w in filter(lambda x: x is not None, where):\n if not maybe_expression(w):\n wlist.append(w)\n else:\n wlist.append(Term(w, scope_level=level))\n where = wlist\n elif maybe_expression(where):\n where = Term(where, scope_level=level)\n return where", "response": "ensure that the where is a Term or a list of Term objects"} {"instruction": "Create a Python 3 function for\nwriting a single entry to the HDF file.", "response": "def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None,\n append=None, **kwargs):\n \"\"\" store this object, close it if we opened it \"\"\"\n\n if append:\n f = lambda store: store.append(key, value, **kwargs)\n else:\n f = lambda store: store.put(key, value, **kwargs)\n\n path_or_buf = _stringify_path(path_or_buf)\n if isinstance(path_or_buf, str):\n with HDFStore(path_or_buf, mode=mode, complevel=complevel,\n complib=complib) as store:\n f(store)\n else:\n f(path_or_buf)"} {"instruction": "Can you write a function in Python 3 where it\nreads from the HDFStore and return a DataFrame.", "response": "def read_hdf(path_or_buf, key=None, mode='r', **kwargs):\n \"\"\"\n Read from the store, close it if we opened it.\n\n Retrieve pandas object stored in file, optionally based on where\n criteria\n\n Parameters\n ----------\n path_or_buf : string, buffer or path object\n Path to the file to open, or an open :class:`pandas.HDFStore` object.\n 
Supports any object implementing the ``__fspath__`` protocol.\n This includes :class:`pathlib.Path` and py._path.local.LocalPath\n objects.\n\n .. versionadded:: 0.19.0 support for pathlib, py.path.\n .. versionadded:: 0.21.0 support for __fspath__ protocol.\n\n key : object, optional\n The group identifier in the store. Can be omitted if the HDF file\n contains a single pandas object.\n mode : {'r', 'r+', 'a'}, optional\n Mode to use when opening the file. Ignored if path_or_buf is a\n :class:`pandas.HDFStore`. Default is 'r'.\n where : list, optional\n A list of Term (or convertible) objects.\n start : int, optional\n Row number to start selection.\n stop : int, optional\n Row number to stop selection.\n columns : list, optional\n A list of columns names to return.\n iterator : bool, optional\n Return an iterator object.\n chunksize : int, optional\n Number of rows to include in an iteration when using an iterator.\n errors : str, default 'strict'\n Specifies how encoding and decoding errors are to be handled.\n See the errors argument for :func:`open` for a full list\n of options.\n **kwargs\n Additional keyword arguments passed to HDFStore.\n\n Returns\n -------\n item : object\n The selected object. Return type depends on the object stored.\n\n See Also\n --------\n DataFrame.to_hdf : Write a HDF file from a DataFrame.\n HDFStore : Low-level access to HDF files.\n\n Examples\n --------\n >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z'])\n >>> df.to_hdf('./store.h5', 'data')\n >>> reread = pd.read_hdf('./store.h5')\n \"\"\"\n\n if mode not in ['r', 'r+', 'a']:\n raise ValueError('mode {0} is not allowed while performing a read. 
'\n 'Allowed modes are r, r+ and a.'.format(mode))\n # grab the scope\n if 'where' in kwargs:\n kwargs['where'] = _ensure_term(kwargs['where'], scope_level=1)\n\n if isinstance(path_or_buf, HDFStore):\n if not path_or_buf.is_open:\n raise IOError('The HDFStore must be open for reading.')\n\n store = path_or_buf\n auto_close = False\n else:\n path_or_buf = _stringify_path(path_or_buf)\n if not isinstance(path_or_buf, str):\n raise NotImplementedError('Support for generic buffers has not '\n 'been implemented.')\n try:\n exists = os.path.exists(path_or_buf)\n\n # if filepath is too long\n except (TypeError, ValueError):\n exists = False\n\n if not exists:\n raise FileNotFoundError(\n 'File {path} does not exist'.format(path=path_or_buf))\n\n store = HDFStore(path_or_buf, mode=mode, **kwargs)\n # can't auto open/close if we are using an iterator\n # so delegate to the iterator\n auto_close = True\n\n try:\n if key is None:\n groups = store.groups()\n if len(groups) == 0:\n raise ValueError('No dataset in HDF5 file.')\n candidate_only_group = groups[0]\n\n # For the HDF file to have only one dataset, all other groups\n # should then be metadata groups for that candidate group. 
(This\n # assumes that the groups() method enumerates parent groups\n # before their children.)\n for group_to_check in groups[1:]:\n if not _is_metadata_of(group_to_check, candidate_only_group):\n raise ValueError('key must be provided when HDF5 file '\n 'contains multiple datasets.')\n key = candidate_only_group._v_pathname\n return store.select(key, auto_close=auto_close, **kwargs)\n except (ValueError, TypeError, KeyError):\n # if there is an error, close the store\n try:\n store.close()\n except AttributeError:\n pass\n\n raise"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _is_metadata_of(group, parent_group):\n if group._v_depth <= parent_group._v_depth:\n return False\n\n current = group\n while current._v_depth > 1:\n parent = current._v_parent\n if parent == parent_group and current._v_name == 'meta':\n return True\n current = current._v_parent\n return False", "response": "Check if a given group is a metadata group for a given parent_group."} {"instruction": "How would you code a function in Python 3 to\nget the info for this name", "response": "def _get_info(info, name):\n \"\"\" get/create the info for this name \"\"\"\n try:\n idx = info[name]\n except KeyError:\n idx = info[name] = dict()\n return idx"} {"instruction": "Write a Python 3 function for\nreturning a zone for a tz - aware type", "response": "def _get_tz(tz):\n \"\"\" for a tz-aware type, return an encoded zone \"\"\"\n zone = timezones.get_timezone(tz)\n if zone is None:\n zone = tz.utcoffset().total_seconds()\n return zone"} {"instruction": "Write a Python 3 function for\nsetting the timezone of the object in the result.", "response": "def _set_tz(values, tz, preserve_UTC=False, coerce=False):\n \"\"\"\n coerce the values to a DatetimeIndex if tz is set\n preserve the input shape if possible\n\n Parameters\n ----------\n values : ndarray\n tz : string/pickled tz object\n preserve_UTC : boolean,\n preserve the UTC of the result\n coerce : if we do not 
have a passed timezone, coerce to M8[ns] ndarray\n \"\"\"\n if tz is not None:\n name = getattr(values, 'name', None)\n values = values.ravel()\n tz = timezones.get_timezone(_ensure_decoded(tz))\n values = DatetimeIndex(values, name=name)\n if values.tz is None:\n values = values.tz_localize('UTC').tz_convert(tz)\n if preserve_UTC:\n if tz == 'UTC':\n values = list(values)\n elif coerce:\n values = np.asarray(values, dtype='M8[ns]')\n\n return values"} {"instruction": "Write a Python 3 function for\nconverting a string - like array to a fixed - length string - like array.", "response": "def _convert_string_array(data, encoding, errors, itemsize=None):\n \"\"\"\n we take a string-like that is object dtype and coerce to a fixed size\n string type\n\n Parameters\n ----------\n data : a numpy array of object dtype\n encoding : None or string-encoding\n errors : handler for encoding errors\n itemsize : integer, optional, defaults to the max length of the strings\n\n Returns\n -------\n data in a fixed-length string dtype, encoded to bytes if needed\n \"\"\"\n\n # encode if needed\n if encoding is not None and len(data):\n data = Series(data.ravel()).str.encode(\n encoding, errors).values.reshape(data.shape)\n\n # create the sized dtype\n if itemsize is None:\n ensured = ensure_object(data.ravel())\n itemsize = max(1, libwriters.max_len_string_array(ensured))\n\n data = np.asarray(data, dtype=\"S{size}\".format(size=itemsize))\n return data"} {"instruction": "How would you code a function in Python 3 to\nconvert a fixed length string array into a single object array.", "response": "def _unconvert_string_array(data, nan_rep=None, encoding=None,\n errors='strict'):\n \"\"\"\n inverse of _convert_string_array\n\n Parameters\n ----------\n data : fixed length string dtyped array\n nan_rep : the storage repr of NaN, optional\n encoding : the encoding of the data, optional\n errors : handler for encoding errors, default 'strict'\n\n Returns\n -------\n an object array of the 
decoded data\n\n \"\"\"\n shape = data.shape\n data = np.asarray(data.ravel(), dtype=object)\n\n # guard against a None encoding (because of a legacy\n # where the passed encoding is actually None)\n encoding = _ensure_encoding(encoding)\n if encoding is not None and len(data):\n\n itemsize = libwriters.max_len_string_array(ensure_object(data))\n dtype = \"U{0}\".format(itemsize)\n\n if isinstance(data[0], bytes):\n data = Series(data).str.decode(encoding, errors=errors).values\n else:\n data = data.astype(dtype, copy=False).astype(object, copy=False)\n\n if nan_rep is None:\n nan_rep = 'nan'\n\n data = libwriters.string_array_replace_from_nan_rep(data, nan_rep)\n return data.reshape(shape)"} {"instruction": "Implement a function in Python 3 to\nopen the file in the specified mode.", "response": "def open(self, mode='a', **kwargs):\n \"\"\"\n Open the file in the specified mode\n\n Parameters\n ----------\n mode : {'a', 'w', 'r', 'r+'}, default 'a'\n See HDFStore docstring or tables.open_file for info about modes\n \"\"\"\n tables = _tables()\n\n if self._mode != mode:\n\n # if we are changing a write mode to read, ok\n if self._mode in ['a', 'w'] and mode in ['r', 'r+']:\n pass\n elif mode in ['w']:\n\n # this would truncate, raise here\n if self.is_open:\n raise PossibleDataLossError(\n \"Re-opening the file [{0}] with mode [{1}] \"\n \"will delete the current file!\"\n .format(self._path, self._mode)\n )\n\n self._mode = mode\n\n # close and reopen the handle\n if self.is_open:\n self.close()\n\n if self._complevel and self._complevel > 0:\n self._filters = _tables().Filters(self._complevel, self._complib,\n fletcher32=self._fletcher32)\n\n try:\n self._handle = tables.open_file(self._path, self._mode, **kwargs)\n except (IOError) as e: # pragma: no cover\n if 'can not be written' in str(e):\n print(\n 'Opening {path} in read-only mode'.format(path=self._path))\n self._handle = tables.open_file(self._path, 'r', **kwargs)\n else:\n raise\n\n except (ValueError) 
as e:\n\n # trap PyTables >= 3.1 FILE_OPEN_POLICY exception\n # to provide an updated message\n if 'FILE_OPEN_POLICY' in str(e):\n e = ValueError(\n \"PyTables [{version}] no longer supports opening multiple \"\n \"files\\n\"\n \"even in read-only mode on this HDF5 version \"\n \"[{hdf_version}]. You can accept this\\n\"\n \"and not open the same file multiple times at once,\\n\"\n \"upgrade the HDF5 version, or downgrade to PyTables 3.0.0 \"\n \"which allows\\n\"\n \"files to be opened multiple times at once\\n\"\n .format(version=tables.__version__,\n hdf_version=tables.get_hdf5_version()))\n\n raise e\n\n except (Exception) as e:\n\n # trying to read from a non-existent file causes an error which\n # is not part of IOError, make it one\n if self._mode == 'r' and 'Unable to open/create file' in str(e):\n raise IOError(str(e))\n raise"} {"instruction": "Make a summary of the following Python 3 code\ndef flush(self, fsync=False):\n if self._handle is not None:\n self._handle.flush()\n if fsync:\n try:\n os.fsync(self._handle.fileno())\n except OSError:\n pass", "response": "Flushes all buffered modifications to disk."} {"instruction": "Create a Python 3 function to\nretrieve pandas object stored in file with the specified key.", "response": "def get(self, key):\n \"\"\"\n Retrieve pandas object stored in file\n\n Parameters\n ----------\n key : object\n\n Returns\n -------\n obj : same type as object stored in file\n \"\"\"\n group = self.get_node(key)\n if group is None:\n raise KeyError('No object named {key} in the file'.format(key=key))\n return self._read_group(group)"} {"instruction": "Write a Python 3 function that can\nretrieve pandas object stored in file optionally based on where applet.", "response": "def select(self, key, where=None, start=None, stop=None, columns=None,\n iterator=False, chunksize=None, auto_close=False, **kwargs):\n \"\"\"\n Retrieve pandas object stored in file, optionally based on where\n criteria\n\n Parameters\n ----------\n key : 
object\n where : list of Term (or convertible) objects, optional\n start : integer (defaults to None), row number to start selection\n stop : integer (defaults to None), row number to stop selection\n columns : a list of columns that if not None, will limit the return\n columns\n iterator : boolean, return an iterator, default False\n chunksize : nrows to include in iteration, return an iterator\n auto_close : boolean, should automatically close the store when\n finished, default is False\n\n Returns\n -------\n The selected object\n \"\"\"\n group = self.get_node(key)\n if group is None:\n raise KeyError('No object named {key} in the file'.format(key=key))\n\n # create the storer and axes\n where = _ensure_term(where, scope_level=1)\n s = self._create_storer(group)\n s.infer_axes()\n\n # function to call on iteration\n def func(_start, _stop, _where):\n return s.read(start=_start, stop=_stop,\n where=_where,\n columns=columns)\n\n # create the iterator\n it = TableIterator(self, s, func, where=where, nrows=s.nrows,\n start=start, stop=stop, iterator=iterator,\n chunksize=chunksize, auto_close=auto_close)\n\n return it.get_result()"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef select_as_coordinates(\n self, key, where=None, start=None, stop=None, **kwargs):\n \"\"\"\n return the selection as an Index\n\n Parameters\n ----------\n key : object\n where : list of Term (or convertible) objects, optional\n start : integer (defaults to None), row number to start selection\n stop : integer (defaults to None), row number to stop selection\n \"\"\"\n where = _ensure_term(where, scope_level=1)\n return self.get_storer(key).read_coordinates(where=where, start=start,\n stop=stop, **kwargs)", "response": "return the selection as an Index\nTaxonomy"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef select_column(self, key, column, **kwargs):\n return self.get_storer(key).read_column(column=column, 
**kwargs)", "response": "This is generally only useful to use to\n . select_column"} {"instruction": "Write a Python 3 function for\nretrieving pandas objects from multiple tables.", "response": "def select_as_multiple(self, keys, where=None, selector=None, columns=None,\n start=None, stop=None, iterator=False,\n chunksize=None, auto_close=False, **kwargs):\n \"\"\" Retrieve pandas objects from multiple tables\n\n Parameters\n ----------\n keys : a list of the tables\n selector : the table to apply the where criteria (defaults to keys[0]\n if not supplied)\n columns : the columns I want back\n start : integer (defaults to None), row number to start selection\n stop : integer (defaults to None), row number to stop selection\n iterator : boolean, return an iterator, default False\n chunksize : nrows to include in iteration, return an iterator\n\n Exceptions\n ----------\n raises KeyError if keys or selector is not found or keys is empty\n raises TypeError if keys is not a list or tuple\n raises ValueError if the tables are not ALL THE SAME DIMENSIONS\n \"\"\"\n\n # default to single select\n where = _ensure_term(where, scope_level=1)\n if isinstance(keys, (list, tuple)) and len(keys) == 1:\n keys = keys[0]\n if isinstance(keys, str):\n return self.select(key=keys, where=where, columns=columns,\n start=start, stop=stop, iterator=iterator,\n chunksize=chunksize, **kwargs)\n\n if not isinstance(keys, (list, tuple)):\n raise TypeError(\"keys must be a list/tuple\")\n\n if not len(keys):\n raise ValueError(\"keys must have a non-zero length\")\n\n if selector is None:\n selector = keys[0]\n\n # collect the tables\n tbls = [self.get_storer(k) for k in keys]\n s = self.get_storer(selector)\n\n # validate rows\n nrows = None\n for t, k in itertools.chain([(s, selector)], zip(tbls, keys)):\n if t is None:\n raise KeyError(\"Invalid table [{key}]\".format(key=k))\n if not t.is_table:\n raise TypeError(\n \"object [{obj}] is not a table, and cannot be used in all \"\n \"select 
as multiple\".format(obj=t.pathname)\n )\n\n if nrows is None:\n nrows = t.nrows\n elif t.nrows != nrows:\n raise ValueError(\n \"all tables must have exactly the same nrows!\")\n\n # axis is the concentation axes\n axis = list({t.non_index_axes[0][0] for t in tbls})[0]\n\n def func(_start, _stop, _where):\n\n # retrieve the objs, _where is always passed as a set of\n # coordinates here\n objs = [t.read(where=_where, columns=columns, start=_start,\n stop=_stop, **kwargs) for t in tbls]\n\n # concat and return\n return concat(objs, axis=axis,\n verify_integrity=False)._consolidate()\n\n # create the iterator\n it = TableIterator(self, s, func, where=where, nrows=nrows,\n start=start, stop=stop, iterator=iterator,\n chunksize=chunksize, auto_close=auto_close)\n\n return it.get_result(coordinates=True)"} {"instruction": "Implement a Python 3 function for\nstoring object in HDFStore with optional format", "response": "def put(self, key, value, format=None, append=False, **kwargs):\n \"\"\"\n Store object in HDFStore\n\n Parameters\n ----------\n key : object\n value : {Series, DataFrame}\n format : 'fixed(f)|table(t)', default is 'fixed'\n fixed(f) : Fixed format\n Fast writing/reading. Not-appendable, nor searchable\n table(t) : Table format\n Write as a PyTables Table structure which may perform\n worse but allow more flexible operations like searching\n / selecting subsets of the data\n append : boolean, default False\n This will force Table format, append the input data to the\n existing.\n data_columns : list of columns to create as data columns, or True to\n use all columns. 
See\n `here `__ # noqa\n encoding : default None, provide an encoding for strings\n dropna : boolean, default False, do not write an ALL nan row to\n the store settable by the option 'io.hdf.dropna_table'\n \"\"\"\n if format is None:\n format = get_option(\"io.hdf.default_format\") or 'fixed'\n kwargs = self._validate_format(format, kwargs)\n self._write_to_group(key, value, append=append, **kwargs)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef remove(self, key, where=None, start=None, stop=None):\n where = _ensure_term(where, scope_level=1)\n try:\n s = self.get_storer(key)\n except KeyError:\n # the key is not a valid store, re-raising KeyError\n raise\n except Exception:\n\n if where is not None:\n raise ValueError(\n \"trying to remove a node with a non-None where clause!\")\n\n # we are actually trying to remove a node (with children)\n s = self.get_node(key)\n if s is not None:\n s._f_remove(recursive=True)\n return None\n\n # remove the node\n if com._all_none(where, start, stop):\n s.group._f_remove(recursive=True)\n\n # delete from the table\n else:\n if not s.is_table:\n raise ValueError(\n 'can only remove with where on objects written as tables')\n return s.delete(where=where, start=start, stop=stop)", "response": "Remove pandas object partially by specifying the where condition"} {"instruction": "Can you create a Python 3 function that\nappends a value to a table in file.", "response": "def append(self, key, value, format=None, append=True, columns=None,\n dropna=None, **kwargs):\n \"\"\"\n Append to Table in file. 
Node must already exist and be Table\n format.\n\n Parameters\n ----------\n key : object\n value : {Series, DataFrame}\n format : 'table' is the default\n table(t) : table format\n Write as a PyTables Table structure which may perform\n worse but allow more flexible operations like searching\n / selecting subsets of the data\n append : boolean, default True, append the input data to the\n existing\n data_columns : list of columns, or True, default None\n List of columns to create as indexed data columns for on-disk\n queries, or True to use all columns. By default only the axes\n of the object are indexed. See `here\n `__.\n min_itemsize : dict of columns that specify minimum string sizes\n nan_rep : string to use as string nan represenation\n chunksize : size to chunk the writing\n expectedrows : expected TOTAL row size of this table\n encoding : default None, provide an encoding for strings\n dropna : boolean, default False, do not write an ALL nan row to\n the store settable by the option 'io.hdf.dropna_table'\n\n Notes\n -----\n Does *not* check if data being appended overlaps with existing\n data in the table, so be careful\n \"\"\"\n if columns is not None:\n raise TypeError(\"columns is not a supported keyword in append, \"\n \"try data_columns\")\n\n if dropna is None:\n dropna = get_option(\"io.hdf.dropna_table\")\n if format is None:\n format = get_option(\"io.hdf.default_format\") or 'table'\n kwargs = self._validate_format(format, kwargs)\n self._write_to_group(key, value, append=append, dropna=dropna,\n **kwargs)"} {"instruction": "Implement a Python 3 function for\nappending a value to a set of tables in a multi - level manner.", "response": "def append_to_multiple(self, d, value, selector, data_columns=None,\n axes=None, dropna=False, **kwargs):\n \"\"\"\n Append to multiple tables\n\n Parameters\n ----------\n d : a dict of table_name to table_columns, None is acceptable as the\n values of one node (this will get all the remaining columns)\n value 
: a pandas object\n selector : a string that designates the indexable table; all of its\n columns will be designed as data_columns, unless data_columns is\n passed, in which case these are used\n data_columns : list of columns to create as data columns, or True to\n use all columns\n dropna : if evaluates to True, drop rows from all tables if any single\n row in each table has all NaN. Default False.\n\n Notes\n -----\n axes parameter is currently not accepted\n\n \"\"\"\n if axes is not None:\n raise TypeError(\"axes is currently not accepted as a parameter to\"\n \" append_to_multiple; you can create the \"\n \"tables independently instead\")\n\n if not isinstance(d, dict):\n raise ValueError(\n \"append_to_multiple must have a dictionary specified as the \"\n \"way to split the value\"\n )\n\n if selector not in d:\n raise ValueError(\n \"append_to_multiple requires a selector that is in passed dict\"\n )\n\n # figure out the splitting axis (the non_index_axis)\n axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0]\n\n # figure out how to split the value\n remain_key = None\n remain_values = []\n for k, v in d.items():\n if v is None:\n if remain_key is not None:\n raise ValueError(\n \"append_to_multiple can only have one value in d that \"\n \"is None\"\n )\n remain_key = k\n else:\n remain_values.extend(v)\n if remain_key is not None:\n ordered = value.axes[axis]\n ordd = ordered.difference(Index(remain_values))\n ordd = sorted(ordered.get_indexer(ordd))\n d[remain_key] = ordered.take(ordd)\n\n # data_columns\n if data_columns is None:\n data_columns = d[selector]\n\n # ensure rows are synchronized across the tables\n if dropna:\n idxs = (value[cols].dropna(how='all').index for cols in d.values())\n valid_index = next(idxs)\n for index in idxs:\n valid_index = valid_index.intersection(index)\n value = value.loc[valid_index]\n\n # append\n for k, v in d.items():\n dc = data_columns if k == selector else None\n\n # compute the val\n val = 
value.reindex(v, axis=axis)\n\n self.append(k, val, data_columns=dc, **kwargs)"} {"instruction": "Write a Python 3 function for\ncreating a pytables index on the table with the given key.", "response": "def create_table_index(self, key, **kwargs):\n \"\"\" Create a pytables index on the table\n Parameters\n ----------\n key : object (the node to index)\n\n Exceptions\n ----------\n raises if the node is not a table\n\n \"\"\"\n\n # version requirements\n _tables()\n s = self.get_storer(key)\n if s is None:\n return\n\n if not s.is_table:\n raise TypeError(\n \"cannot create table index on a Fixed format store\")\n s.create_index(**kwargs)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef groups(self):\n _tables()\n self._check_if_open()\n return [\n g for g in self._handle.walk_groups()\n if (not isinstance(g, _table_mod.link.Link) and\n (getattr(g._v_attrs, 'pandas_type', None) or\n getattr(g, 'table', None) or\n (isinstance(g, _table_mod.table.Table) and\n g._v_name != 'table')))\n ]", "response": "return a list of all the top - level nodes that are not themselves a\n pandas storage object"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef walk(self, where=\"/\"):\n _tables()\n self._check_if_open()\n for g in self._handle.walk_groups(where):\n if getattr(g._v_attrs, 'pandas_type', None) is not None:\n continue\n\n groups = []\n leaves = []\n for child in g._v_children.values():\n pandas_type = getattr(child._v_attrs, 'pandas_type', None)\n if pandas_type is None:\n if isinstance(child, _table_mod.group.Group):\n groups.append(child._v_name)\n else:\n leaves.append(child._v_name)\n\n yield (g._v_pathname.rstrip('/'), groups, leaves)", "response": "Walk the group hierarchy for pandas objects and return the group path subgroups and pandas object names for each group."} {"instruction": "How would you explain what the following Python 3 function does\ndef get_node(self, key):\n 
self._check_if_open()\n try:\n if not key.startswith('/'):\n key = '/' + key\n return self._handle.get_node(self.root, key)\n except _table_mod.exceptions.NoSuchNodeError:\n return None", "response": "get the node with the key"} {"instruction": "Can you create a Python 3 function that\nreturns the storer object for a key raise if not in the file", "response": "def get_storer(self, key):\n \"\"\" return the storer object for a key, raise if not in the file \"\"\"\n group = self.get_node(key)\n if group is None:\n raise KeyError('No object named {key} in the file'.format(key=key))\n\n s = self._create_storer(group)\n s.infer_axes()\n return s"} {"instruction": "Given the following Python 3 function, write the documentation\ndef copy(self, file, mode='w', propindexes=True, keys=None, complib=None,\n complevel=None, fletcher32=False, overwrite=True):\n \"\"\" copy the existing store to a new file, upgrading in place\n\n Parameters\n ----------\n propindexes: restore indexes in copied file (defaults to True)\n keys : list of keys to include in the copy (defaults to all)\n overwrite : overwrite (remove and replace) existing nodes in the\n new store (default is True)\n mode, complib, complevel, fletcher32 same as in HDFStore.__init__\n\n Returns\n -------\n open file handle of the new store\n\n \"\"\"\n new_store = HDFStore(\n file,\n mode=mode,\n complib=complib,\n complevel=complevel,\n fletcher32=fletcher32)\n if keys is None:\n keys = list(self.keys())\n if not isinstance(keys, (tuple, list)):\n keys = [keys]\n for k in keys:\n s = self.get_storer(k)\n if s is not None:\n\n if k in new_store:\n if overwrite:\n new_store.remove(k)\n\n data = self.select(k)\n if s.is_table:\n\n index = False\n if propindexes:\n index = [a.name for a in s.axes if a.is_indexed]\n new_store.append(\n k, data, index=index,\n data_columns=getattr(s, 'data_columns', None),\n encoding=s.encoding\n )\n else:\n new_store.put(k, data, encoding=s.encoding)\n\n return new_store", "response": "copy 
the existing store to a new file upgrading in place\n "} {"instruction": "Can you implement a function in Python 3 that\nprints detailed information on the store.", "response": "def info(self):\n \"\"\"\n Print detailed information on the store.\n\n .. versionadded:: 0.21.0\n \"\"\"\n output = '{type}\\nFile path: {path}\\n'.format(\n type=type(self), path=pprint_thing(self._path))\n if self.is_open:\n lkeys = sorted(list(self.keys()))\n if len(lkeys):\n keys = []\n values = []\n\n for k in lkeys:\n try:\n s = self.get_storer(k)\n if s is not None:\n keys.append(pprint_thing(s.pathname or k))\n values.append(\n pprint_thing(s or 'invalid_HDFStore node'))\n except Exception as detail:\n keys.append(k)\n values.append(\n \"[invalid_HDFStore node: {detail}]\".format(\n detail=pprint_thing(detail)))\n\n output += adjoin(12, keys, values)\n else:\n output += 'Empty'\n else:\n output += \"File is CLOSED\"\n\n return output"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _validate_format(self, format, kwargs):\n kwargs = kwargs.copy()\n\n # validate\n try:\n kwargs['format'] = _FORMAT_MAP[format.lower()]\n except KeyError:\n raise TypeError(\"invalid HDFStore format specified [{0}]\"\n .format(format))\n\n return kwargs", "response": "validate / deprecate formats; return the new kwargs"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _create_storer(self, group, format=None, value=None, append=False,\n **kwargs):\n \"\"\" return a suitable class to operate \"\"\"\n\n def error(t):\n raise TypeError(\n \"cannot properly create the storer for: [{t}] [group->\"\n \"{group},value->{value},format->{format},append->{append},\"\n \"kwargs->{kwargs}]\".format(t=t, group=group,\n value=type(value), format=format,\n append=append, kwargs=kwargs))\n\n pt = _ensure_decoded(getattr(group._v_attrs, 'pandas_type', None))\n tt = _ensure_decoded(getattr(group._v_attrs, 'table_type', None))\n\n # infer the pt 
from the passed value\n if pt is None:\n if value is None:\n\n _tables()\n if (getattr(group, 'table', None) or\n isinstance(group, _table_mod.table.Table)):\n pt = 'frame_table'\n tt = 'generic_table'\n else:\n raise TypeError(\n \"cannot create a storer if the object is not existing \"\n \"nor a value are passed\")\n else:\n\n try:\n pt = _TYPE_MAP[type(value)]\n except KeyError:\n error('_TYPE_MAP')\n\n # we are actually a table\n if format == 'table':\n pt += '_table'\n\n # a storer node\n if 'table' not in pt:\n try:\n return globals()[_STORER_MAP[pt]](self, group, **kwargs)\n except KeyError:\n error('_STORER_MAP')\n\n # existing node (and must be a table)\n if tt is None:\n\n # if we are a writer, determine the tt\n if value is not None:\n\n if pt == 'series_table':\n index = getattr(value, 'index', None)\n if index is not None:\n if index.nlevels == 1:\n tt = 'appendable_series'\n elif index.nlevels > 1:\n tt = 'appendable_multiseries'\n elif pt == 'frame_table':\n index = getattr(value, 'index', None)\n if index is not None:\n if index.nlevels == 1:\n tt = 'appendable_frame'\n elif index.nlevels > 1:\n tt = 'appendable_multiframe'\n elif pt == 'wide_table':\n tt = 'appendable_panel'\n elif pt == 'ndim_table':\n tt = 'appendable_ndim'\n\n else:\n\n # distiguish between a frame/table\n tt = 'legacy_panel'\n try:\n fields = group.table._v_attrs.fields\n if len(fields) == 1 and fields[0] == 'value':\n tt = 'legacy_frame'\n except IndexError:\n pass\n\n try:\n return globals()[_TABLE_MAP[tt]](self, group, **kwargs)\n except KeyError:\n error('_TABLE_MAP')", "response": "Create a new instance of the appropriate storer class."} {"instruction": "How would you explain what the following Python 3 function does\ndef set_name(self, name, kind_attr=None):\n self.name = name\n self.kind_attr = kind_attr or \"{name}_kind\".format(name=name)\n if self.cname is None:\n self.cname = name\n\n return self", "response": "set the name of this indexer"} {"instruction": "Here you 
have a function in Python 3, explain what it does\ndef set_pos(self, pos):\n self.pos = pos\n if pos is not None and self.typ is not None:\n self.typ._v_pos = pos\n return self", "response": "set the position of this column in the Table"} {"instruction": "Given the following Python 3 function, write the documentation\ndef is_indexed(self):\n try:\n return getattr(self.table.cols, self.cname).is_indexed\n except AttributeError:\n False", "response": "return whether I am an indexed column"} {"instruction": "Write a Python 3 function for\ninferring this column from the table", "response": "def infer(self, handler):\n \"\"\"infer this column from the table: create and return a new object\"\"\"\n table = handler.table\n new_self = self.copy()\n new_self.set_table(table)\n new_self.get_attr()\n new_self.read_metadata(handler)\n return new_self"} {"instruction": "Implement a Python 3 function for\nconverting the values from this selection to the index instance", "response": "def convert(self, values, nan_rep, encoding, errors):\n \"\"\" set the values from this selection: take = take ownership \"\"\"\n\n # values is a recarray\n if values.dtype.fields is not None:\n values = values[self.cname]\n\n values = _maybe_convert(values, self.kind, encoding, errors)\n\n kwargs = dict()\n if self.freq is not None:\n kwargs['freq'] = _ensure_decoded(self.freq)\n if self.index_name is not None:\n kwargs['name'] = _ensure_decoded(self.index_name)\n # making an Index instance could throw a number of different errors\n try:\n self.values = Index(values, **kwargs)\n except Exception: # noqa: E722\n\n # if the output freq is different that what we recorded,\n # it should be None (see also 'doc example part 2')\n if 'freq' in kwargs:\n kwargs['freq'] = None\n self.values = Index(values, **kwargs)\n\n self.values = _set_tz(self.values, self.tz)\n\n return self"} {"instruction": "Explain what the following Python 3 code does\ndef validate_col(self, itemsize=None):\n\n # validate this column 
for string truncation (or reset to the max size)\n if _ensure_decoded(self.kind) == 'string':\n c = self.col\n if c is not None:\n if itemsize is None:\n itemsize = self.itemsize\n if c.itemsize < itemsize:\n raise ValueError(\n \"Trying to store a string with len [{itemsize}] in \"\n \"[{cname}] column but\\nthis column has a limit of \"\n \"[{c_itemsize}]!\\nConsider using min_itemsize to \"\n \"preset the sizes on these columns\".format(\n itemsize=itemsize, cname=self.cname,\n c_itemsize=c.itemsize))\n return c.itemsize\n\n return None", "response": "validate this column return the compared against itemsize"} {"instruction": "Write a Python 3 script to\nset the info for this indexable with the key - value pair", "response": "def update_info(self, info):\n \"\"\" set/update the info for this indexable with the key/value\n if there is a conflict raise/warn as needed \"\"\"\n\n for key in self._info_fields:\n\n value = getattr(self, key, None)\n idx = _get_info(info, self.name)\n\n existing_value = idx.get(key)\n if key in idx and value is not None and existing_value != value:\n\n # frequency/name just warn\n if key in ['freq', 'index_name']:\n ws = attribute_conflict_doc % (key, existing_value, value)\n warnings.warn(ws, AttributeConflictWarning, stacklevel=6)\n\n # reset\n idx[key] = None\n setattr(self, key, None)\n\n else:\n raise ValueError(\n \"invalid info for [{name}] for [{key}], \"\n \"existing_value [{existing_value}] conflicts with \"\n \"new value [{value}]\".format(\n name=self.name, key=key,\n existing_value=existing_value, value=value))\n else:\n if value is not None or existing_value is not None:\n idx[key] = value\n\n return self"} {"instruction": "Create a Python 3 function for\nsetting my state from the passed info", "response": "def set_info(self, info):\n \"\"\" set my state from the passed info \"\"\"\n idx = info.get(self.name)\n if idx is not None:\n self.__dict__.update(idx)"} {"instruction": "Here you have a function in Python 3, explain 
what it does\ndef validate_metadata(self, handler):\n if self.meta == 'category':\n new_metadata = self.metadata\n cur_metadata = handler.read_metadata(self.cname)\n if (new_metadata is not None and cur_metadata is not None and\n not array_equivalent(new_metadata, cur_metadata)):\n raise ValueError(\"cannot append a categorical with \"\n \"different categories to the existing\")", "response": "validate that the metadata of the current object is not changed the categories"} {"instruction": "How would you implement a function in Python 3 that\nsets the meta data for this object", "response": "def write_metadata(self, handler):\n \"\"\" set the meta data \"\"\"\n if self.metadata is not None:\n handler.write_metadata(self.cname, self.metadata)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef convert(self, values, nan_rep, encoding, errors):\n\n self.values = Int64Index(np.arange(self.table.nrows))\n return self", "response": "convert the values from this selection to Int64Index"} {"instruction": "Make a summary of the following Python 3 code\ndef create_for_block(\n cls, i=None, name=None, cname=None, version=None, **kwargs):\n \"\"\" return a new datacol with the block i \"\"\"\n\n if cname is None:\n cname = name or 'values_block_{idx}'.format(idx=i)\n if name is None:\n name = cname\n\n # prior to 0.10.1, we named values blocks like: values_block_0 an the\n # name values_0\n try:\n if version[0] == 0 and version[1] <= 10 and version[2] == 0:\n m = re.search(r\"values_block_(\\d+)\", name)\n if m:\n name = \"values_{group}\".format(group=m.groups()[0])\n except IndexError:\n pass\n\n return cls(name=name, cname=cname, **kwargs)", "response": "create a new datacol with the block i"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef set_atom(self, block, block_items, existing_col, min_itemsize,\n nan_rep, info, encoding=None, errors='strict'):\n \"\"\" create and setup my atom from the block b 
\"\"\"\n\n self.values = list(block_items)\n\n # short-cut certain block types\n if block.is_categorical:\n return self.set_atom_categorical(block, items=block_items,\n info=info)\n elif block.is_datetimetz:\n return self.set_atom_datetime64tz(block, info=info)\n elif block.is_datetime:\n return self.set_atom_datetime64(block)\n elif block.is_timedelta:\n return self.set_atom_timedelta64(block)\n elif block.is_complex:\n return self.set_atom_complex(block)\n\n dtype = block.dtype.name\n inferred_type = lib.infer_dtype(block.values, skipna=False)\n\n if inferred_type == 'date':\n raise TypeError(\n \"[date] is not implemented as a table column\")\n elif inferred_type == 'datetime':\n # after 8260\n # this only would be hit for a mutli-timezone dtype\n # which is an error\n\n raise TypeError(\n \"too many timezones in this block, create separate \"\n \"data columns\"\n )\n elif inferred_type == 'unicode':\n raise TypeError(\n \"[unicode] is not implemented as a table column\")\n\n # this is basically a catchall; if say a datetime64 has nans then will\n # end up here ###\n elif inferred_type == 'string' or dtype == 'object':\n self.set_atom_string(\n block, block_items,\n existing_col,\n min_itemsize,\n nan_rep,\n encoding,\n errors)\n\n # set as a data block\n else:\n self.set_atom_data(block)", "response": "set the atom from the block b"} {"instruction": "Create a Python 3 function to\nreturn the PyTables column class for this column", "response": "def get_atom_coltype(self, kind=None):\n \"\"\" return the PyTables column class for this column \"\"\"\n if kind is None:\n kind = self.kind\n if self.kind.startswith('uint'):\n col_name = \"UInt{name}Col\".format(name=kind[4:])\n else:\n col_name = \"{name}Col\".format(name=kind.capitalize())\n\n return getattr(_tables(), col_name)"} {"instruction": "Make a summary of the following Python 3 code\ndef validate_attr(self, append):\n if append:\n existing_fields = getattr(self.attrs, self.kind_attr, None)\n if 
(existing_fields is not None and\n existing_fields != list(self.values)):\n raise ValueError(\"appended items do not match existing items\"\n \" in table!\")\n\n existing_dtype = getattr(self.attrs, self.dtype_attr, None)\n if (existing_dtype is not None and\n existing_dtype != self.dtype):\n raise ValueError(\"appended items dtype do not match existing \"\n \"items dtype in table!\")", "response": "validate that we have the same order as the existing & same dtype"} {"instruction": "Create a Python 3 function for\nconverting the data from this selection to the correct dtype", "response": "def convert(self, values, nan_rep, encoding, errors):\n \"\"\"set the data from this selection (and convert to the correct dtype\n if we can)\n \"\"\"\n\n # values is a recarray\n if values.dtype.fields is not None:\n values = values[self.cname]\n\n self.set_data(values)\n\n # use the meta if needed\n meta = _ensure_decoded(self.meta)\n\n # convert to the correct dtype\n if self.dtype is not None:\n dtype = _ensure_decoded(self.dtype)\n\n # reverse converts\n if dtype == 'datetime64':\n\n # recreate with tz if indicated\n self.data = _set_tz(self.data, self.tz, coerce=True)\n\n elif dtype == 'timedelta64':\n self.data = np.asarray(self.data, dtype='m8[ns]')\n elif dtype == 'date':\n try:\n self.data = np.asarray(\n [date.fromordinal(v) for v in self.data], dtype=object)\n except ValueError:\n self.data = np.asarray(\n [date.fromtimestamp(v) for v in self.data],\n dtype=object)\n elif dtype == 'datetime':\n self.data = np.asarray(\n [datetime.fromtimestamp(v) for v in self.data],\n dtype=object)\n\n elif meta == 'category':\n\n # we have a categorical\n categories = self.metadata\n codes = self.data.ravel()\n\n # if we have stored a NaN in the categories\n # then strip it; in theory we could have BOTH\n # -1s in the codes and nulls :<\n if categories is None:\n # Handle case of NaN-only categorical columns in which case\n # the categories are an empty array; when this is stored,\n 
# pytables cannot write a zero-len array, so on readback\n # the categories would be None and `read_hdf()` would fail.\n categories = Index([], dtype=np.float64)\n else:\n mask = isna(categories)\n if mask.any():\n categories = categories[~mask]\n codes[codes != -1] -= mask.astype(int).cumsum().values\n\n self.data = Categorical.from_codes(codes,\n categories=categories,\n ordered=self.ordered)\n\n else:\n\n try:\n self.data = self.data.astype(dtype, copy=False)\n except TypeError:\n self.data = self.data.astype('O', copy=False)\n\n # convert nans / decode\n if _ensure_decoded(self.kind) == 'string':\n self.data = _unconvert_string_array(\n self.data, nan_rep=nan_rep, encoding=encoding, errors=errors)\n\n return self"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef get_attr(self):\n self.values = getattr(self.attrs, self.kind_attr, None)\n self.dtype = getattr(self.attrs, self.dtype_attr, None)\n self.meta = getattr(self.attrs, self.meta_attr, None)\n self.set_kind()", "response": "get the data for this column"} {"instruction": "Write a Python 3 script for\nsetting the data for this column", "response": "def set_attr(self):\n \"\"\" set the data for this column \"\"\"\n setattr(self.attrs, self.kind_attr, self.values)\n setattr(self.attrs, self.meta_attr, self.meta)\n if self.dtype is not None:\n setattr(self.attrs, self.dtype_attr, self.dtype)"} {"instruction": "How would you implement a function in Python 3 that\ncomputes and set our version", "response": "def set_version(self):\n \"\"\" compute and set our version \"\"\"\n version = _ensure_decoded(\n getattr(self.group._v_attrs, 'pandas_version', None))\n try:\n self.version = tuple(int(x) for x in version.split('.'))\n if len(self.version) == 2:\n self.version = self.version + (0,)\n except AttributeError:\n self.version = (0, 0, 0)"} {"instruction": "Create a Python 3 function for\nsetting my pandas type & version", "response": "def set_object_info(self):\n \"\"\" 
set my pandas type & version \"\"\"\n self.attrs.pandas_type = str(self.pandas_kind)\n self.attrs.pandas_version = str(_version)\n self.set_version()"} {"instruction": "Write a Python 3 script for\ninferring the axes of my storer return a boolean indicating if we have a valid storer", "response": "def infer_axes(self):\n \"\"\" infer the axes of my storer\n return a boolean indicating if we have a valid storer or not \"\"\"\n\n s = self.storable\n if s is None:\n return False\n self.get_attrs()\n return True"} {"instruction": "Implement a function in Python 3 to\ndelete a node from the store", "response": "def delete(self, where=None, start=None, stop=None, **kwargs):\n \"\"\"\n support fully deleting the node in its entirety (only) - where\n specification must be None\n \"\"\"\n if com._all_none(where, start, stop):\n self._handle.remove_node(self.group, recursive=True)\n return None\n\n raise TypeError(\"cannot delete on an abstract storer\")"} {"instruction": "Can you implement a function in Python 3 that\nvalidates that the table keywords are passed in kwargs and return the kwargs", "response": "def validate_read(self, kwargs):\n \"\"\"\n remove table keywords from kwargs and return\n raise if any keywords are passed which are not-None\n \"\"\"\n kwargs = copy.copy(kwargs)\n\n columns = kwargs.pop('columns', None)\n if columns is not None:\n raise TypeError(\"cannot pass a column specification when reading \"\n \"a Fixed format store. this store must be \"\n \"selected in its entirety\")\n where = kwargs.pop('where', None)\n if where is not None:\n raise TypeError(\"cannot pass a where specification when reading \"\n \"from a Fixed format store. 
this store must be \"\n \"selected in its entirety\")\n return kwargs"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef set_attrs(self):\n self.attrs.encoding = self.encoding\n self.attrs.errors = self.errors", "response": "set our object attributes"} {"instruction": "Write a Python 3 function for\nreading an array for the specified node", "response": "def read_array(self, key, start=None, stop=None):\n \"\"\" read an array for the specified node (off of group \"\"\"\n import tables\n node = getattr(self.group, key)\n attrs = node._v_attrs\n\n transposed = getattr(attrs, 'transposed', False)\n\n if isinstance(node, tables.VLArray):\n ret = node[0][start:stop]\n else:\n dtype = getattr(attrs, 'value_type', None)\n shape = getattr(attrs, 'shape', None)\n\n if shape is not None:\n # length 0 axis\n ret = np.empty(shape, dtype=dtype)\n else:\n ret = node[start:stop]\n\n if dtype == 'datetime64':\n\n # reconstruct a timezone if indicated\n ret = _set_tz(ret, getattr(attrs, 'tz', None), coerce=True)\n\n elif dtype == 'timedelta64':\n ret = np.asarray(ret, dtype='m8[ns]')\n\n if transposed:\n return ret.T\n else:\n return ret"} {"instruction": "Implement a function in Python 3 to\nwrite an empty array", "response": "def write_array_empty(self, key, value):\n \"\"\" write a 0-len array \"\"\"\n\n # ugly hack for length 0 axes\n arr = np.empty((1,) * value.ndim)\n self._handle.create_array(self.group, key, arr)\n getattr(self.group, key)._v_attrs.value_type = str(value.dtype)\n getattr(self.group, key)._v_attrs.shape = value.shape"} {"instruction": "Make a summary of the following Python 3 code\ndef validate_read(self, kwargs):\n kwargs = super().validate_read(kwargs)\n if 'start' in kwargs or 'stop' in kwargs:\n raise NotImplementedError(\"start and/or stop are not supported \"\n \"in fixed Sparse reading\")\n return kwargs", "response": "Validate that start and stop keywords are not None"} {"instruction": "Given the following 
Python 3 function, write the documentation\ndef write(self, obj, **kwargs):\n super().write(obj, **kwargs)\n for name, ss in obj.items():\n key = 'sparse_series_{name}'.format(name=name)\n if key not in self.group._v_children:\n node = self._handle.create_group(self.group, key)\n else:\n node = getattr(self.group, key)\n s = SparseSeriesFixed(self.parent, node)\n s.write(ss)\n self.attrs.default_fill_value = obj.default_fill_value\n self.attrs.default_kind = obj.default_kind\n self.write_index('columns', obj.columns)", "response": "write it as a collection of individual sparse series"} {"instruction": "Create a Python 3 function to\nvalidate against an existing table", "response": "def validate(self, other):\n \"\"\" validate against an existing table \"\"\"\n if other is None:\n return\n\n if other.table_type != self.table_type:\n raise TypeError(\n \"incompatible table_type with existing \"\n \"[{other} - {self}]\".format(\n other=other.table_type, self=self.table_type))\n\n for c in ['index_axes', 'non_index_axes', 'values_axes']:\n sv = getattr(self, c, None)\n ov = getattr(other, c, None)\n if sv != ov:\n\n # show the error for the specific axes\n for i, sax in enumerate(sv):\n oax = ov[i]\n if sax != oax:\n raise ValueError(\n \"invalid combinate of [{c}] on appending data \"\n \"[{sax}] vs current table [{oax}]\".format(\n c=c, sax=sax, oax=oax))\n\n # should never get here\n raise Exception(\n \"invalid combinate of [{c}] on appending data [{sv}] vs \"\n \"current table [{ov}]\".format(c=c, sv=sv, ov=ov))"} {"instruction": "How would you explain what the following Python 3 function does\ndef validate_metadata(self, existing):\n self.metadata = [\n c.name for c in self.values_axes if c.metadata is not None]", "response": "create and validate metadata"} {"instruction": "Given the following Python 3 function, write the documentation\ndef validate_multiindex(self, obj):\n levels = [l if l is not None else \"level_{0}\".format(i)\n for i, l in 
enumerate(obj.index.names)]\n try:\n return obj.reset_index(), levels\n except ValueError:\n raise ValueError(\"duplicate names/columns in the multi-index when \"\n \"storing as a table\")", "response": "validate that we can store the multi - index ; reset and return the the\n new object\n "} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef nrows_expected(self):\n return np.prod([i.cvalues.shape[0] for i in self.index_axes])", "response": "compute the expected nrows"} {"instruction": "Create a Python 3 function to\nreturn a tuple of my permutated axes non_indexable at the front", "response": "def data_orientation(self):\n \"\"\"return a tuple of my permutated axes, non_indexable at the front\"\"\"\n return tuple(itertools.chain([int(a[0]) for a in self.non_index_axes],\n [int(a.axis) for a in self.index_axes]))"} {"instruction": "Implement a Python 3 function for\nreturning a dict of the kinds allowable columns for this object", "response": "def queryables(self):\n \"\"\" return a dict of the kinds allowable columns for this object \"\"\"\n\n # compute the values_axes queryables\n return dict(\n [(a.cname, a) for a in self.index_axes] +\n [(self.storage_obj_type._AXIS_NAMES[axis], None)\n for axis, values in self.non_index_axes] +\n [(v.cname, v) for v in self.values_axes\n if v.name in set(self.data_columns)]\n )"} {"instruction": "Make a summary of the following Python 3 code\ndef _get_metadata_path(self, key):\n return \"{group}/meta/{key}/meta\".format(group=self.group._v_pathname,\n key=key)", "response": "return the metadata pathname for this key"} {"instruction": "Write a Python 3 function that can\nwrite out a meta data array to the key as a fixed - format Series", "response": "def write_metadata(self, key, values):\n \"\"\"\n write out a meta data array to the key as a fixed-format Series\n\n Parameters\n ----------\n key : string\n values : ndarray\n\n \"\"\"\n values = Series(values)\n 
self.parent.put(self._get_metadata_path(key), values, format='table',\n encoding=self.encoding, errors=self.errors,\n nan_rep=self.nan_rep)"} {"instruction": "Write a Python 3 script for\nreturning the meta data array for this key", "response": "def read_metadata(self, key):\n \"\"\" return the meta data array for this key \"\"\"\n if getattr(getattr(self.group, 'meta', None), key, None) is not None:\n return self.parent.select(self._get_metadata_path(key))\n return None"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef set_attrs(self):\n self.attrs.table_type = str(self.table_type)\n self.attrs.index_cols = self.index_cols()\n self.attrs.values_cols = self.values_cols()\n self.attrs.non_index_axes = self.non_index_axes\n self.attrs.data_columns = self.data_columns\n self.attrs.nan_rep = self.nan_rep\n self.attrs.encoding = self.encoding\n self.attrs.errors = self.errors\n self.attrs.levels = self.levels\n self.attrs.metadata = self.metadata\n self.set_info()", "response": "set our table type & indexables"} {"instruction": "How would you explain what the following Python 3 function does\ndef validate_version(self, where=None):\n if where is not None:\n if (self.version[0] <= 0 and self.version[1] <= 10 and\n self.version[2] < 1):\n ws = incompatibility_doc % '.'.join(\n [str(x) for x in self.version])\n warnings.warn(ws, IncompatibilityWarning)", "response": "Validate that the version of the resource is valid."} {"instruction": "Write a Python 3 script to\nvalidate the min_itemisze has the items that are not in the data_columns", "response": "def validate_min_itemsize(self, min_itemsize):\n \"\"\"validate the min_itemisze doesn't contain items that are not in the\n axes this needs data_columns to be defined\n \"\"\"\n if min_itemsize is None:\n return\n if not isinstance(min_itemsize, dict):\n return\n\n q = self.queryables()\n for k, v in min_itemsize.items():\n\n # ok, apply generally\n if k == 'values':\n continue\n if k not in 
q:\n raise ValueError(\n \"min_itemsize has the key [{key}] which is not an axis or \"\n \"data_column\".format(key=k))"} {"instruction": "Explain what the following Python 3 code does\ndef indexables(self):\n if self._indexables is None:\n\n self._indexables = []\n\n # index columns\n self._indexables.extend([\n IndexCol(name=name, axis=axis, pos=i)\n for i, (axis, name) in enumerate(self.attrs.index_cols)\n ])\n\n # values columns\n dc = set(self.data_columns)\n base_pos = len(self._indexables)\n\n def f(i, c):\n klass = DataCol\n if c in dc:\n klass = DataIndexableCol\n return klass.create_for_block(i=i, name=c, pos=base_pos + i,\n version=self.version)\n\n self._indexables.extend(\n [f(i, c) for i, c in enumerate(self.attrs.values_cols)])\n\n return self._indexables", "response": "create the indexables if they don t exist"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef create_index(self, columns=None, optlevel=None, kind=None):\n\n if not self.infer_axes():\n return\n if columns is False:\n return\n\n # index all indexables and data_columns\n if columns is None or columns is True:\n columns = [a.cname for a in self.axes if a.is_data_indexable]\n if not isinstance(columns, (tuple, list)):\n columns = [columns]\n\n kw = dict()\n if optlevel is not None:\n kw['optlevel'] = optlevel\n if kind is not None:\n kw['kind'] = kind\n\n table = self.table\n for c in columns:\n v = getattr(table.cols, c, None)\n if v is not None:\n\n # remove the index if the kind/optlevel have changed\n if v.is_indexed:\n index = v.index\n cur_optlevel = index.optlevel\n cur_kind = index.kind\n\n if kind is not None and cur_kind != kind:\n v.remove_index()\n else:\n kw['kind'] = cur_kind\n\n if optlevel is not None and cur_optlevel != optlevel:\n v.remove_index()\n else:\n kw['optlevel'] = cur_optlevel\n\n # create the index\n if not v.is_indexed:\n if v.type.startswith('complex'):\n raise TypeError(\n 'Columns containing complex values can be 
stored '\n 'but cannot'\n ' be indexed when using table format. Either use '\n 'fixed format, set index=False, or do not include '\n 'the columns containing complex values to '\n 'data_columns when initializing the table.')\n v.create_index(**kw)", "response": "Create a pytables index on the specified columns."} {"instruction": "Create a Python 3 function to\ncreate and return the axes sniffed from the table return True if success otherwise False", "response": "def read_axes(self, where, **kwargs):\n \"\"\"create and return the axes sniffed from the table: return boolean\n for success\n \"\"\"\n\n # validate the version\n self.validate_version(where)\n\n # infer the data kind\n if not self.infer_axes():\n return False\n\n # create the selection\n self.selection = Selection(self, where=where, **kwargs)\n values = self.selection.select()\n\n # convert the data\n for a in self.axes:\n a.set_info(self.info)\n a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding,\n errors=self.errors)\n\n return True"} {"instruction": "How would you explain what the following Python 3 function does\ndef validate_data_columns(self, data_columns, min_itemsize):\n\n if not len(self.non_index_axes):\n return []\n\n axis, axis_labels = self.non_index_axes[0]\n info = self.info.get(axis, dict())\n if info.get('type') == 'MultiIndex' and data_columns:\n raise ValueError(\"cannot use a multi-index on axis [{0}] with \"\n \"data_columns {1}\".format(axis, data_columns))\n\n # evaluate the passed data_columns, True == use all columns\n # take only valide axis labels\n if data_columns is True:\n data_columns = list(axis_labels)\n elif data_columns is None:\n data_columns = []\n\n # if min_itemsize is a dict, add the keys (exclude 'values')\n if isinstance(min_itemsize, dict):\n\n existing_data_columns = set(data_columns)\n data_columns.extend([\n k for k in min_itemsize.keys()\n if k != 'values' and k not in existing_data_columns\n ])\n\n # return valid columns in the order of our axis\n 
return [c for c in data_columns if c in axis_labels]", "response": "take the input data_columns and min_itemize and create a data\n columns spec\n "} {"instruction": "Write a Python 3 function that can\ncreate and return the axes for the new leagcy tables", "response": "def create_axes(self, axes, obj, validate=True, nan_rep=None,\n data_columns=None, min_itemsize=None, **kwargs):\n \"\"\" create and return the axes\n leagcy tables create an indexable column, indexable index,\n non-indexable fields\n\n Parameters:\n -----------\n axes: a list of the axes in order to create (names or numbers of\n the axes)\n obj : the object to create axes on\n validate: validate the obj against an existing object already\n written\n min_itemsize: a dict of the min size for a column in bytes\n nan_rep : a values to use for string column nan_rep\n encoding : the encoding for string values\n data_columns : a list of columns that we want to create separate to\n allow indexing (or True will force all columns)\n\n \"\"\"\n\n # set the default axes if needed\n if axes is None:\n try:\n axes = _AXES_MAP[type(obj)]\n except KeyError:\n raise TypeError(\n \"cannot properly create the storer for: [group->{group},\"\n \"value->{value}]\".format(\n group=self.group._v_name, value=type(obj)))\n\n # map axes to numbers\n axes = [obj._get_axis_number(a) for a in axes]\n\n # do we have an existing table (if so, use its axes & data_columns)\n if self.infer_axes():\n existing_table = self.copy()\n existing_table.infer_axes()\n axes = [a.axis for a in existing_table.index_axes]\n data_columns = existing_table.data_columns\n nan_rep = existing_table.nan_rep\n self.encoding = existing_table.encoding\n self.errors = existing_table.errors\n self.info = copy.copy(existing_table.info)\n else:\n existing_table = None\n\n # currently support on ndim-1 axes\n if len(axes) != self.ndim - 1:\n raise ValueError(\n \"currently only support ndim-1 indexers in an AppendableTable\")\n\n # create according to the new 
data\n self.non_index_axes = []\n self.data_columns = []\n\n # nan_representation\n if nan_rep is None:\n nan_rep = 'nan'\n\n self.nan_rep = nan_rep\n\n # create axes to index and non_index\n index_axes_map = dict()\n for i, a in enumerate(obj.axes):\n\n if i in axes:\n name = obj._AXIS_NAMES[i]\n index_axes_map[i] = _convert_index(\n a, self.encoding, self.errors, self.format_type\n ).set_name(name).set_axis(i)\n else:\n\n # we might be able to change the axes on the appending data if\n # necessary\n append_axis = list(a)\n if existing_table is not None:\n indexer = len(self.non_index_axes)\n exist_axis = existing_table.non_index_axes[indexer][1]\n if not array_equivalent(np.array(append_axis),\n np.array(exist_axis)):\n\n # ahah! -> reindex\n if array_equivalent(np.array(sorted(append_axis)),\n np.array(sorted(exist_axis))):\n append_axis = exist_axis\n\n # the non_index_axes info\n info = _get_info(self.info, i)\n info['names'] = list(a.names)\n info['type'] = a.__class__.__name__\n\n self.non_index_axes.append((i, append_axis))\n\n # set axis positions (based on the axes)\n self.index_axes = [\n index_axes_map[a].set_pos(j).update_info(self.info)\n for j, a in enumerate(axes)\n ]\n j = len(self.index_axes)\n\n # check for column conflicts\n for a in self.axes:\n a.maybe_set_size(min_itemsize=min_itemsize)\n\n # reindex by our non_index_axes & compute data_columns\n for a in self.non_index_axes:\n obj = _reindex_axis(obj, a[0], a[1])\n\n def get_blk_items(mgr, blocks):\n return [mgr.items.take(blk.mgr_locs) for blk in blocks]\n\n # figure out data_columns and get out blocks\n block_obj = self.get_object(obj)._consolidate()\n blocks = block_obj._data.blocks\n blk_items = get_blk_items(block_obj._data, blocks)\n if len(self.non_index_axes):\n axis, axis_labels = self.non_index_axes[0]\n data_columns = self.validate_data_columns(\n data_columns, min_itemsize)\n if len(data_columns):\n mgr = block_obj.reindex(\n Index(axis_labels).difference(Index(data_columns)),\n 
axis=axis\n )._data\n\n blocks = list(mgr.blocks)\n blk_items = get_blk_items(mgr, blocks)\n for c in data_columns:\n mgr = block_obj.reindex([c], axis=axis)._data\n blocks.extend(mgr.blocks)\n blk_items.extend(get_blk_items(mgr, mgr.blocks))\n\n # reorder the blocks in the same order as the existing_table if we can\n if existing_table is not None:\n by_items = {tuple(b_items.tolist()): (b, b_items)\n for b, b_items in zip(blocks, blk_items)}\n new_blocks = []\n new_blk_items = []\n for ea in existing_table.values_axes:\n items = tuple(ea.values)\n try:\n b, b_items = by_items.pop(items)\n new_blocks.append(b)\n new_blk_items.append(b_items)\n except (IndexError, KeyError):\n raise ValueError(\n \"cannot match existing table structure for [{items}] \"\n \"on appending data\".format(\n items=(','.join(pprint_thing(item) for\n item in items))))\n blocks = new_blocks\n blk_items = new_blk_items\n\n # add my values\n self.values_axes = []\n for i, (b, b_items) in enumerate(zip(blocks, blk_items)):\n\n # shape of the data column are the indexable axes\n klass = DataCol\n name = None\n\n # we have a data_column\n if (data_columns and len(b_items) == 1 and\n b_items[0] in data_columns):\n klass = DataIndexableCol\n name = b_items[0]\n self.data_columns.append(name)\n\n # make sure that we match up the existing columns\n # if we have an existing table\n if existing_table is not None and validate:\n try:\n existing_col = existing_table.values_axes[i]\n except (IndexError, KeyError):\n raise ValueError(\n \"Incompatible appended table [{blocks}]\"\n \"with existing table [{table}]\".format(\n blocks=blocks,\n table=existing_table.values_axes))\n else:\n existing_col = None\n\n try:\n col = klass.create_for_block(\n i=i, name=name, version=self.version)\n col.set_atom(block=b, block_items=b_items,\n existing_col=existing_col,\n min_itemsize=min_itemsize,\n nan_rep=nan_rep,\n encoding=self.encoding,\n errors=self.errors,\n info=self.info)\n col.set_pos(j)\n\n 
self.values_axes.append(col)\n except (NotImplementedError, ValueError, TypeError) as e:\n raise e\n except Exception as detail:\n raise Exception(\n \"cannot find the correct atom type -> \"\n \"[dtype->{name},items->{items}] {detail!s}\".format(\n name=b.dtype.name, items=b_items, detail=detail))\n j += 1\n\n # validate our min_itemsize\n self.validate_min_itemsize(min_itemsize)\n\n # validate our metadata\n self.validate_metadata(existing_table)\n\n # validate the axes if we have an existing table\n if validate:\n self.validate(existing_table)"} {"instruction": "Can you implement a function in Python 3 that\nprocesses axes and return the related record ID.", "response": "def process_axes(self, obj, columns=None):\n \"\"\" process axes filters \"\"\"\n\n # make a copy to avoid side effects\n if columns is not None:\n columns = list(columns)\n\n # make sure to include levels if we have them\n if columns is not None and self.is_multi_index:\n for n in self.levels:\n if n not in columns:\n columns.insert(0, n)\n\n # reorder by any non_index_axes & limit to the select columns\n for axis, labels in self.non_index_axes:\n obj = _reindex_axis(obj, axis, labels, columns)\n\n # apply the selection filters (but keep in the same order)\n if self.selection.filter is not None:\n for field, op, filt in self.selection.filter.format():\n\n def process_filter(field, filt):\n\n for axis_name in obj._AXIS_NAMES.values():\n axis_number = obj._get_axis_number(axis_name)\n axis_values = obj._get_axis(axis_name)\n\n # see if the field is the name of an axis\n if field == axis_name:\n\n # if we have a multi-index, then need to include\n # the levels\n if self.is_multi_index:\n filt = filt.union(Index(self.levels))\n\n takers = op(axis_values, filt)\n return obj.loc._getitem_axis(takers,\n axis=axis_number)\n\n # this might be the name of a file IN an axis\n elif field in axis_values:\n\n # we need to filter on this dimension\n values = ensure_index(getattr(obj, field).values)\n filt = 
ensure_index(filt)\n\n # hack until we support reversed dim flags\n if isinstance(obj, DataFrame):\n axis_number = 1 - axis_number\n takers = op(values, filt)\n return obj.loc._getitem_axis(takers,\n axis=axis_number)\n\n raise ValueError(\"cannot find the field [{field}] for \"\n \"filtering!\".format(field=field))\n\n obj = process_filter(field, filt)\n\n return obj"} {"instruction": "How would you explain what the following Python 3 function does\ndef create_description(self, complib=None, complevel=None,\n fletcher32=False, expectedrows=None):\n \"\"\" create the description of the table from the axes & values \"\"\"\n\n # provided expected rows if its passed\n if expectedrows is None:\n expectedrows = max(self.nrows_expected, 10000)\n\n d = dict(name='table', expectedrows=expectedrows)\n\n # description from the axes & values\n d['description'] = {a.cname: a.typ for a in self.axes}\n\n if complib:\n if complevel is None:\n complevel = self._complevel or 9\n filters = _tables().Filters(\n complevel=complevel, complib=complib,\n fletcher32=fletcher32 or self._fletcher32)\n d['filters'] = filters\n elif self._filters is not None:\n d['filters'] = self._filters\n\n return d", "response": "create the description of the table"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef read_coordinates(self, where=None, start=None, stop=None, **kwargs):\n\n # validate the version\n self.validate_version(where)\n\n # infer the data kind\n if not self.infer_axes():\n return False\n\n # create the selection\n self.selection = Selection(\n self, where=where, start=start, stop=stop, **kwargs)\n coords = self.selection.select_coords()\n if self.selection.filter is not None:\n for field, op, filt in self.selection.filter.format():\n data = self.read_column(\n field, start=coords.min(), stop=coords.max() + 1)\n coords = coords[\n op(data.iloc[coords - coords.min()], filt).values]\n\n return Index(coords)", "response": "select coordinates from a table ; 
return the the\n coordinates object"} {"instruction": "How would you explain what the following Python 3 function does\ndef read_column(self, column, where=None, start=None, stop=None):\n\n # validate the version\n self.validate_version()\n\n # infer the data kind\n if not self.infer_axes():\n return False\n\n if where is not None:\n raise TypeError(\"read_column does not currently accept a where \"\n \"clause\")\n\n # find the axes\n for a in self.axes:\n if column == a.name:\n\n if not a.is_data_indexable:\n raise ValueError(\n \"column [{column}] can not be extracted individually; \"\n \"it is not data indexable\".format(column=column))\n\n # column must be an indexable or a data column\n c = getattr(self.table.cols, column)\n a.set_info(self.info)\n return Series(_set_tz(a.convert(c[start:stop],\n nan_rep=self.nan_rep,\n encoding=self.encoding,\n errors=self.errors\n ).take_data(),\n a.tz, True), name=column)\n\n raise KeyError(\n \"column [{column}] not found in the table\".format(column=column))", "response": "return a single column from the table generally only indexables\n are interesting"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef read(self, where=None, columns=None, **kwargs):\n\n if not self.read_axes(where=where, **kwargs):\n return None\n\n raise NotImplementedError(\"Panel is removed in pandas 0.25.0\")", "response": "read the n indexable columns with an arbitrary number of data"} {"instruction": "How would you explain what the following Python 3 function does\ndef write_data(self, chunksize, dropna=False):\n\n names = self.dtype.names\n nrows = self.nrows_expected\n\n # if dropna==True, then drop ALL nan rows\n masks = []\n if dropna:\n\n for a in self.values_axes:\n\n # figure the mask: only do if we can successfully process this\n # column, otherwise ignore the mask\n mask = isna(a.data).all(axis=0)\n if isinstance(mask, np.ndarray):\n masks.append(mask.astype('u1', copy=False))\n\n # consolidate 
masks\n if len(masks):\n mask = masks[0]\n for m in masks[1:]:\n mask = mask & m\n mask = mask.ravel()\n else:\n mask = None\n\n # broadcast the indexes if needed\n indexes = [a.cvalues for a in self.index_axes]\n nindexes = len(indexes)\n bindexes = []\n for i, idx in enumerate(indexes):\n\n # broadcast to all other indexes except myself\n if i > 0 and i < nindexes:\n repeater = np.prod(\n [indexes[bi].shape[0] for bi in range(0, i)])\n idx = np.tile(idx, repeater)\n\n if i < nindexes - 1:\n repeater = np.prod([indexes[bi].shape[0]\n for bi in range(i + 1, nindexes)])\n idx = np.repeat(idx, repeater)\n\n bindexes.append(idx)\n\n # transpose the values so first dimension is last\n # reshape the values if needed\n values = [a.take_data() for a in self.values_axes]\n values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1))\n for v in values]\n bvalues = []\n for i, v in enumerate(values):\n new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape\n bvalues.append(values[i].reshape(new_shape))\n\n # write the chunks\n if chunksize is None:\n chunksize = 100000\n\n rows = np.empty(min(chunksize, nrows), dtype=self.dtype)\n chunks = int(nrows / chunksize) + 1\n for i in range(chunks):\n start_i = i * chunksize\n end_i = min((i + 1) * chunksize, nrows)\n if start_i >= end_i:\n break\n\n self.write_data_chunk(\n rows,\n indexes=[a[start_i:end_i] for a in bindexes],\n mask=mask[start_i:end_i] if mask is not None else None,\n values=[v[start_i:end_i] for v in bvalues])", "response": "we form the data into a 2 - d including indexes values mask and write chunk - by - chunk"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef write_data_chunk(self, rows, indexes, mask, values):\n\n # 0 len\n for v in values:\n if not np.prod(v.shape):\n return\n\n try:\n nrows = indexes[0].shape[0]\n if nrows != len(rows):\n rows = np.empty(nrows, dtype=self.dtype)\n names = self.dtype.names\n nindexes = len(indexes)\n\n # indexes\n for i, idx 
in enumerate(indexes):\n rows[names[i]] = idx\n\n # values\n for i, v in enumerate(values):\n rows[names[i + nindexes]] = v\n\n # mask\n if mask is not None:\n m = ~mask.ravel().astype(bool, copy=False)\n if not m.all():\n rows = rows[m]\n\n except Exception as detail:\n raise Exception(\n \"cannot create row-data -> {detail}\".format(detail=detail))\n\n try:\n if len(rows):\n self.table.append(rows)\n self.table.flush()\n except Exception as detail:\n raise TypeError(\n \"tables cannot write this data -> {detail}\".format(\n detail=detail))", "response": "Writes the data in a single chunk of the data table."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef write(self, obj, data_columns=None, **kwargs):\n if not isinstance(obj, DataFrame):\n name = obj.name or 'values'\n obj = DataFrame({name: obj}, index=obj.index)\n obj.columns = [name]\n return super().write(obj=obj, data_columns=obj.columns.tolist(),\n **kwargs)", "response": "write a single entry to the table"} {"instruction": "Write a Python 3 script to\nwrite the object as a frame table", "response": "def write(self, obj, **kwargs):\n \"\"\" we are going to write this as a frame table \"\"\"\n name = obj.name or 'values'\n obj, self.levels = self.validate_multiindex(obj)\n cols = list(self.levels)\n cols.append(name)\n obj.columns = cols\n return super().write(obj=obj, **kwargs)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef indexables(self):\n if self._indexables is None:\n\n d = self.description\n\n # the index columns is just a simple index\n self._indexables = [GenericIndexCol(name='index', axis=0)]\n\n for i, n in enumerate(d._v_names):\n\n dc = GenericDataIndexableCol(\n name=n, pos=i, values=[n], version=self.version)\n self._indexables.append(dc)\n\n return self._indexables", "response": "create the indexables from the table description"} {"instruction": "Given the following Python 3 function, write the 
documentation\ndef generate(self, where):\n if where is None:\n return None\n\n q = self.table.queryables()\n try:\n return Expr(where, queryables=q, encoding=self.table.encoding)\n except NameError:\n # raise a nice message, suggesting that the user should use\n # data_columns\n raise ValueError(\n \"The passed where expression: {0}\\n\"\n \" contains an invalid variable reference\\n\"\n \" all of the variable references must be a \"\n \"reference to\\n\"\n \" an axis (e.g. 'index' or 'columns'), or a \"\n \"data_column\\n\"\n \" The currently defined references are: {1}\\n\"\n .format(where, ','.join(q.keys()))\n )", "response": "Generate a new instance of the class that is used to access the related object."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef select_coords(self):\n start, stop = self.start, self.stop\n nrows = self.table.nrows\n if start is None:\n start = 0\n elif start < 0:\n start += nrows\n if self.stop is None:\n stop = nrows\n elif stop < 0:\n stop += nrows\n\n if self.condition is not None:\n return self.table.table.get_where_list(self.condition.format(),\n start=start, stop=stop,\n sort=True)\n elif self.coordinates is not None:\n return self.coordinates\n\n return np.arange(start, stop)", "response": "generate the selection of the coordinates of the related object"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef astype(self, dtype, copy=True):\n return np.array(self, dtype=dtype, copy=copy)", "response": "Cast to a NumPy array with the specified dtype."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef argsort(self, ascending=True, kind='quicksort', *args, **kwargs):\n # Implementor note: You have two places to override the behavior of\n # argsort.\n # 1. _values_for_argsort : construct the values passed to np.argsort\n # 2. 
argsort : total control over sorting.\n ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs)\n values = self._values_for_argsort()\n result = np.argsort(values, kind=kind, **kwargs)\n if not ascending:\n result = result[::-1]\n return result", "response": "Return the indices that would sort this array."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef fillna(self, value=None, method=None, limit=None):\n from pandas.api.types import is_array_like\n from pandas.util._validators import validate_fillna_kwargs\n from pandas.core.missing import pad_1d, backfill_1d\n\n value, method = validate_fillna_kwargs(value, method)\n\n mask = self.isna()\n\n if is_array_like(value):\n if len(value) != len(self):\n raise ValueError(\"Length of 'value' does not match. Got ({}) \"\n \" expected {}\".format(len(value), len(self)))\n value = value[mask]\n\n if mask.any():\n if method is not None:\n func = pad_1d if method == 'pad' else backfill_1d\n new_values = func(self.astype(object), limit=limit,\n mask=mask)\n new_values = self._from_sequence(new_values, dtype=self.dtype)\n else:\n # fill with value\n new_values = self.copy()\n new_values[mask] = value\n else:\n new_values = self.copy()\n return new_values", "response": "Fill missing values with the specified value."} {"instruction": "Can you implement a function in Python 3 that\nshifts the array by desired number of periods.", "response": "def shift(\n self,\n periods: int = 1,\n fill_value: object = None,\n ) -> ABCExtensionArray:\n \"\"\"\n Shift values by desired number.\n\n Newly introduced missing values are filled with\n ``self.dtype.na_value``.\n\n .. versionadded:: 0.24.0\n\n Parameters\n ----------\n periods : int, default 1\n The number of periods to shift. Negative values are allowed\n for shifting backwards.\n\n fill_value : object, optional\n The scalar value to use for newly introduced missing values.\n The default is ``self.dtype.na_value``\n\n .. 
versionadded:: 0.24.0\n\n Returns\n -------\n shifted : ExtensionArray\n\n Notes\n -----\n If ``self`` is empty or ``periods`` is 0, a copy of ``self`` is\n returned.\n\n If ``periods > len(self)``, then an array of size\n len(self) is returned, with all values filled with\n ``self.dtype.na_value``.\n \"\"\"\n # Note: this implementation assumes that `self.dtype.na_value` can be\n # stored in an instance of your ExtensionArray with `self.dtype`.\n if not len(self) or periods == 0:\n return self.copy()\n\n if isna(fill_value):\n fill_value = self.dtype.na_value\n\n empty = self._from_sequence(\n [fill_value] * min(abs(periods), len(self)),\n dtype=self.dtype\n )\n if periods > 0:\n a = empty\n b = self[:-periods]\n else:\n a = self[abs(periods):]\n b = empty\n return self._concat_same_type([a, b])"} {"instruction": "How would you code a function in Python 3 to\ncompute the ExtensionArray of unique values.", "response": "def unique(self):\n \"\"\"\n Compute the ExtensionArray of unique values.\n\n Returns\n -------\n uniques : ExtensionArray\n \"\"\"\n from pandas import unique\n\n uniques = unique(self.astype(object))\n return self._from_sequence(uniques, dtype=self.dtype)"} {"instruction": "Explain what the following Python 3 code does\ndef searchsorted(self, value, side=\"left\", sorter=None):\n # Note: the base tests provided by pandas only test the basics.\n # We do not test\n # 1. Values outside the range of the `data_for_sorting` fixture\n # 2. Values between the values in the `data_for_sorting` fixture\n # 3. 
Missing values.\n arr = self.astype(object)\n return arr.searchsorted(value, side=side, sorter=sorter)", "response": "Return a sorted array of integers where each element in value is inserted into the corresponding array."} {"instruction": "Can you write a function in Python 3 where it\nreturns an array and missing value suitable for factorization.", "response": "def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:\n \"\"\"\n Return an array and missing value suitable for factorization.\n\n Returns\n -------\n values : ndarray\n\n An array suitable for factorization. This should maintain order\n and be a supported dtype (Float64, Int64, UInt64, String, Object).\n By default, the extension array is cast to object dtype.\n na_value : object\n The value in `values` to consider missing. This will be treated\n as NA in the factorization routines, so it will be coded as\n `na_sentinal` and not included in `uniques`. By default,\n ``np.nan`` is used.\n\n Notes\n -----\n The values returned by this method are also used in\n :func:`pandas.util.hash_pandas_object`.\n \"\"\"\n return self.astype(object), np.nan"} {"instruction": "Write a Python 3 function that can\nreturn the internal factorization of the internal array.", "response": "def factorize(\n self,\n na_sentinel: int = -1,\n ) -> Tuple[np.ndarray, ABCExtensionArray]:\n \"\"\"\n Encode the extension array as an enumerated type.\n\n Parameters\n ----------\n na_sentinel : int, default -1\n Value to use in the `labels` array to indicate missing values.\n\n Returns\n -------\n labels : ndarray\n An integer NumPy array that's an indexer into the original\n ExtensionArray.\n uniques : ExtensionArray\n An ExtensionArray containing the unique values of `self`.\n\n .. 
note::\n\n uniques will *not* contain an entry for the NA value of\n the ExtensionArray if there are any missing values present\n in `self`.\n\n See Also\n --------\n pandas.factorize : Top-level factorize method that dispatches here.\n\n Notes\n -----\n :meth:`pandas.factorize` offers a `sort` keyword as well.\n \"\"\"\n # Impelmentor note: There are two ways to override the behavior of\n # pandas.factorize\n # 1. _values_for_factorize and _from_factorize.\n # Specify the values passed to pandas' internal factorization\n # routines, and how to convert from those values back to the\n # original ExtensionArray.\n # 2. ExtensionArray.factorize.\n # Complete control over factorization.\n from pandas.core.algorithms import _factorize_array\n\n arr, na_value = self._values_for_factorize()\n\n labels, uniques = _factorize_array(arr, na_sentinel=na_sentinel,\n na_value=na_value)\n\n uniques = self._from_factorized(uniques, self)\n return labels, uniques"} {"instruction": "Explain what the following Python 3 code does\ndef take(\n self,\n indices: Sequence[int],\n allow_fill: bool = False,\n fill_value: Any = None\n ) -> ABCExtensionArray:\n \"\"\"\n Take elements from an array.\n\n Parameters\n ----------\n indices : sequence of integers\n Indices to be taken.\n allow_fill : bool, default False\n How to handle negative values in `indices`.\n\n * False: negative values in `indices` indicate positional indices\n from the right (the default). This is similar to\n :func:`numpy.take`.\n\n * True: negative values in `indices` indicate\n missing values. These values are set to `fill_value`. 
Any other\n other negative values raise a ``ValueError``.\n\n fill_value : any, optional\n Fill value to use for NA-indices when `allow_fill` is True.\n This may be ``None``, in which case the default NA value for\n the type, ``self.dtype.na_value``, is used.\n\n For many ExtensionArrays, there will be two representations of\n `fill_value`: a user-facing \"boxed\" scalar, and a low-level\n physical NA value. `fill_value` should be the user-facing version,\n and the implementation should handle translating that to the\n physical version for processing the take if necessary.\n\n Returns\n -------\n ExtensionArray\n\n Raises\n ------\n IndexError\n When the indices are out of bounds for the array.\n ValueError\n When `indices` contains negative values other than ``-1``\n and `allow_fill` is True.\n\n Notes\n -----\n ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,\n ``iloc``, when `indices` is a sequence of values. Additionally,\n it's called by :meth:`Series.reindex`, or any other method\n that causes realignment, with a `fill_value`.\n\n See Also\n --------\n numpy.take\n pandas.api.extensions.take\n\n Examples\n --------\n Here's an example implementation, which relies on casting the\n extension array to object dtype. This uses the helper method\n :func:`pandas.api.extensions.take`.\n\n .. 
code-block:: python\n\n def take(self, indices, allow_fill=False, fill_value=None):\n from pandas.core.algorithms import take\n\n # If the ExtensionArray is backed by an ndarray, then\n # just pass that here instead of coercing to object.\n data = self.astype(object)\n\n if allow_fill and fill_value is None:\n fill_value = self.dtype.na_value\n\n # fill value should always be translated from the scalar\n # type for the array, to the physical storage type for\n # the data, before passing to take.\n\n result = take(data, indices, fill_value=fill_value,\n allow_fill=allow_fill)\n return self._from_sequence(result, dtype=self.dtype)\n \"\"\"\n # Implementer note: The `fill_value` parameter should be a user-facing\n # value, an instance of self.dtype.type. When passed `fill_value=None`,\n # the default of `self.dtype.na_value` should be used.\n # This may differ from the physical storage type your ExtensionArray\n # uses. In this case, your implementation is responsible for casting\n # the user-facing type to the storage type, before using\n # pandas.api.extensions.take\n raise AbstractMethodError(self)", "response": "Take elements from an array."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _formatter(\n self,\n boxed: bool = False,\n ) -> Callable[[Any], Optional[str]]:\n \"\"\"Formatting function for scalar values.\n\n This is used in the default '__repr__'. The returned formatting\n function receives instances of your scalar type.\n\n Parameters\n ----------\n boxed: bool, default False\n An indicated for whether or not your array is being printed\n within a Series, DataFrame, or Index (True), or just by\n itself (False). This may be useful if you want scalar values\n to appear differently within a Series versus on its own (e.g.\n quoted or not).\n\n Returns\n -------\n Callable[[Any], str]\n A callable that gets instances of the scalar type and\n returns a string. 
By default, :func:`repr` is used\n when ``boxed=False`` and :func:`str` is used when\n ``boxed=True``.\n \"\"\"\n if boxed:\n return str\n return repr", "response": "Returns a function that returns a string representation of the array."} {"instruction": "How would you implement a function in Python 3 that\nreturns a scalar result of performing the reduction operation.", "response": "def _reduce(self, name, skipna=True, **kwargs):\n \"\"\"\n Return a scalar result of performing the reduction operation.\n\n Parameters\n ----------\n name : str\n Name of the function, supported values are:\n { any, all, min, max, sum, mean, median, prod,\n std, var, sem, kurt, skew }.\n skipna : bool, default True\n If True, skip NaN values.\n **kwargs\n Additional keyword arguments passed to the reduction function.\n Currently, `ddof` is the only supported kwarg.\n\n Returns\n -------\n scalar\n\n Raises\n ------\n TypeError : subclass does not define reductions\n \"\"\"\n raise TypeError(\"cannot perform {name} with type {dtype}\".format(\n name=name, dtype=self.dtype))"} {"instruction": "Can you create a Python 3 function that\nmakes an alias for a method of the underlying ExtensionArray. 
Parameters ---------- array_method : method on an Array class Returns ------- method", "response": "def ea_passthrough(array_method):\n \"\"\"\n Make an alias for a method of the underlying ExtensionArray.\n\n Parameters\n ----------\n array_method : method on an Array class\n\n Returns\n -------\n method\n \"\"\"\n\n def method(self, *args, **kwargs):\n return array_method(self._data, *args, **kwargs)\n\n method.__name__ = array_method.__name__\n method.__doc__ = array_method.__doc__\n return method"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _create_comparison_method(cls, op):\n def wrapper(self, other):\n if isinstance(other, ABCSeries):\n # the arrays defer to Series for comparison ops but the indexes\n # don't, so we have to unwrap here.\n other = other._values\n\n result = op(self._data, maybe_unwrap_index(other))\n return result\n\n wrapper.__doc__ = op.__doc__\n wrapper.__name__ = '__{}__'.format(op.__name__)\n return wrapper", "response": "Create a comparison method that dispatches to cls. 
values."} {"instruction": "Write a Python 3 script to\ndetermine if two Index objects contain the same elements.", "response": "def equals(self, other):\n \"\"\"\n Determines if two Index objects contain the same elements.\n \"\"\"\n if self.is_(other):\n return True\n\n if not isinstance(other, ABCIndexClass):\n return False\n elif not isinstance(other, type(self)):\n try:\n other = type(self)(other)\n except Exception:\n return False\n\n if not is_dtype_equal(self.dtype, other.dtype):\n # have different timezone\n return False\n\n elif is_period_dtype(self):\n if not is_period_dtype(other):\n return False\n if self.freq != other.freq:\n return False\n\n return np.array_equal(self.asi8, other.asi8)"} {"instruction": "Implement a function in Python 3 to\ncreate the join wrapper methods.", "response": "def _join_i8_wrapper(joinf, dtype, with_indexers=True):\n \"\"\"\n Create the join wrapper methods.\n \"\"\"\n from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin\n\n @staticmethod\n def wrapper(left, right):\n if isinstance(left, (np.ndarray, ABCIndex, ABCSeries,\n DatetimeLikeArrayMixin)):\n left = left.view('i8')\n if isinstance(right, (np.ndarray, ABCIndex, ABCSeries,\n DatetimeLikeArrayMixin)):\n right = right.view('i8')\n results = joinf(left, right)\n if with_indexers:\n join_index, left_indexer, right_indexer = results\n join_index = join_index.view(dtype)\n return join_index, left_indexer, right_indexer\n return results\n\n return wrapper"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef sort_values(self, return_indexer=False, ascending=True):\n if return_indexer:\n _as = self.argsort()\n if not ascending:\n _as = _as[::-1]\n sorted_index = self.take(_as)\n return sorted_index, _as\n else:\n sorted_values = np.sort(self._ndarray_values)\n attribs = self._get_attributes_dict()\n freq = attribs['freq']\n\n if freq is not None and not is_period_dtype(self):\n if freq.n > 0 and not ascending:\n freq = 
freq * -1\n elif freq.n < 0 and ascending:\n freq = freq * -1\n attribs['freq'] = freq\n\n if not ascending:\n sorted_values = sorted_values[::-1]\n\n return self._simple_new(sorted_values, **attribs)", "response": "Sort the values of the array."} {"instruction": "Given the following Python 3 function, write the documentation\ndef min(self, axis=None, skipna=True, *args, **kwargs):\n nv.validate_min(args, kwargs)\n nv.validate_minmax_axis(axis)\n\n if not len(self):\n return self._na_value\n\n i8 = self.asi8\n try:\n # quick check\n if len(i8) and self.is_monotonic:\n if i8[0] != iNaT:\n return self._box_func(i8[0])\n\n if self.hasnans:\n if skipna:\n min_stamp = self[~self._isnan].asi8.min()\n else:\n return self._na_value\n else:\n min_stamp = i8.min()\n return self._box_func(min_stamp)\n except ValueError:\n return self._na_value", "response": "Return the minimum value of the Index or minimum along an axis."} {"instruction": "Write a Python 3 script for\nreturning the indices of the minimum values along an axis.", "response": "def argmin(self, axis=None, skipna=True, *args, **kwargs):\n \"\"\"\n Returns the indices of the minimum values along an axis.\n\n See `numpy.ndarray.argmin` for more information on the\n `axis` parameter.\n\n See Also\n --------\n numpy.ndarray.argmin\n \"\"\"\n nv.validate_argmin(args, kwargs)\n nv.validate_minmax_axis(axis)\n\n i8 = self.asi8\n if self.hasnans:\n mask = self._isnan\n if mask.all() or not skipna:\n return -1\n i8 = i8.copy()\n i8[mask] = np.iinfo('int64').max\n return i8.argmin()"} {"instruction": "Write a Python 3 function that can\nreturn the maximum value of the Index or maximum along an axis.", "response": "def max(self, axis=None, skipna=True, *args, **kwargs):\n \"\"\"\n Return the maximum value of the Index or maximum along\n an axis.\n\n See Also\n --------\n numpy.ndarray.max\n Series.max : Return the maximum value in a Series.\n \"\"\"\n nv.validate_max(args, kwargs)\n nv.validate_minmax_axis(axis)\n\n if not 
len(self):\n return self._na_value\n\n i8 = self.asi8\n try:\n # quick check\n if len(i8) and self.is_monotonic:\n if i8[-1] != iNaT:\n return self._box_func(i8[-1])\n\n if self.hasnans:\n if skipna:\n max_stamp = self[~self._isnan].asi8.max()\n else:\n return self._na_value\n else:\n max_stamp = i8.max()\n return self._box_func(max_stamp)\n except ValueError:\n return self._na_value"} {"instruction": "Explain what the following Python 3 code does\ndef argmax(self, axis=None, skipna=True, *args, **kwargs):\n nv.validate_argmax(args, kwargs)\n nv.validate_minmax_axis(axis)\n\n i8 = self.asi8\n if self.hasnans:\n mask = self._isnan\n if mask.all() or not skipna:\n return -1\n i8 = i8.copy()\n i8[mask] = 0\n return i8.argmax()", "response": "Returns the indices of the maximum values along an axis."} {"instruction": "How would you implement a function in Python 3 that\nreturns a list of tuples of the attr formatted_value.", "response": "def _format_attrs(self):\n \"\"\"\n Return a list of tuples of the (attr,formatted_value).\n \"\"\"\n attrs = super()._format_attrs()\n for attrib in self._attributes:\n if attrib == 'freq':\n freq = self.freqstr\n if freq is not None:\n freq = \"'%s'\" % freq\n attrs.append(('freq', freq))\n return attrs"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _convert_scalar_indexer(self, key, kind=None):\n\n assert kind in ['ix', 'loc', 'getitem', 'iloc', None]\n\n # we don't allow integer/float indexing for loc\n # we don't allow float indexing for ix/getitem\n if is_scalar(key):\n is_int = is_integer(key)\n is_flt = is_float(key)\n if kind in ['loc'] and (is_int or is_flt):\n self._invalid_indexer('index', key)\n elif kind in ['ix', 'getitem'] and is_flt:\n self._invalid_indexer('index', key)\n\n return super()._convert_scalar_indexer(key, kind=kind)", "response": "Convert a scalar indexer to an index."} {"instruction": "Write a Python 3 function for\nadding in the datetimelike methods.", 
"response": "def _add_datetimelike_methods(cls):\n \"\"\"\n Add in the datetimelike methods (as we may have to override the\n superclass).\n \"\"\"\n\n def __add__(self, other):\n # dispatch to ExtensionArray implementation\n result = self._data.__add__(maybe_unwrap_index(other))\n return wrap_arithmetic_op(self, other, result)\n\n cls.__add__ = __add__\n\n def __radd__(self, other):\n # alias for __add__\n return self.__add__(other)\n cls.__radd__ = __radd__\n\n def __sub__(self, other):\n # dispatch to ExtensionArray implementation\n result = self._data.__sub__(maybe_unwrap_index(other))\n return wrap_arithmetic_op(self, other, result)\n\n cls.__sub__ = __sub__\n\n def __rsub__(self, other):\n result = self._data.__rsub__(maybe_unwrap_index(other))\n return wrap_arithmetic_op(self, other, result)\n\n cls.__rsub__ = __rsub__"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef isin(self, values):\n if not isinstance(values, type(self)):\n try:\n values = type(self)(values)\n except ValueError:\n return self.astype(object).isin(values)\n\n return algorithms.isin(self.asi8, values.asi8)", "response": "Compute boolean array of whether each index value is found in the\n passed set of values."} {"instruction": "Explain what the following Python 3 code does\ndef _summary(self, name=None):\n formatter = self._formatter_func\n if len(self) > 0:\n index_summary = ', %s to %s' % (formatter(self[0]),\n formatter(self[-1]))\n else:\n index_summary = ''\n\n if name is None:\n name = type(self).__name__\n result = '%s: %s entries%s' % (printing.pprint_thing(name),\n len(self), index_summary)\n if self.freq:\n result += '\\nFreq: %s' % self.freqstr\n\n # display as values, not quoted\n result = result.replace(\"'\", \"\")\n return result", "response": "Return a summarized representation of the index and entries."} {"instruction": "Implement a Python 3 function for\nconcatenating to_concat which has the same dtype.", "response": "def 
_concat_same_dtype(self, to_concat, name):\n \"\"\"\n Concatenate to_concat which has the same class.\n \"\"\"\n attribs = self._get_attributes_dict()\n attribs['name'] = name\n # do not pass tz to set because tzlocal cannot be hashed\n if len({str(x.dtype) for x in to_concat}) != 1:\n raise ValueError('to_concat must have the same tz')\n\n new_data = type(self._values)._concat_same_type(to_concat).asi8\n\n # GH 3232: If the concat result is evenly spaced, we can retain the\n # original frequency\n is_diff_evenly_spaced = len(unique_deltas(new_data)) == 1\n if not is_period_dtype(self) and not is_diff_evenly_spaced:\n # reset freq\n attribs['freq'] = None\n\n return self._simple_new(new_data, **attribs)"} {"instruction": "Implement a Python 3 function for\nshifting the index by desired number of time frequency increments.", "response": "def shift(self, periods, freq=None):\n \"\"\"\n Shift index by desired number of time frequency increments.\n\n This method is for shifting the values of datetime-like indexes\n by a specified time increment a given number of times.\n\n Parameters\n ----------\n periods : int\n Number of periods (or increments) to shift by,\n can be positive or negative.\n\n .. 
versionchanged:: 0.24.0\n\n freq : pandas.DateOffset, pandas.Timedelta or string, optional\n Frequency increment to shift by.\n If None, the index is shifted by its own `freq` attribute.\n Offset aliases are valid strings, e.g., 'D', 'W', 'M' etc.\n\n Returns\n -------\n pandas.DatetimeIndex\n Shifted index.\n\n See Also\n --------\n Index.shift : Shift values of Index.\n PeriodIndex.shift : Shift values of PeriodIndex.\n \"\"\"\n result = self._data._time_shift(periods, freq=freq)\n return type(self)(result, name=self.name)"} {"instruction": "Make a summary of the following Python 3 code\ndef _single_replace(self, to_replace, method, inplace, limit):\n if self.ndim != 1:\n raise TypeError('cannot replace {0} with method {1} on a {2}'\n .format(to_replace, method, type(self).__name__))\n\n orig_dtype = self.dtype\n result = self if inplace else self.copy()\n fill_f = missing.get_fill_func(method)\n\n mask = missing.mask_missing(result.values, to_replace)\n values = fill_f(result.values, limit=limit, mask=mask)\n\n if values.dtype == orig_dtype and inplace:\n return\n\n result = pd.Series(values, index=self.index,\n dtype=self.dtype).__finalize__(self)\n\n if inplace:\n self._update_inplace(result._data)\n return\n\n return result", "response": "Replace values in a Series using a fill method specified when no no_match replacement value is given in the replace method."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _doc_parms(cls):\n axis_descr = \"{%s}\" % ', '.join(\"{0} ({1})\".format(a, i)\n for i, a in enumerate(cls._AXIS_ORDERS))\n name = (cls._constructor_sliced.__name__\n if cls._AXIS_LEN > 1 else 'scalar')\n name2 = cls.__name__\n return axis_descr, name, name2", "response": "Return a tuple of the doc parms."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _init_mgr(self, mgr, axes=None, dtype=None, copy=False):\n for a, axe in axes.items():\n if axe is not None:\n mgr = 
mgr.reindex_axis(axe,\n axis=self._get_block_manager_axis(a),\n copy=False)\n\n # make a copy if explicitly requested\n if copy:\n mgr = mgr.copy()\n if dtype is not None:\n # avoid further copies if we can\n if len(mgr.blocks) > 1 or mgr.blocks[0].values.dtype != dtype:\n mgr = mgr.astype(dtype=dtype)\n return mgr", "response": "init a manager with the specified axes and dtype"} {"instruction": "Implement a function in Python 3 to\nvalidate the passed dtype", "response": "def _validate_dtype(self, dtype):\n \"\"\" validate the passed dtype \"\"\"\n\n if dtype is not None:\n dtype = pandas_dtype(dtype)\n\n # a compound dtype\n if dtype.kind == 'V':\n raise NotImplementedError(\"compound dtypes are not implemented\"\n \" in the {0} constructor\"\n .format(self.__class__.__name__))\n\n return dtype"} {"instruction": "Write a Python 3 script for\nproviding axes setup for the major PandasObjects.", "response": "def _setup_axes(cls, axes, info_axis=None, stat_axis=None, aliases=None,\n slicers=None, axes_are_reversed=False, build_axes=True,\n ns=None, docs=None):\n \"\"\"Provide axes setup for the major PandasObjects.\n\n Parameters\n ----------\n axes : the names of the axes in order (lowest to highest)\n info_axis_num : the axis of the selector dimension (int)\n stat_axis_num : the number of axis for the default stats (int)\n aliases : other names for a single axis (dict)\n slicers : how axes slice to others (dict)\n axes_are_reversed : boolean whether to treat passed axes as\n reversed (DataFrame)\n build_axes : setup the axis properties (default True)\n \"\"\"\n\n cls._AXIS_ORDERS = axes\n cls._AXIS_NUMBERS = {a: i for i, a in enumerate(axes)}\n cls._AXIS_LEN = len(axes)\n cls._AXIS_ALIASES = aliases or dict()\n cls._AXIS_IALIASES = {v: k for k, v in cls._AXIS_ALIASES.items()}\n cls._AXIS_NAMES = dict(enumerate(axes))\n cls._AXIS_SLICEMAP = slicers or None\n cls._AXIS_REVERSED = axes_are_reversed\n\n # typ\n setattr(cls, '_typ', cls.__name__.lower())\n\n # indexing 
support\n cls._ix = None\n\n if info_axis is not None:\n cls._info_axis_number = info_axis\n cls._info_axis_name = axes[info_axis]\n\n if stat_axis is not None:\n cls._stat_axis_number = stat_axis\n cls._stat_axis_name = axes[stat_axis]\n\n # setup the actual axis\n if build_axes:\n\n def set_axis(a, i):\n setattr(cls, a, properties.AxisProperty(i, docs.get(a, a)))\n cls._internal_names_set.add(a)\n\n if axes_are_reversed:\n m = cls._AXIS_LEN - 1\n for i, a in cls._AXIS_NAMES.items():\n set_axis(a, m - i)\n else:\n for i, a in cls._AXIS_NAMES.items():\n set_axis(a, i)\n\n assert not isinstance(ns, dict)"} {"instruction": "Write a Python 3 script to\nreturn an axes dictionary for myself.", "response": "def _construct_axes_dict(self, axes=None, **kwargs):\n \"\"\"Return an axes dictionary for myself.\"\"\"\n d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)}\n d.update(kwargs)\n return d"} {"instruction": "How would you explain what the following Python 3 function does\ndef _construct_axes_dict_from(self, axes, **kwargs):\n d = {a: ax for a, ax in zip(self._AXIS_ORDERS, axes)}\n d.update(kwargs)\n return d", "response": "Return an axes dictionary for the passed axes."} {"instruction": "Create a Python 3 function to\nreturn an axes dictionary for myself.", "response": "def _construct_axes_dict_for_slice(self, axes=None, **kwargs):\n \"\"\"Return an axes dictionary for myself.\"\"\"\n d = {self._AXIS_SLICEMAP[a]: self._get_axis(a)\n for a in (axes or self._AXIS_ORDERS)}\n d.update(kwargs)\n return d"} {"instruction": "Create a Python 3 function to\nconstruct and returns axes from the supplied arguments.", "response": "def _construct_axes_from_arguments(\n self, args, kwargs, require_all=False, sentinel=None):\n \"\"\"Construct and returns axes if supplied in args/kwargs.\n\n If require_all, raise if all axis arguments are not supplied\n return a tuple of (axes, kwargs).\n\n sentinel specifies the default parameter when an axis is not\n supplied; useful to 
distinguish when a user explicitly passes None\n in scenarios where None has special meaning.\n \"\"\"\n\n # construct the args\n args = list(args)\n for a in self._AXIS_ORDERS:\n\n # if we have an alias for this axis\n alias = self._AXIS_IALIASES.get(a)\n if alias is not None:\n if a in kwargs:\n if alias in kwargs:\n raise TypeError(\"arguments are mutually exclusive \"\n \"for [%s,%s]\" % (a, alias))\n continue\n if alias in kwargs:\n kwargs[a] = kwargs.pop(alias)\n continue\n\n # look for a argument by position\n if a not in kwargs:\n try:\n kwargs[a] = args.pop(0)\n except IndexError:\n if require_all:\n raise TypeError(\"not enough/duplicate arguments \"\n \"specified!\")\n\n axes = {a: kwargs.pop(a, sentinel) for a in self._AXIS_ORDERS}\n return axes, kwargs"} {"instruction": "How would you explain what the following Python 3 function does\ndef _get_block_manager_axis(cls, axis):\n axis = cls._get_axis_number(axis)\n if cls._AXIS_REVERSED:\n m = cls._AXIS_LEN - 1\n return m - axis\n return axis", "response": "Map the axis to the block_manager axis."} {"instruction": "How would you code a function in Python 3 to\nreturn the space character free column resolvers of a dataframe.", "response": "def _get_space_character_free_column_resolvers(self):\n \"\"\"Return the space character free column resolvers of a dataframe.\n\n Column names with spaces are 'cleaned up' so that they can be referred\n to by backtick quoting.\n Used in :meth:`DataFrame.eval`.\n \"\"\"\n from pandas.core.computation.common import _remove_spaces_column_name\n\n return {_remove_spaces_column_name(k): v for k, v\n in self.iteritems()}"} {"instruction": "Given the following Python 3 function, write the documentation\ndef shape(self):\n return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS)", "response": "Return a tuple of axis dimensions"} {"instruction": "Write a Python 3 script for\ntransposing the underlying data into a new set of unique elements.", "response": "def 
transpose(self, *args, **kwargs):\n \"\"\"\n Permute the dimensions of the %(klass)s\n\n Parameters\n ----------\n args : %(args_transpose)s\n copy : boolean, default False\n Make a copy of the underlying data. Mixed-dtype data will\n always result in a copy\n **kwargs\n Additional keyword arguments will be passed to the function.\n\n Returns\n -------\n y : same as input\n\n Examples\n --------\n >>> p.transpose(2, 0, 1)\n >>> p.transpose(2, 0, 1, copy=True)\n \"\"\"\n\n # construct the args\n axes, kwargs = self._construct_axes_from_arguments(args, kwargs,\n require_all=True)\n axes_names = tuple(self._get_axis_name(axes[a])\n for a in self._AXIS_ORDERS)\n axes_numbers = tuple(self._get_axis_number(axes[a])\n for a in self._AXIS_ORDERS)\n\n # we must have unique axes\n if len(axes) != len(set(axes)):\n raise ValueError('Must specify %s unique axes' % self._AXIS_LEN)\n\n new_axes = self._construct_axes_dict_from(self, [self._get_axis(x)\n for x in axes_names])\n new_values = self.values.transpose(axes_numbers)\n if kwargs.pop('copy', None) or (len(args) and args[-1]):\n new_values = new_values.copy()\n\n nv.validate_transpose_for_generic(self, kwargs)\n return self._constructor(new_values, **new_axes).__finalize__(self)"} {"instruction": "How would you explain what the following Python 3 function does\ndef swapaxes(self, axis1, axis2, copy=True):\n i = self._get_axis_number(axis1)\n j = self._get_axis_number(axis2)\n\n if i == j:\n if copy:\n return self.copy()\n return self\n\n mapping = {i: j, j: i}\n\n new_axes = (self._get_axis(mapping.get(k, k))\n for k in range(self._AXIS_LEN))\n new_values = self.values.swapaxes(i, j)\n if copy:\n new_values = new_values.copy()\n\n return self._constructor(new_values, *new_axes).__finalize__(self)", "response": "Interchange axes and swap values axes appropriately."} {"instruction": "Implement a Python 3 function for\nreturning a new DataFrame with requested index and column level removed.", "response": "def droplevel(self, 
level, axis=0):\n \"\"\"\n Return DataFrame with requested index / column level(s) removed.\n\n .. versionadded:: 0.24.0\n\n Parameters\n ----------\n level : int, str, or list-like\n If a string is given, must be the name of a level\n If list-like, elements must be names or positional indexes\n of levels.\n\n axis : {0 or 'index', 1 or 'columns'}, default 0\n\n Returns\n -------\n DataFrame.droplevel()\n\n Examples\n --------\n >>> df = pd.DataFrame([\n ... [1, 2, 3, 4],\n ... [5, 6, 7, 8],\n ... [9, 10, 11, 12]\n ... ]).set_index([0, 1]).rename_axis(['a', 'b'])\n\n >>> df.columns = pd.MultiIndex.from_tuples([\n ... ('c', 'e'), ('d', 'f')\n ... ], names=['level_1', 'level_2'])\n\n >>> df\n level_1 c d\n level_2 e f\n a b\n 1 2 3 4\n 5 6 7 8\n 9 10 11 12\n\n >>> df.droplevel('a')\n level_1 c d\n level_2 e f\n b\n 2 3 4\n 6 7 8\n 10 11 12\n\n >>> df.droplevel('level2', axis=1)\n level_1 c d\n a b\n 1 2 3 4\n 5 6 7 8\n 9 10 11 12\n \"\"\"\n labels = self._get_axis(axis)\n new_labels = labels.droplevel(level)\n result = self.set_axis(new_labels, axis=axis, inplace=False)\n return result"} {"instruction": "Make a summary of the following Python 3 code\ndef squeeze(self, axis=None):\n axis = (self._AXIS_NAMES if axis is None else\n (self._get_axis_number(axis),))\n try:\n return self.iloc[\n tuple(0 if i in axis and len(a) == 1 else slice(None)\n for i, a in enumerate(self.axes))]\n except Exception:\n return self", "response": "Squeeze the elements of the a\n object into a Series or DataFrames."} {"instruction": "Write a Python 3 function that can\nreturn a new MultiIndex with the swap levels i and j in a particular axis.", "response": "def swaplevel(self, i=-2, j=-1, axis=0):\n \"\"\"\n Swap levels i and j in a MultiIndex on a particular axis\n\n Parameters\n ----------\n i, j : int, str (can be mixed)\n Level of index to be swapped. Can pass level name as string.\n\n Returns\n -------\n swapped : same type as caller (new object)\n\n .. 
versionchanged:: 0.18.1\n\n The indexes ``i`` and ``j`` are now optional, and default to\n the two innermost levels of the index.\n \"\"\"\n axis = self._get_axis_number(axis)\n result = self.copy()\n labels = result._data.axes[axis]\n result._data.set_axis(axis, labels.swaplevel(i, j))\n return result"} {"instruction": "How would you code a function in Python 3 to\nrename the names of the specified key - value pairs.", "response": "def rename(self, *args, **kwargs):\n \"\"\"\n Alter axes input function or functions. Function / dict values must be\n unique (1-to-1). Labels not contained in a dict / Series will be left\n as-is. Extra labels listed don't throw an error. Alternatively, change\n ``Series.name`` with a scalar value (Series only).\n\n Parameters\n ----------\n %(axes)s : scalar, list-like, dict-like or function, optional\n Scalar or list-like will alter the ``Series.name`` attribute,\n and raise on DataFrame or Panel.\n dict-like or functions are transformations to apply to\n that axis' values\n copy : bool, default True\n Also copy underlying data.\n inplace : bool, default False\n Whether to return a new %(klass)s. 
If True then value of copy is\n ignored.\n level : int or level name, default None\n In case of a MultiIndex, only rename labels in the specified\n level.\n errors : {'ignore', 'raise'}, default 'ignore'\n If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`,\n or `columns` contains labels that are not present in the Index\n being transformed.\n If 'ignore', existing keys will be renamed and extra keys will be\n ignored.\n\n Returns\n -------\n renamed : %(klass)s (new object)\n\n Raises\n ------\n KeyError\n If any of the labels is not found in the selected axis and\n \"errors='raise'\".\n\n See Also\n --------\n NDFrame.rename_axis\n\n Examples\n --------\n\n >>> s = pd.Series([1, 2, 3])\n >>> s\n 0 1\n 1 2\n 2 3\n dtype: int64\n >>> s.rename(\"my_name\") # scalar, changes Series.name\n 0 1\n 1 2\n 2 3\n Name: my_name, dtype: int64\n >>> s.rename(lambda x: x ** 2) # function, changes labels\n 0 1\n 1 2\n 4 3\n dtype: int64\n >>> s.rename({1: 3, 2: 5}) # mapping, changes labels\n 0 1\n 3 2\n 5 3\n dtype: int64\n\n Since ``DataFrame`` doesn't have a ``.name`` attribute,\n only mapping-type arguments are allowed.\n\n >>> df = pd.DataFrame({\"A\": [1, 2, 3], \"B\": [4, 5, 6]})\n >>> df.rename(2)\n Traceback (most recent call last):\n ...\n TypeError: 'int' object is not callable\n\n ``DataFrame.rename`` supports two calling conventions\n\n * ``(index=index_mapper, columns=columns_mapper, ...)``\n * ``(mapper, axis={'index', 'columns'}, ...)``\n\n We *highly* recommend using keyword arguments to clarify your\n intent.\n\n >>> df.rename(index=str, columns={\"A\": \"a\", \"B\": \"c\"})\n a c\n 0 1 4\n 1 2 5\n 2 3 6\n\n >>> df.rename(index=str, columns={\"A\": \"a\", \"C\": \"c\"})\n a B\n 0 1 4\n 1 2 5\n 2 3 6\n\n Using axis-style parameters\n\n >>> df.rename(str.lower, axis='columns')\n a b\n 0 1 4\n 1 2 5\n 2 3 6\n\n >>> df.rename({1: 2, 2: 4}, axis='index')\n A B\n 0 1 4\n 2 2 5\n 4 3 6\n\n See the :ref:`user guide ` for more.\n \"\"\"\n axes, kwargs = 
self._construct_axes_from_arguments(args, kwargs)\n copy = kwargs.pop('copy', True)\n inplace = kwargs.pop('inplace', False)\n level = kwargs.pop('level', None)\n axis = kwargs.pop('axis', None)\n errors = kwargs.pop('errors', 'ignore')\n if axis is not None:\n # Validate the axis\n self._get_axis_number(axis)\n\n if kwargs:\n raise TypeError('rename() got an unexpected keyword '\n 'argument \"{0}\"'.format(list(kwargs.keys())[0]))\n\n if com.count_not_none(*axes.values()) == 0:\n raise TypeError('must pass an index to rename')\n\n self._consolidate_inplace()\n result = self if inplace else self.copy(deep=copy)\n\n # start in the axis order to eliminate too many copies\n for axis in lrange(self._AXIS_LEN):\n v = axes.get(self._AXIS_NAMES[axis])\n if v is None:\n continue\n f = com._get_rename_function(v)\n baxis = self._get_block_manager_axis(axis)\n if level is not None:\n level = self.axes[axis]._get_level_number(level)\n\n # GH 13473\n if not callable(v):\n indexer = self.axes[axis].get_indexer_for(v)\n if errors == 'raise' and len(indexer[indexer == -1]):\n missing_labels = [label for index, label in enumerate(v)\n if indexer[index] == -1]\n raise KeyError('{} not found in axis'\n .format(missing_labels))\n\n result._data = result._data.rename_axis(f, axis=baxis, copy=copy,\n level=level)\n result._clear_item_cache()\n\n if inplace:\n self._update_inplace(result._data)\n else:\n return result.__finalize__(self)"} {"instruction": "How would you implement a function in Python 3 that\nreturns a new Series or DataFrame with the new names on the specified axis.", "response": "def rename_axis(self, mapper=sentinel, **kwargs):\n \"\"\"\n Set the name of the axis for the index or columns.\n\n Parameters\n ----------\n mapper : scalar, list-like, optional\n Value to set the axis name attribute.\n index, columns : scalar, list-like, dict-like or function, optional\n A scalar, list-like, dict-like or functions transformations to\n apply to that axis' values.\n\n Use 
either ``mapper`` and ``axis`` to\n specify the axis to target with ``mapper``, or ``index``\n and/or ``columns``.\n\n .. versionchanged:: 0.24.0\n\n axis : {0 or 'index', 1 or 'columns'}, default 0\n The axis to rename.\n copy : bool, default True\n Also copy underlying data.\n inplace : bool, default False\n Modifies the object directly, instead of creating a new Series\n or DataFrame.\n\n Returns\n -------\n Series, DataFrame, or None\n The same type as the caller or None if `inplace` is True.\n\n See Also\n --------\n Series.rename : Alter Series index labels or name.\n DataFrame.rename : Alter DataFrame index labels or name.\n Index.rename : Set new names on index.\n\n Notes\n -----\n Prior to version 0.21.0, ``rename_axis`` could also be used to change\n the axis *labels* by passing a mapping or scalar. This behavior is\n deprecated and will be removed in a future version. Use ``rename``\n instead.\n\n ``DataFrame.rename_axis`` supports two calling conventions\n\n * ``(index=index_mapper, columns=columns_mapper, ...)``\n * ``(mapper, axis={'index', 'columns'}, ...)``\n\n The first calling convention will only modify the names of\n the index and/or the names of the Index object that is the columns.\n In this case, the parameter ``copy`` is ignored.\n\n The second calling convention will modify the names of the\n the corresponding index if mapper is a list or a scalar.\n However, if mapper is dict-like or a function, it will use the\n deprecated behavior of modifying the axis *labels*.\n\n We *highly* recommend using keyword arguments to clarify your\n intent.\n\n Examples\n --------\n **Series**\n\n >>> s = pd.Series([\"dog\", \"cat\", \"monkey\"])\n >>> s\n 0 dog\n 1 cat\n 2 monkey\n dtype: object\n >>> s.rename_axis(\"animal\")\n animal\n 0 dog\n 1 cat\n 2 monkey\n dtype: object\n\n **DataFrame**\n\n >>> df = pd.DataFrame({\"num_legs\": [4, 4, 2],\n ... \"num_arms\": [0, 0, 2]},\n ... 
[\"dog\", \"cat\", \"monkey\"])\n >>> df\n num_legs num_arms\n dog 4 0\n cat 4 0\n monkey 2 2\n >>> df = df.rename_axis(\"animal\")\n >>> df\n num_legs num_arms\n animal\n dog 4 0\n cat 4 0\n monkey 2 2\n >>> df = df.rename_axis(\"limbs\", axis=\"columns\")\n >>> df\n limbs num_legs num_arms\n animal\n dog 4 0\n cat 4 0\n monkey 2 2\n\n **MultiIndex**\n\n >>> df.index = pd.MultiIndex.from_product([['mammal'],\n ... ['dog', 'cat', 'monkey']],\n ... names=['type', 'name'])\n >>> df\n limbs num_legs num_arms\n type name\n mammal dog 4 0\n cat 4 0\n monkey 2 2\n\n >>> df.rename_axis(index={'type': 'class'})\n limbs num_legs num_arms\n class name\n mammal dog 4 0\n cat 4 0\n monkey 2 2\n\n >>> df.rename_axis(columns=str.upper)\n LIMBS num_legs num_arms\n type name\n mammal dog 4 0\n cat 4 0\n monkey 2 2\n \"\"\"\n axes, kwargs = self._construct_axes_from_arguments(\n (), kwargs, sentinel=sentinel)\n copy = kwargs.pop('copy', True)\n inplace = kwargs.pop('inplace', False)\n axis = kwargs.pop('axis', 0)\n if axis is not None:\n axis = self._get_axis_number(axis)\n\n if kwargs:\n raise TypeError('rename_axis() got an unexpected keyword '\n 'argument \"{0}\"'.format(list(kwargs.keys())[0]))\n\n inplace = validate_bool_kwarg(inplace, 'inplace')\n\n if (mapper is not sentinel):\n # Use v0.23 behavior if a scalar or list\n non_mapper = is_scalar(mapper) or (is_list_like(mapper) and not\n is_dict_like(mapper))\n if non_mapper:\n return self._set_axis_name(mapper, axis=axis, inplace=inplace)\n else:\n # Deprecated (v0.21) behavior is if mapper is specified,\n # and not a list or scalar, then call rename\n msg = (\"Using 'rename_axis' to alter labels is deprecated. \"\n \"Use '.rename' instead\")\n warnings.warn(msg, FutureWarning, stacklevel=3)\n axis = self._get_axis_name(axis)\n d = {'copy': copy, 'inplace': inplace}\n d[axis] = mapper\n return self.rename(**d)\n else:\n # Use new behavior. 
Means that index and/or columns\n # is specified\n result = self if inplace else self.copy(deep=copy)\n\n for axis in lrange(self._AXIS_LEN):\n v = axes.get(self._AXIS_NAMES[axis])\n if v is sentinel:\n continue\n non_mapper = is_scalar(v) or (is_list_like(v) and not\n is_dict_like(v))\n if non_mapper:\n newnames = v\n else:\n f = com._get_rename_function(v)\n curnames = self._get_axis(axis).names\n newnames = [f(name) for name in curnames]\n result._set_axis_name(newnames, axis=axis,\n inplace=True)\n if not inplace:\n return result"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _set_axis_name(self, name, axis=0, inplace=False):\n axis = self._get_axis_number(axis)\n idx = self._get_axis(axis).set_names(name)\n\n inplace = validate_bool_kwarg(inplace, 'inplace')\n renamed = self if inplace else self.copy()\n renamed.set_axis(idx, axis=axis, inplace=True)\n if not inplace:\n return renamed", "response": "Set the name of the specified axis."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef equals(self, other):\n if not isinstance(other, self._constructor):\n return False\n return self._data.equals(other._data)", "response": "Tests whether two Series or DataFrames or DataFrame objects are equal."} {"instruction": "How would you implement a function in Python 3 that\nreturns the boolean value of a single element PandasObject.", "response": "def bool(self):\n \"\"\"\n Return the bool of a single element PandasObject.\n\n This must be a boolean scalar value, either True or False. 
Raise a\n ValueError if the PandasObject does not have exactly 1 element, or that\n element is not boolean\n \"\"\"\n v = self.squeeze()\n if isinstance(v, (bool, np.bool_)):\n return bool(v)\n elif is_scalar(v):\n raise ValueError(\"bool cannot act on a non-boolean single element \"\n \"{0}\".format(self.__class__.__name__))\n\n self.__nonzero__()"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _is_level_reference(self, key, axis=0):\n axis = self._get_axis_number(axis)\n\n if self.ndim > 2:\n raise NotImplementedError(\n \"_is_level_reference is not implemented for {type}\"\n .format(type=type(self)))\n\n return (key is not None and\n is_hashable(key) and\n key in self.axes[axis].names and\n not self._is_label_reference(key, axis=axis))", "response": "Tests whether a key is a level reference for a given axis."} {"instruction": "Given the following Python 3 function, write the documentation\ndef _is_label_reference(self, key, axis=0):\n if self.ndim > 2:\n raise NotImplementedError(\n \"_is_label_reference is not implemented for {type}\"\n .format(type=type(self)))\n\n axis = self._get_axis_number(axis)\n other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis)\n\n return (key is not None and\n is_hashable(key) and\n any(key in self.axes[ax] for ax in other_axes))", "response": "Tests whether a key is a label reference for a given axis."} {"instruction": "Write a Python 3 script to\ntest whether a key is either a label or level reference for a given axis.", "response": "def _is_label_or_level_reference(self, key, axis=0):\n \"\"\"\n Test whether a key is a label or level reference for a given axis.\n\n To be considered either a label or a level reference, `key` must be a\n string that:\n - (axis=0): Matches a column label or an index level\n - (axis=1): Matches an index label or a column level\n\n Parameters\n ----------\n key: str\n Potential label or level name\n axis: int, default 0\n Axis that levels are associated 
with (0 for index, 1 for columns)\n\n Returns\n -------\n is_label_or_level: bool\n \"\"\"\n\n if self.ndim > 2:\n raise NotImplementedError(\n \"_is_label_or_level_reference is not implemented for {type}\"\n .format(type=type(self)))\n\n return (self._is_level_reference(key, axis=axis) or\n self._is_label_reference(key, axis=axis))"} {"instruction": "Can you create a Python 3 function that\nchecks whether a key is ambiguous.", "response": "def _check_label_or_level_ambiguity(self, key, axis=0):\n \"\"\"\n Check whether `key` is ambiguous.\n\n By ambiguous, we mean that it matches both a level of the input\n `axis` and a label of the other axis.\n\n Parameters\n ----------\n key: str or object\n label or level name\n axis: int, default 0\n Axis that levels are associated with (0 for index, 1 for columns)\n\n Raises\n ------\n ValueError: `key` is ambiguous\n \"\"\"\n if self.ndim > 2:\n raise NotImplementedError(\n \"_check_label_or_level_ambiguity is not implemented for {type}\"\n .format(type=type(self)))\n\n axis = self._get_axis_number(axis)\n other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis)\n\n if (key is not None and\n is_hashable(key) and\n key in self.axes[axis].names and\n any(key in self.axes[ax] for ax in other_axes)):\n\n # Build an informative and grammatical warning\n level_article, level_type = (('an', 'index')\n if axis == 0 else\n ('a', 'column'))\n\n label_article, label_type = (('a', 'column')\n if axis == 0 else\n ('an', 'index'))\n\n msg = (\"'{key}' is both {level_article} {level_type} level and \"\n \"{label_article} {label_type} label, which is ambiguous.\"\n ).format(key=key,\n level_article=level_article,\n level_type=level_type,\n label_article=label_article,\n label_type=label_type)\n raise ValueError(msg)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _get_label_or_level_values(self, key, axis=0):\n if self.ndim > 2:\n raise NotImplementedError(\n \"_get_label_or_level_values 
is not implemented for {type}\"\n .format(type=type(self)))\n\n axis = self._get_axis_number(axis)\n other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis]\n\n if self._is_label_reference(key, axis=axis):\n self._check_label_or_level_ambiguity(key, axis=axis)\n values = self.xs(key, axis=other_axes[0])._values\n elif self._is_level_reference(key, axis=axis):\n values = self.axes[axis].get_level_values(key)._values\n else:\n raise KeyError(key)\n\n # Check for duplicates\n if values.ndim > 1:\n\n if other_axes and isinstance(\n self._get_axis(other_axes[0]), MultiIndex):\n multi_message = ('\\n'\n 'For a multi-index, the label must be a '\n 'tuple with elements corresponding to '\n 'each level.')\n else:\n multi_message = ''\n\n label_axis_name = 'column' if axis == 0 else 'index'\n raise ValueError((\"The {label_axis_name} label '{key}' \"\n \"is not unique.{multi_message}\")\n .format(key=key,\n label_axis_name=label_axis_name,\n multi_message=multi_message))\n\n return values", "response": "Returns a 1 - D array of values associated with key a label or level name."} {"instruction": "Explain what the following Python 3 code does\ndef _drop_labels_or_levels(self, keys, axis=0):\n if self.ndim > 2:\n raise NotImplementedError(\n \"_drop_labels_or_levels is not implemented for {type}\"\n .format(type=type(self)))\n\n axis = self._get_axis_number(axis)\n\n # Validate keys\n keys = com.maybe_make_list(keys)\n invalid_keys = [k for k in keys if not\n self._is_label_or_level_reference(k, axis=axis)]\n\n if invalid_keys:\n raise ValueError((\"The following keys are not valid labels or \"\n \"levels for axis {axis}: {invalid_keys}\")\n .format(axis=axis,\n invalid_keys=invalid_keys))\n\n # Compute levels and labels to drop\n levels_to_drop = [k for k in keys\n if self._is_level_reference(k, axis=axis)]\n\n labels_to_drop = [k for k in keys\n if not self._is_level_reference(k, axis=axis)]\n\n # Perform copy upfront and then use inplace operations below.\n # This 
ensures that we always perform exactly one copy.\n # ``copy`` and/or ``inplace`` options could be added in the future.\n dropped = self.copy()\n\n if axis == 0:\n # Handle dropping index levels\n if levels_to_drop:\n dropped.reset_index(levels_to_drop, drop=True, inplace=True)\n\n # Handle dropping columns labels\n if labels_to_drop:\n dropped.drop(labels_to_drop, axis=1, inplace=True)\n else:\n # Handle dropping column levels\n if levels_to_drop:\n if isinstance(dropped.columns, MultiIndex):\n # Drop the specified levels from the MultiIndex\n dropped.columns = dropped.columns.droplevel(levels_to_drop)\n else:\n # Drop the last level of Index by replacing with\n # a RangeIndex\n dropped.columns = RangeIndex(dropped.columns.size)\n\n # Handle dropping index labels\n if labels_to_drop:\n dropped.drop(labels_to_drop, axis=0, inplace=True)\n\n return dropped", "response": "Drop labels and or levels for the given axis."} {"instruction": "How would you explain what the following Python 3 function does\ndef empty(self):\n return any(len(self._get_axis(a)) == 0 for a in self._AXIS_ORDERS)", "response": "Indicate whether the DataFrame is empty."} {"instruction": "Can you create a Python 3 function that\nreturns the data resource as a JSON dict.", "response": "def _repr_data_resource_(self):\n \"\"\"\n Not a real Jupyter special repr method, but we use the same\n naming convention.\n \"\"\"\n if config.get_option(\"display.html.table_schema\"):\n data = self.head(config.get_option('display.max_rows'))\n payload = json.loads(data.to_json(orient='table'),\n object_pairs_hook=collections.OrderedDict)\n return payload"} {"instruction": "How would you implement a function in Python 3 that\nconverts the object to a JSON string.", "response": "def to_json(self, path_or_buf=None, orient=None, date_format=None,\n double_precision=10, force_ascii=True, date_unit='ms',\n default_handler=None, lines=False, compression='infer',\n index=True):\n \"\"\"\n Convert the object to a JSON 
string.\n\n Note NaN's and None will be converted to null and datetime objects\n will be converted to UNIX timestamps.\n\n Parameters\n ----------\n path_or_buf : string or file handle, optional\n File path or object. If not specified, the result is returned as\n a string.\n orient : string\n Indication of expected JSON string format.\n\n * Series\n\n - default is 'index'\n - allowed values are: {'split','records','index','table'}\n\n * DataFrame\n\n - default is 'columns'\n - allowed values are:\n {'split','records','index','columns','values','table'}\n\n * The format of the JSON string\n\n - 'split' : dict like {'index' -> [index],\n 'columns' -> [columns], 'data' -> [values]}\n - 'records' : list like\n [{column -> value}, ... , {column -> value}]\n - 'index' : dict like {index -> {column -> value}}\n - 'columns' : dict like {column -> {index -> value}}\n - 'values' : just the values array\n - 'table' : dict like {'schema': {schema}, 'data': {data}}\n describing the data, and the data component is\n like ``orient='records'``.\n\n .. versionchanged:: 0.20.0\n\n date_format : {None, 'epoch', 'iso'}\n Type of date conversion. 'epoch' = epoch milliseconds,\n 'iso' = ISO8601. The default depends on the `orient`. For\n ``orient='table'``, the default is 'iso'. For all other orients,\n the default is 'epoch'.\n double_precision : int, default 10\n The number of decimal places to use when encoding\n floating point values.\n force_ascii : bool, default True\n Force encoded string to be ASCII.\n date_unit : string, default 'ms' (milliseconds)\n The time unit to encode to, governs timestamp and ISO8601\n precision. One of 's', 'ms', 'us', 'ns' for second, millisecond,\n microsecond, and nanosecond respectively.\n default_handler : callable, default None\n Handler to call if object cannot otherwise be converted to a\n suitable format for JSON. 
Should receive a single argument which is\n the object to convert and return a serialisable object.\n lines : bool, default False\n If 'orient' is 'records' write out line delimited json format. Will\n throw ValueError if incorrect 'orient' since others are not list\n like.\n\n .. versionadded:: 0.19.0\n\n compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}\n\n A string representing the compression to use in the output file,\n only used when the first argument is a filename. By default, the\n compression is inferred from the filename.\n\n .. versionadded:: 0.21.0\n .. versionchanged:: 0.24.0\n 'infer' option added and set to default\n index : bool, default True\n Whether to include the index values in the JSON string. Not\n including the index (``index=False``) is only supported when\n orient is 'split' or 'table'.\n\n .. versionadded:: 0.23.0\n\n See Also\n --------\n read_json\n\n Examples\n --------\n\n >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],\n ... index=['row 1', 'row 2'],\n ... 
columns=['col 1', 'col 2'])\n >>> df.to_json(orient='split')\n '{\"columns\":[\"col 1\",\"col 2\"],\n \"index\":[\"row 1\",\"row 2\"],\n \"data\":[[\"a\",\"b\"],[\"c\",\"d\"]]}'\n\n Encoding/decoding a Dataframe using ``'records'`` formatted JSON.\n Note that index labels are not preserved with this encoding.\n\n >>> df.to_json(orient='records')\n '[{\"col 1\":\"a\",\"col 2\":\"b\"},{\"col 1\":\"c\",\"col 2\":\"d\"}]'\n\n Encoding/decoding a Dataframe using ``'index'`` formatted JSON:\n\n >>> df.to_json(orient='index')\n '{\"row 1\":{\"col 1\":\"a\",\"col 2\":\"b\"},\"row 2\":{\"col 1\":\"c\",\"col 2\":\"d\"}}'\n\n Encoding/decoding a Dataframe using ``'columns'`` formatted JSON:\n\n >>> df.to_json(orient='columns')\n '{\"col 1\":{\"row 1\":\"a\",\"row 2\":\"c\"},\"col 2\":{\"row 1\":\"b\",\"row 2\":\"d\"}}'\n\n Encoding/decoding a Dataframe using ``'values'`` formatted JSON:\n\n >>> df.to_json(orient='values')\n '[[\"a\",\"b\"],[\"c\",\"d\"]]'\n\n Encoding with Table Schema\n\n >>> df.to_json(orient='table')\n '{\"schema\": {\"fields\": [{\"name\": \"index\", \"type\": \"string\"},\n {\"name\": \"col 1\", \"type\": \"string\"},\n {\"name\": \"col 2\", \"type\": \"string\"}],\n \"primaryKey\": \"index\",\n \"pandas_version\": \"0.20.0\"},\n \"data\": [{\"index\": \"row 1\", \"col 1\": \"a\", \"col 2\": \"b\"},\n {\"index\": \"row 2\", \"col 1\": \"c\", \"col 2\": \"d\"}]}'\n \"\"\"\n\n from pandas.io import json\n if date_format is None and orient == 'table':\n date_format = 'iso'\n elif date_format is None:\n date_format = 'epoch'\n return json.to_json(path_or_buf=path_or_buf, obj=self, orient=orient,\n date_format=date_format,\n double_precision=double_precision,\n force_ascii=force_ascii, date_unit=date_unit,\n default_handler=default_handler,\n lines=lines, compression=compression,\n index=index)"} {"instruction": "Write a Python 3 function for\nwriting the contained data to an HDF5 file using the HDFStore.", "response": "def to_hdf(self, path_or_buf, key, 
**kwargs):\n \"\"\"\n Write the contained data to an HDF5 file using HDFStore.\n\n Hierarchical Data Format (HDF) is self-describing, allowing an\n application to interpret the structure and contents of a file with\n no outside information. One HDF file can hold a mix of related objects\n which can be accessed as a group or as individual objects.\n\n In order to add another DataFrame or Series to an existing HDF file\n please use append mode and a different a key.\n\n For more information see the :ref:`user guide `.\n\n Parameters\n ----------\n path_or_buf : str or pandas.HDFStore\n File path or HDFStore object.\n key : str\n Identifier for the group in the store.\n mode : {'a', 'w', 'r+'}, default 'a'\n Mode to open file:\n\n - 'w': write, a new file is created (an existing file with\n the same name would be deleted).\n - 'a': append, an existing file is opened for reading and\n writing, and if the file does not exist it is created.\n - 'r+': similar to 'a', but the file must already exist.\n format : {'fixed', 'table'}, default 'fixed'\n Possible values:\n\n - 'fixed': Fixed format. Fast writing/reading. Not-appendable,\n nor searchable.\n - 'table': Table format. Write as a PyTables Table structure\n which may perform worse but allow more flexible operations\n like searching / selecting subsets of the data.\n append : bool, default False\n For Table formats, append the input data to the existing.\n data_columns : list of columns or True, optional\n List of columns to create as indexed data columns for on-disk\n queries, or True to use all columns. By default only the axes\n of the object are indexed. 
See :ref:`io.hdf5-query-data-columns`.\n Applicable only to format='table'.\n complevel : {0-9}, optional\n Specifies a compression level for data.\n A value of 0 disables compression.\n complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'\n Specifies the compression library to be used.\n As of v0.20.2 these additional compressors for Blosc are supported\n (default if no compressor specified: 'blosc:blosclz'):\n {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',\n 'blosc:zlib', 'blosc:zstd'}.\n Specifying a compression library which is not available issues\n a ValueError.\n fletcher32 : bool, default False\n If applying compression use the fletcher32 checksum.\n dropna : bool, default False\n If true, ALL nan rows will not be written to store.\n errors : str, default 'strict'\n Specifies how encoding and decoding errors are to be handled.\n See the errors argument for :func:`open` for a full list\n of options.\n\n See Also\n --------\n DataFrame.read_hdf : Read from HDF file.\n DataFrame.to_parquet : Write a DataFrame to the binary parquet format.\n DataFrame.to_sql : Write to a sql table.\n DataFrame.to_feather : Write out feather-format for DataFrames.\n DataFrame.to_csv : Write out to a csv file.\n\n Examples\n --------\n >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},\n ... 
index=['a', 'b', 'c'])\n >>> df.to_hdf('data.h5', key='df', mode='w')\n\n We can add another object to the same file:\n\n >>> s = pd.Series([1, 2, 3, 4])\n >>> s.to_hdf('data.h5', key='s')\n\n Reading from HDF file:\n\n >>> pd.read_hdf('data.h5', 'df')\n A B\n a 1 4\n b 2 5\n c 3 6\n >>> pd.read_hdf('data.h5', 's')\n 0 1\n 1 2\n 2 3\n 3 4\n dtype: int64\n\n Deleting file with data:\n\n >>> import os\n >>> os.remove('data.h5')\n \"\"\"\n from pandas.io import pytables\n return pytables.to_hdf(path_or_buf, key, self, **kwargs)"} {"instruction": "Implement a function in Python 3 to\nserialize the object to a msgpack - formatted string.", "response": "def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):\n \"\"\"\n Serialize object to input file path using msgpack format.\n\n THIS IS AN EXPERIMENTAL LIBRARY and the storage format\n may not be stable until a future release.\n\n Parameters\n ----------\n path : string File path, buffer-like, or None\n if None, return generated string\n append : bool whether to append to an existing msgpack\n (default is False)\n compress : type of compressor (zlib or blosc), default to None (no\n compression)\n \"\"\"\n\n from pandas.io import packers\n return packers.to_msgpack(path_or_buf, self, encoding=encoding,\n **kwargs)"} {"instruction": "How would you code a function in Python 3 to\nwrite the DataFrame to a SQL database.", "response": "def to_sql(self, name, con, schema=None, if_exists='fail', index=True,\n index_label=None, chunksize=None, dtype=None, method=None):\n \"\"\"\n Write records stored in a DataFrame to a SQL database.\n\n Databases supported by SQLAlchemy [1]_ are supported. Tables can be\n newly created, appended to, or overwritten.\n\n Parameters\n ----------\n name : string\n Name of SQL table.\n con : sqlalchemy.engine.Engine or sqlite3.Connection\n Using SQLAlchemy makes it possible to use any DB supported by that\n library. 
Legacy support is provided for sqlite3.Connection objects.\n schema : string, optional\n Specify the schema (if database flavor supports this). If None, use\n default schema.\n if_exists : {'fail', 'replace', 'append'}, default 'fail'\n How to behave if the table already exists.\n\n * fail: Raise a ValueError.\n * replace: Drop the table before inserting new values.\n * append: Insert new values to the existing table.\n\n index : bool, default True\n Write DataFrame index as a column. Uses `index_label` as the column\n name in the table.\n index_label : string or sequence, default None\n Column label for index column(s). If None is given (default) and\n `index` is True, then the index names are used.\n A sequence should be given if the DataFrame uses MultiIndex.\n chunksize : int, optional\n Rows will be written in batches of this size at a time. By default,\n all rows will be written at once.\n dtype : dict, optional\n Specifying the datatype for columns. The keys should be the column\n names and the values should be the SQLAlchemy types or strings for\n the sqlite3 legacy mode.\n method : {None, 'multi', callable}, default None\n Controls the SQL insertion clause used:\n\n * None : Uses standard SQL ``INSERT`` clause (one per row).\n * 'multi': Pass multiple values in a single ``INSERT`` clause.\n * callable with signature ``(pd_table, conn, keys, data_iter)``.\n\n Details and a sample callable implementation can be found in the\n section :ref:`insert method `.\n\n .. versionadded:: 0.24.0\n\n Raises\n ------\n ValueError\n When the table already exists and `if_exists` is 'fail' (the\n default).\n\n See Also\n --------\n read_sql : Read a DataFrame from a table.\n\n Notes\n -----\n Timezone aware datetime columns will be written as\n ``Timestamp with timezone`` type with SQLAlchemy if supported by the\n database. Otherwise, the datetimes will be stored as timezone unaware\n timestamps local to the original timezone.\n\n .. 
versionadded:: 0.24.0\n\n References\n ----------\n .. [1] http://docs.sqlalchemy.org\n .. [2] https://www.python.org/dev/peps/pep-0249/\n\n Examples\n --------\n\n Create an in-memory SQLite database.\n\n >>> from sqlalchemy import create_engine\n >>> engine = create_engine('sqlite://', echo=False)\n\n Create a table from scratch with 3 rows.\n\n >>> df = pd.DataFrame({'name' : ['User 1', 'User 2', 'User 3']})\n >>> df\n name\n 0 User 1\n 1 User 2\n 2 User 3\n\n >>> df.to_sql('users', con=engine)\n >>> engine.execute(\"SELECT * FROM users\").fetchall()\n [(0, 'User 1'), (1, 'User 2'), (2, 'User 3')]\n\n >>> df1 = pd.DataFrame({'name' : ['User 4', 'User 5']})\n >>> df1.to_sql('users', con=engine, if_exists='append')\n >>> engine.execute(\"SELECT * FROM users\").fetchall()\n [(0, 'User 1'), (1, 'User 2'), (2, 'User 3'),\n (0, 'User 4'), (1, 'User 5')]\n\n Overwrite the table with just ``df1``.\n\n >>> df1.to_sql('users', con=engine, if_exists='replace',\n ... index_label='id')\n >>> engine.execute(\"SELECT * FROM users\").fetchall()\n [(0, 'User 4'), (1, 'User 5')]\n\n Specify the dtype (especially useful for integers with missing values).\n Notice that while pandas is forced to store the data as floating point,\n the database supports nullable integers. When fetching the data with\n Python, we get back integer scalars.\n\n >>> df = pd.DataFrame({\"A\": [1, None, 2]})\n >>> df\n A\n 0 1.0\n 1 NaN\n 2 2.0\n\n >>> from sqlalchemy.types import Integer\n >>> df.to_sql('integers', con=engine, index=False,\n ... 
dtype={\"A\": Integer()})\n\n >>> engine.execute(\"SELECT * FROM integers\").fetchall()\n [(1,), (None,), (2,)]\n \"\"\"\n from pandas.io import sql\n sql.to_sql(self, name, con, schema=schema, if_exists=if_exists,\n index=index, index_label=index_label, chunksize=chunksize,\n dtype=dtype, method=method)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef to_pickle(self, path, compression='infer',\n protocol=pickle.HIGHEST_PROTOCOL):\n \"\"\"\n Pickle (serialize) object to file.\n\n Parameters\n ----------\n path : str\n File path where the pickled object will be stored.\n compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, \\\n default 'infer'\n A string representing the compression to use in the output file. By\n default, infers from the file extension in specified path.\n\n .. versionadded:: 0.20.0\n protocol : int\n Int which indicates which protocol should be used by the pickler,\n default HIGHEST_PROTOCOL (see [1]_ paragraph 12.1.2). The possible\n values are 0, 1, 2, 3, 4. A negative value for the protocol\n parameter is equivalent to setting its value to HIGHEST_PROTOCOL.\n\n .. [1] https://docs.python.org/3/library/pickle.html\n .. 
versionadded:: 0.21.0\n\n See Also\n --------\n read_pickle : Load pickled pandas object (or any object) from file.\n DataFrame.to_hdf : Write DataFrame to an HDF5 file.\n DataFrame.to_sql : Write DataFrame to a SQL database.\n DataFrame.to_parquet : Write a DataFrame to the binary parquet format.\n\n Examples\n --------\n >>> original_df = pd.DataFrame({\"foo\": range(5), \"bar\": range(5, 10)})\n >>> original_df\n foo bar\n 0 0 5\n 1 1 6\n 2 2 7\n 3 3 8\n 4 4 9\n >>> original_df.to_pickle(\"./dummy.pkl\")\n\n >>> unpickled_df = pd.read_pickle(\"./dummy.pkl\")\n >>> unpickled_df\n foo bar\n 0 0 5\n 1 1 6\n 2 2 7\n 3 3 8\n 4 4 9\n\n >>> import os\n >>> os.remove(\"./dummy.pkl\")\n \"\"\"\n from pandas.io.pickle import to_pickle\n return to_pickle(self, path, compression=compression,\n protocol=protocol)", "response": "Serialize the current object to a pickled file."} {"instruction": "How would you explain what the following Python 3 function does\ndef to_clipboard(self, excel=True, sep=None, **kwargs):\n from pandas.io import clipboards\n clipboards.to_clipboard(self, excel=excel, sep=sep, **kwargs)", "response": "r Copy the object to the system clipboard."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef to_xarray(self):\n\n try:\n import xarray\n except ImportError:\n # Give a nice error message\n raise ImportError(\"the xarray library is not installed\\n\"\n \"you can install via conda\\n\"\n \"conda install xarray\\n\"\n \"or via pip\\n\"\n \"pip install xarray\\n\")\n\n if self.ndim == 1:\n return xarray.DataArray.from_series(self)\n elif self.ndim == 2:\n return xarray.Dataset.from_dataframe(self)\n\n # > 2 dims\n coords = [(a, self._get_axis(a)) for a in self._AXIS_ORDERS]\n return xarray.DataArray(self,\n coords=coords,\n )", "response": "Return an xarray object from the pandas object."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef to_latex(self, buf=None, columns=None, 
col_space=None, header=True,\n index=True, na_rep='NaN', formatters=None, float_format=None,\n sparsify=None, index_names=True, bold_rows=False,\n column_format=None, longtable=None, escape=None,\n encoding=None, decimal='.', multicolumn=None,\n multicolumn_format=None, multirow=None):\n r\"\"\"\n Render an object to a LaTeX tabular environment table.\n\n Render an object to a tabular environment table. You can splice\n this into a LaTeX document. Requires \\usepackage{booktabs}.\n\n .. versionchanged:: 0.20.2\n Added to Series\n\n Parameters\n ----------\n buf : file descriptor or None\n Buffer to write to. If None, the output is returned as a string.\n columns : list of label, optional\n The subset of columns to write. Writes all columns by default.\n col_space : int, optional\n The minimum width of each column.\n header : bool or list of str, default True\n Write out the column names. If a list of strings is given,\n it is assumed to be aliases for the column names.\n index : bool, default True\n Write row names (index).\n na_rep : str, default 'NaN'\n Missing data representation.\n formatters : list of functions or dict of {str: function}, optional\n Formatter functions to apply to columns' elements by position or\n name. The result of each function must be a unicode string.\n List must be of length equal to the number of columns.\n float_format : str, optional\n Format string for floating point numbers.\n sparsify : bool, optional\n Set to False for a DataFrame with a hierarchical index to print\n every multiindex key at each row. By default, the value will be\n read from the config module.\n index_names : bool, default True\n Prints the names of the indexes.\n bold_rows : bool, default False\n Make the row labels bold in the output.\n column_format : str, optional\n The columns format as specified in `LaTeX table format\n `__ e.g. 'rcl' for 3\n columns. 
By default, 'l' will be used for all columns except\n columns of numbers, which default to 'r'.\n longtable : bool, optional\n By default, the value will be read from the pandas config\n module. Use a longtable environment instead of tabular. Requires\n adding a \\usepackage{longtable} to your LaTeX preamble.\n escape : bool, optional\n By default, the value will be read from the pandas config\n module. When set to False prevents from escaping latex special\n characters in column names.\n encoding : str, optional\n A string representing the encoding to use in the output file,\n defaults to 'utf-8'.\n decimal : str, default '.'\n Character recognized as decimal separator, e.g. ',' in Europe.\n\n .. versionadded:: 0.18.0\n multicolumn : bool, default True\n Use \\multicolumn to enhance MultiIndex columns.\n The default will be read from the config module.\n\n .. versionadded:: 0.20.0\n multicolumn_format : str, default 'l'\n The alignment for multicolumns, similar to `column_format`\n The default will be read from the config module.\n\n .. versionadded:: 0.20.0\n multirow : bool, default False\n Use \\multirow to enhance MultiIndex rows. Requires adding a\n \\usepackage{multirow} to your LaTeX preamble. Will print\n centered labels (instead of top-aligned) across the contained\n rows, separating groups via clines. The default will be read\n from the pandas config module.\n\n .. versionadded:: 0.20.0\n\n Returns\n -------\n str or None\n If buf is None, returns the resulting LateX format as a\n string. Otherwise returns None.\n\n See Also\n --------\n DataFrame.to_string : Render a DataFrame to a console-friendly\n tabular output.\n DataFrame.to_html : Render a DataFrame as an HTML table.\n\n Examples\n --------\n >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'],\n ... 'mask': ['red', 'purple'],\n ... 
'weapon': ['sai', 'bo staff']})\n >>> df.to_latex(index=False) # doctest: +NORMALIZE_WHITESPACE\n '\\\\begin{tabular}{lll}\\n\\\\toprule\\n name & mask & weapon\n \\\\\\\\\\n\\\\midrule\\n Raphael & red & sai \\\\\\\\\\n Donatello &\n purple & bo staff \\\\\\\\\\n\\\\bottomrule\\n\\\\end{tabular}\\n'\n \"\"\"\n # Get defaults from the pandas config\n if self.ndim == 1:\n self = self.to_frame()\n if longtable is None:\n longtable = config.get_option(\"display.latex.longtable\")\n if escape is None:\n escape = config.get_option(\"display.latex.escape\")\n if multicolumn is None:\n multicolumn = config.get_option(\"display.latex.multicolumn\")\n if multicolumn_format is None:\n multicolumn_format = config.get_option(\n \"display.latex.multicolumn_format\")\n if multirow is None:\n multirow = config.get_option(\"display.latex.multirow\")\n\n formatter = DataFrameFormatter(self, buf=buf, columns=columns,\n col_space=col_space, na_rep=na_rep,\n header=header, index=index,\n formatters=formatters,\n float_format=float_format,\n bold_rows=bold_rows,\n sparsify=sparsify,\n index_names=index_names,\n escape=escape, decimal=decimal)\n formatter.to_latex(column_format=column_format, longtable=longtable,\n encoding=encoding, multicolumn=multicolumn,\n multicolumn_format=multicolumn_format,\n multirow=multirow)\n\n if buf is None:\n return formatter.buf.getvalue()", "response": "r Converts the object to LaTeX tabular environment table."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef to_csv(self, path_or_buf=None, sep=\",\", na_rep='', float_format=None,\n columns=None, header=True, index=True, index_label=None,\n mode='w', encoding=None, compression='infer', quoting=None,\n quotechar='\"', line_terminator=None, chunksize=None,\n tupleize_cols=None, date_format=None, doublequote=True,\n escapechar=None, decimal='.'):\n r\"\"\"\n Write object to a comma-separated values (csv) file.\n\n .. 
versionchanged:: 0.24.0\n The order of arguments for Series was changed.\n\n Parameters\n ----------\n path_or_buf : str or file handle, default None\n File path or object, if None is provided the result is returned as\n a string. If a file object is passed it should be opened with\n `newline=''`, disabling universal newlines.\n\n .. versionchanged:: 0.24.0\n\n Was previously named \"path\" for Series.\n\n sep : str, default ','\n String of length 1. Field delimiter for the output file.\n na_rep : str, default ''\n Missing data representation.\n float_format : str, default None\n Format string for floating point numbers.\n columns : sequence, optional\n Columns to write.\n header : bool or list of str, default True\n Write out the column names. If a list of strings is given it is\n assumed to be aliases for the column names.\n\n .. versionchanged:: 0.24.0\n\n Previously defaulted to False for Series.\n\n index : bool, default True\n Write row names (index).\n index_label : str or sequence, or False, default None\n Column label for index column(s) if desired. If None is given, and\n `header` and `index` are True, then the index names are used. A\n sequence should be given if the object uses MultiIndex. If\n False do not print fields for index names. Use index_label=False\n for easier importing in R.\n mode : str\n Python write mode, default 'w'.\n encoding : str, optional\n A string representing the encoding to use in the output file,\n defaults to 'utf-8'.\n compression : str, default 'infer'\n Compression mode among the following possible values: {'infer',\n 'gzip', 'bz2', 'zip', 'xz', None}. If 'infer' and `path_or_buf`\n is path-like, then detect compression from the following\n extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no\n compression).\n\n .. versionchanged:: 0.24.0\n\n 'infer' option added and set to default.\n\n quoting : optional constant from csv module\n Defaults to csv.QUOTE_MINIMAL. 
If you have set a `float_format`\n then floats are converted to strings and thus csv.QUOTE_NONNUMERIC\n will treat them as non-numeric.\n quotechar : str, default '\\\"'\n String of length 1. Character used to quote fields.\n line_terminator : str, optional\n The newline character or character sequence to use in the output\n file. Defaults to `os.linesep`, which depends on the OS in which\n this method is called ('\\n' for linux, '\\r\\n' for Windows, i.e.).\n\n .. versionchanged:: 0.24.0\n chunksize : int or None\n Rows to write at a time.\n tupleize_cols : bool, default False\n Write MultiIndex columns as a list of tuples (if True) or in\n the new, expanded format, where each MultiIndex column is a row\n in the CSV (if False).\n\n .. deprecated:: 0.21.0\n This argument will be removed and will always write each row\n of the multi-index as a separate row in the CSV file.\n date_format : str, default None\n Format string for datetime objects.\n doublequote : bool, default True\n Control quoting of `quotechar` inside a field.\n escapechar : str, default None\n String of length 1. Character used to escape `sep` and `quotechar`\n when appropriate.\n decimal : str, default '.'\n Character recognized as decimal separator. E.g. use ',' for\n European data.\n\n Returns\n -------\n None or str\n If path_or_buf is None, returns the resulting csv format as a\n string. Otherwise returns None.\n\n See Also\n --------\n read_csv : Load a CSV file into a DataFrame.\n to_excel : Write DataFrame to an Excel file.\n\n Examples\n --------\n >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'],\n ... 'mask': ['red', 'purple'],\n ... 
'weapon': ['sai', 'bo staff']})\n >>> df.to_csv(index=False)\n 'name,mask,weapon\\nRaphael,red,sai\\nDonatello,purple,bo staff\\n'\n \"\"\"\n\n df = self if isinstance(self, ABCDataFrame) else self.to_frame()\n\n if tupleize_cols is not None:\n warnings.warn(\"The 'tupleize_cols' parameter is deprecated and \"\n \"will be removed in a future version\",\n FutureWarning, stacklevel=2)\n else:\n tupleize_cols = False\n\n from pandas.io.formats.csvs import CSVFormatter\n formatter = CSVFormatter(df, path_or_buf,\n line_terminator=line_terminator, sep=sep,\n encoding=encoding,\n compression=compression, quoting=quoting,\n na_rep=na_rep, float_format=float_format,\n cols=columns, header=header, index=index,\n index_label=index_label, mode=mode,\n chunksize=chunksize, quotechar=quotechar,\n tupleize_cols=tupleize_cols,\n date_format=date_format,\n doublequote=doublequote,\n escapechar=escapechar, decimal=decimal)\n formatter.save()\n\n if path_or_buf is None:\n return formatter.path_or_buf.getvalue()", "response": "r This function writes the Series to a comma - separated CSV file."} {"instruction": "Given the following Python 3 function, write the documentation\ndef _create_indexer(cls, name, indexer):\n if getattr(cls, name, None) is None:\n _indexer = functools.partial(indexer, name)\n setattr(cls, name, property(_indexer, doc=indexer.__doc__))", "response": "Create an indexer like _name in the class."} {"instruction": "Write a Python 3 function that can\nget item from object for given key. Returns default value if not found.", "response": "def get(self, key, default=None):\n \"\"\"\n Get item from object for given key (DataFrame column, Panel slice,\n etc.). 
Returns default value if not found.\n\n Parameters\n ----------\n key : object\n\n Returns\n -------\n value : same type as items contained in object\n \"\"\"\n try:\n return self[key]\n except (KeyError, ValueError, IndexError):\n return default"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _get_item_cache(self, item):\n cache = self._item_cache\n res = cache.get(item)\n if res is None:\n values = self._data.get(item)\n res = self._box_item_values(item, values)\n cache[item] = res\n res._set_as_cached(item, self)\n\n # for a chain\n res._is_copy = self._is_copy\n return res", "response": "Return the cached item represents a label indexer."} {"instruction": "Create a Python 3 function to\nset the _cacher attribute on the calling object with a weakref to cacher.", "response": "def _set_as_cached(self, item, cacher):\n \"\"\"Set the _cacher attribute on the calling object with a weakref to\n cacher.\n \"\"\"\n self._cacher = (item, weakref.ref(cacher))"} {"instruction": "Explain what the following Python 3 code does\ndef _iget_item_cache(self, item):\n ax = self._info_axis\n if ax.is_unique:\n lower = self._get_item_cache(ax[item])\n else:\n lower = self._take(item, axis=self._info_axis_number)\n return lower", "response": "Return the cached item represents a positional indexer."} {"instruction": "Can you implement a function in Python 3 that\nupdates the cache if clear is True then clear the item cache if clear is False then clear the item cache if clear is True then clear the item cache if clear is True then clear the item cache if clear is False then clear the item cache if clear is True then clear the item cache if clear is True then clear the item cache if clear is False", "response": "def _maybe_update_cacher(self, clear=False, verify_is_copy=True):\n \"\"\"\n See if we need to update our parent cacher if clear, then clear our\n cache.\n\n Parameters\n ----------\n clear : boolean, default False\n clear the item cache\n 
verify_is_copy : boolean, default True\n provide is_copy checks\n\n \"\"\"\n\n cacher = getattr(self, '_cacher', None)\n if cacher is not None:\n ref = cacher[1]()\n\n # we are trying to reference a dead referant, hence\n # a copy\n if ref is None:\n del self._cacher\n else:\n try:\n ref._maybe_cache_changed(cacher[0], self)\n except Exception:\n pass\n\n if verify_is_copy:\n self._check_setitem_copy(stacklevel=5, t='referant')\n\n if clear:\n self._clear_item_cache()"} {"instruction": "Write a Python 3 script to\nconstruct a slice of this container.", "response": "def _slice(self, slobj, axis=0, kind=None):\n \"\"\"\n Construct a slice of this container.\n\n kind parameter is maintained for compatibility with Series slicing.\n \"\"\"\n axis = self._get_block_manager_axis(axis)\n result = self._constructor(self._data.get_slice(slobj, axis=axis))\n result = result.__finalize__(self)\n\n # this could be a view\n # but only in a single-dtyped view slicable case\n is_copy = axis != 0 or result._is_view\n result._set_is_copy(self, copy=is_copy)\n return result"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _check_is_chained_assignment_possible(self):\n if self._is_view and self._is_cached:\n ref = self._get_cacher()\n if ref is not None and ref._is_mixed_type:\n self._check_setitem_copy(stacklevel=4, t='referant',\n force=True)\n return True\n elif self._is_copy:\n self._check_setitem_copy(stacklevel=4, t='referant')\n return False", "response": "Checks if the current value of a node is chained assignment possible."} {"instruction": "Write a Python 3 script for\nreturning the elements in the given positional indices along an axis.", "response": "def _take(self, indices, axis=0, is_copy=True):\n \"\"\"\n Return the elements in the given *positional* indices along an axis.\n\n This means that we are not indexing according to actual values in\n the index attribute of the object. 
We are indexing according to the\n actual position of the element in the object.\n\n This is the internal version of ``.take()`` and will contain a wider\n selection of parameters useful for internal use but not as suitable\n for public usage.\n\n Parameters\n ----------\n indices : array-like\n An array of ints indicating which positions to take.\n axis : int, default 0\n The axis on which to select elements. \"0\" means that we are\n selecting rows, \"1\" means that we are selecting columns, etc.\n is_copy : bool, default True\n Whether to return a copy of the original object or not.\n\n Returns\n -------\n taken : same type as caller\n An array-like containing the elements taken from the object.\n\n See Also\n --------\n numpy.ndarray.take\n numpy.take\n \"\"\"\n self._consolidate_inplace()\n\n new_data = self._data.take(indices,\n axis=self._get_block_manager_axis(axis),\n verify=True)\n result = self._constructor(new_data).__finalize__(self)\n\n # Maybe set copy if we didn't actually change the index.\n if is_copy:\n if not result._get_axis(axis).equals(self._get_axis(axis)):\n result._set_is_copy(self)\n\n return result"} {"instruction": "Write a Python 3 script to\nreturn the elements in the given positions along an axis.", "response": "def take(self, indices, axis=0, convert=None, is_copy=True, **kwargs):\n \"\"\"\n Return the elements in the given *positional* indices along an axis.\n\n This means that we are not indexing according to actual values in\n the index attribute of the object. We are indexing according to the\n actual position of the element in the object.\n\n Parameters\n ----------\n indices : array-like\n An array of ints indicating which positions to take.\n axis : {0 or 'index', 1 or 'columns', None}, default 0\n The axis on which to select elements. 
``0`` means that we are\n selecting rows, ``1`` means that we are selecting columns.\n convert : bool, default True\n Whether to convert negative indices into positive ones.\n For example, ``-1`` would map to the ``len(axis) - 1``.\n The conversions are similar to the behavior of indexing a\n regular Python list.\n\n .. deprecated:: 0.21.0\n In the future, negative indices will always be converted.\n\n is_copy : bool, default True\n Whether to return a copy of the original object or not.\n **kwargs\n For compatibility with :meth:`numpy.take`. Has no effect on the\n output.\n\n Returns\n -------\n taken : same type as caller\n An array-like containing the elements taken from the object.\n\n See Also\n --------\n DataFrame.loc : Select a subset of a DataFrame by labels.\n DataFrame.iloc : Select a subset of a DataFrame by positions.\n numpy.take : Take elements from an array along an axis.\n\n Examples\n --------\n >>> df = pd.DataFrame([('falcon', 'bird', 389.0),\n ... ('parrot', 'bird', 24.0),\n ... ('lion', 'mammal', 80.5),\n ... ('monkey', 'mammal', np.nan)],\n ... columns=['name', 'class', 'max_speed'],\n ... index=[0, 2, 3, 1])\n >>> df\n name class max_speed\n 0 falcon bird 389.0\n 2 parrot bird 24.0\n 3 lion mammal 80.5\n 1 monkey mammal NaN\n\n Take elements at positions 0 and 3 along the axis 0 (default).\n\n Note how the actual indices selected (0 and 1) do not correspond to\n our selected indices 0 and 3. 
That's because we are selecting the 0th\n and 3rd rows, not rows whose indices equal 0 and 3.\n\n >>> df.take([0, 3])\n name class max_speed\n 0 falcon bird 389.0\n 1 monkey mammal NaN\n\n Take elements at indices 1 and 2 along the axis 1 (column selection).\n\n >>> df.take([1, 2], axis=1)\n class max_speed\n 0 bird 389.0\n 2 bird 24.0\n 3 mammal 80.5\n 1 mammal NaN\n\n We may take elements using negative integers for positive indices,\n starting from the end of the object, just like with Python lists.\n\n >>> df.take([-1, -2])\n name class max_speed\n 1 monkey mammal NaN\n 3 lion mammal 80.5\n \"\"\"\n if convert is not None:\n msg = (\"The 'convert' parameter is deprecated \"\n \"and will be removed in a future version.\")\n warnings.warn(msg, FutureWarning, stacklevel=2)\n\n nv.validate_take(tuple(), kwargs)\n return self._take(indices, axis=axis, is_copy=is_copy)"} {"instruction": "Can you create a Python 3 function that\nreturns a Series or DataFrame containing the values of a specific cross - section.", "response": "def xs(self, key, axis=0, level=None, drop_level=True):\n \"\"\"\n Return cross-section from the Series/DataFrame.\n\n This method takes a `key` argument to select data at a particular\n level of a MultiIndex.\n\n Parameters\n ----------\n key : label or tuple of label\n Label contained in the index, or partially in a MultiIndex.\n axis : {0 or 'index', 1 or 'columns'}, default 0\n Axis to retrieve cross-section on.\n level : object, defaults to first n levels (n=1 or len(key))\n In case of a key partially contained in a MultiIndex, indicate\n which levels are used. 
Levels can be referred by label or position.\n drop_level : bool, default True\n If False, returns object with same levels as self.\n\n Returns\n -------\n Series or DataFrame\n Cross-section from the original Series or DataFrame\n corresponding to the selected index levels.\n\n See Also\n --------\n DataFrame.loc : Access a group of rows and columns\n by label(s) or a boolean array.\n DataFrame.iloc : Purely integer-location based indexing\n for selection by position.\n\n Notes\n -----\n `xs` can not be used to set values.\n\n MultiIndex Slicers is a generic way to get/set values on\n any level or levels.\n It is a superset of `xs` functionality, see\n :ref:`MultiIndex Slicers `.\n\n Examples\n --------\n >>> d = {'num_legs': [4, 4, 2, 2],\n ... 'num_wings': [0, 0, 2, 2],\n ... 'class': ['mammal', 'mammal', 'mammal', 'bird'],\n ... 'animal': ['cat', 'dog', 'bat', 'penguin'],\n ... 'locomotion': ['walks', 'walks', 'flies', 'walks']}\n >>> df = pd.DataFrame(data=d)\n >>> df = df.set_index(['class', 'animal', 'locomotion'])\n >>> df\n num_legs num_wings\n class animal locomotion\n mammal cat walks 4 0\n dog walks 4 0\n bat flies 2 2\n bird penguin walks 2 2\n\n Get values at specified index\n\n >>> df.xs('mammal')\n num_legs num_wings\n animal locomotion\n cat walks 4 0\n dog walks 4 0\n bat flies 2 2\n\n Get values at several indexes\n\n >>> df.xs(('mammal', 'dog'))\n num_legs num_wings\n locomotion\n walks 4 0\n\n Get values at specified index and level\n\n >>> df.xs('cat', level=1)\n num_legs num_wings\n class locomotion\n mammal walks 4 0\n\n Get values at several indexes and levels\n\n >>> df.xs(('bird', 'walks'),\n ... 
level=[0, 'locomotion'])\n num_legs num_wings\n animal\n penguin 2 2\n\n Get values at specified column and axis\n\n >>> df.xs('num_wings', axis=1)\n class animal locomotion\n mammal cat walks 0\n dog walks 0\n bat flies 2\n bird penguin walks 2\n Name: num_wings, dtype: int64\n \"\"\"\n axis = self._get_axis_number(axis)\n labels = self._get_axis(axis)\n if level is not None:\n loc, new_ax = labels.get_loc_level(key, level=level,\n drop_level=drop_level)\n\n # create the tuple of the indexer\n indexer = [slice(None)] * self.ndim\n indexer[axis] = loc\n indexer = tuple(indexer)\n\n result = self.iloc[indexer]\n setattr(result, result._get_axis_name(axis), new_ax)\n return result\n\n if axis == 1:\n return self[key]\n\n self._consolidate_inplace()\n\n index = self.index\n if isinstance(index, MultiIndex):\n loc, new_index = self.index.get_loc_level(key,\n drop_level=drop_level)\n else:\n loc = self.index.get_loc(key)\n\n if isinstance(loc, np.ndarray):\n if loc.dtype == np.bool_:\n inds, = loc.nonzero()\n return self._take(inds, axis=axis)\n else:\n return self._take(loc, axis=axis)\n\n if not is_scalar(loc):\n new_index = self.index[loc]\n\n if is_scalar(loc):\n new_values = self._data.fast_xs(loc)\n\n # may need to box a datelike-scalar\n #\n # if we encounter an array-like and we only have 1 dim\n # that means that their are list/ndarrays inside the Series!\n # so just return them (GH 6394)\n if not is_list_like(new_values) or self.ndim == 1:\n return com.maybe_box_datetimelike(new_values)\n\n result = self._constructor_sliced(\n new_values, index=self.columns,\n name=self.index[loc], dtype=new_values.dtype)\n\n else:\n result = self.iloc[loc]\n result.index = new_index\n\n # this could be a view\n # but only in a single-dtyped view slicable case\n result._set_is_copy(self, copy=not result._is_view)\n return result"} {"instruction": "Write a Python 3 function that can\nreturn a new DataFrame with only the entries in the specified axis matching criteria.", 
"response": "def select(self, crit, axis=0):\n \"\"\"\n Return data corresponding to axis labels matching criteria.\n\n .. deprecated:: 0.21.0\n Use df.loc[df.index.map(crit)] to select via labels\n\n Parameters\n ----------\n crit : function\n To be called on each index (label). Should return True or False\n axis : int\n\n Returns\n -------\n selection : same type as caller\n \"\"\"\n warnings.warn(\"'select' is deprecated and will be removed in a \"\n \"future release. You can use \"\n \".loc[labels.map(crit)] as a replacement\",\n FutureWarning, stacklevel=2)\n\n axis = self._get_axis_number(axis)\n axis_name = self._get_axis_name(axis)\n axis_values = self._get_axis(axis)\n\n if len(axis_values) > 0:\n new_axis = axis_values[\n np.asarray([bool(crit(label)) for label in axis_values])]\n else:\n new_axis = axis_values\n\n return self.reindex(**{axis_name: new_axis})"} {"instruction": "How would you explain what the following Python 3 function does\ndef reindex_like(self, other, method=None, copy=True, limit=None,\n tolerance=None):\n \"\"\"\n Return an object with matching indices as other object.\n\n Conform the object to the same index on all axes. Optional\n filling logic, placing NaN in locations having no value\n in the previous index. 
A new object is produced unless the\n new index is equivalent to the current one and copy=False.\n\n Parameters\n ----------\n other : Object of the same data type\n Its row and column indices are used to define the new indices\n of this object.\n method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}\n Method to use for filling holes in reindexed DataFrame.\n Please note: this is only applicable to DataFrames/Series with a\n monotonically increasing/decreasing index.\n\n * None (default): don't fill gaps\n * pad / ffill: propagate last valid observation forward to next\n valid\n * backfill / bfill: use next valid observation to fill gap\n * nearest: use nearest valid observations to fill gap\n\n copy : bool, default True\n Return a new object, even if the passed indexes are the same.\n limit : int, default None\n Maximum number of consecutive labels to fill for inexact matches.\n tolerance : optional\n Maximum distance between original and new labels for inexact\n matches. The values of the index at the matching locations most\n satisfy the equation ``abs(index[indexer] - target) <= tolerance``.\n\n Tolerance may be a scalar value, which applies the same tolerance\n to all values, or list-like, which applies variable tolerance per\n element. List-like includes list, tuple, array, Series, and must be\n the same size as the index and its dtype must exactly match the\n index's type.\n\n .. versionadded:: 0.21.0 (list-like tolerance)\n\n Returns\n -------\n Series or DataFrame\n Same type as caller, but with changed indices on each axis.\n\n See Also\n --------\n DataFrame.set_index : Set row labels.\n DataFrame.reset_index : Remove row labels or move them to new columns.\n DataFrame.reindex : Change to new indices or expand indices.\n\n Notes\n -----\n Same as calling\n ``.reindex(index=other.index, columns=other.columns,...)``.\n\n Examples\n --------\n >>> df1 = pd.DataFrame([[24.3, 75.7, 'high'],\n ... [31, 87.8, 'high'],\n ... [22, 71.6, 'medium'],\n ... 
[35, 95, 'medium']],\n ... columns=['temp_celsius', 'temp_fahrenheit', 'windspeed'],\n ... index=pd.date_range(start='2014-02-12',\n ... end='2014-02-15', freq='D'))\n\n >>> df1\n temp_celsius temp_fahrenheit windspeed\n 2014-02-12 24.3 75.7 high\n 2014-02-13 31.0 87.8 high\n 2014-02-14 22.0 71.6 medium\n 2014-02-15 35.0 95.0 medium\n\n >>> df2 = pd.DataFrame([[28, 'low'],\n ... [30, 'low'],\n ... [35.1, 'medium']],\n ... columns=['temp_celsius', 'windspeed'],\n ... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13',\n ... '2014-02-15']))\n\n >>> df2\n temp_celsius windspeed\n 2014-02-12 28.0 low\n 2014-02-13 30.0 low\n 2014-02-15 35.1 medium\n\n >>> df2.reindex_like(df1)\n temp_celsius temp_fahrenheit windspeed\n 2014-02-12 28.0 NaN low\n 2014-02-13 30.0 NaN low\n 2014-02-14 NaN NaN NaN\n 2014-02-15 35.1 NaN medium\n \"\"\"\n d = other._construct_axes_dict(axes=self._AXIS_ORDERS, method=method,\n copy=copy, limit=limit,\n tolerance=tolerance)\n\n return self.reindex(**d)", "response": "Return an object with matching indices as other object."} {"instruction": "Implement a Python 3 function for\ndropping labels from specified axis.", "response": "def _drop_axis(self, labels, axis, level=None, errors='raise'):\n \"\"\"\n Drop labels from specified axis. 
Used in the ``drop`` method\n internally.\n\n Parameters\n ----------\n labels : single label or list-like\n axis : int or axis name\n level : int or level name, default None\n For MultiIndex\n errors : {'ignore', 'raise'}, default 'raise'\n If 'ignore', suppress error and existing labels are dropped.\n\n \"\"\"\n axis = self._get_axis_number(axis)\n axis_name = self._get_axis_name(axis)\n axis = self._get_axis(axis)\n\n if axis.is_unique:\n if level is not None:\n if not isinstance(axis, MultiIndex):\n raise AssertionError('axis must be a MultiIndex')\n new_axis = axis.drop(labels, level=level, errors=errors)\n else:\n new_axis = axis.drop(labels, errors=errors)\n result = self.reindex(**{axis_name: new_axis})\n\n # Case for non-unique axis\n else:\n labels = ensure_object(com.index_labels_to_array(labels))\n if level is not None:\n if not isinstance(axis, MultiIndex):\n raise AssertionError('axis must be a MultiIndex')\n indexer = ~axis.get_level_values(level).isin(labels)\n\n # GH 18561 MultiIndex.drop should raise if label is absent\n if errors == 'raise' and indexer.all():\n raise KeyError('{} not found in axis'.format(labels))\n else:\n indexer = ~axis.isin(labels)\n # Check if label doesn't exist along axis\n labels_missing = (axis.get_indexer_for(labels) == -1).any()\n if errors == 'raise' and labels_missing:\n raise KeyError('{} not found in axis'.format(labels))\n\n slicer = [slice(None)] * self.ndim\n slicer[self._get_axis_number(axis_name)] = indexer\n\n result = self.loc[tuple(slicer)]\n\n return result"} {"instruction": "Write a Python 3 script for\nupdating the internal cache with the result.", "response": "def _update_inplace(self, result, verify_is_copy=True):\n \"\"\"\n Replace self internals with result.\n\n Parameters\n ----------\n verify_is_copy : boolean, default True\n provide is_copy checks\n\n \"\"\"\n # NOTE: This does *not* call __finalize__ and that's an explicit\n # decision that we may revisit in the future.\n\n self._reset_cache()\n 
self._clear_item_cache()\n self._data = getattr(result, '_data', result)\n self._maybe_update_cacher(verify_is_copy=verify_is_copy)"} {"instruction": "Implement a Python 3 function for\nadding prefix to the internal log entry.", "response": "def add_prefix(self, prefix):\n \"\"\"\n Prefix labels with string `prefix`.\n\n For Series, the row labels are prefixed.\n For DataFrame, the column labels are prefixed.\n\n Parameters\n ----------\n prefix : str\n The string to add before each label.\n\n Returns\n -------\n Series or DataFrame\n New Series or DataFrame with updated labels.\n\n See Also\n --------\n Series.add_suffix: Suffix row labels with string `suffix`.\n DataFrame.add_suffix: Suffix column labels with string `suffix`.\n\n Examples\n --------\n >>> s = pd.Series([1, 2, 3, 4])\n >>> s\n 0 1\n 1 2\n 2 3\n 3 4\n dtype: int64\n\n >>> s.add_prefix('item_')\n item_0 1\n item_1 2\n item_2 3\n item_3 4\n dtype: int64\n\n >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})\n >>> df\n A B\n 0 1 3\n 1 2 4\n 2 3 5\n 3 4 6\n\n >>> df.add_prefix('col_')\n col_A col_B\n 0 1 3\n 1 2 4\n 2 3 5\n 3 4 6\n \"\"\"\n f = functools.partial('{prefix}{}'.format, prefix=prefix)\n\n mapper = {self._info_axis_name: f}\n return self.rename(**mapper)"} {"instruction": "Can you write a function in Python 3 where it\nadds suffix to the log entry.", "response": "def add_suffix(self, suffix):\n \"\"\"\n Suffix labels with string `suffix`.\n\n For Series, the row labels are suffixed.\n For DataFrame, the column labels are suffixed.\n\n Parameters\n ----------\n suffix : str\n The string to add after each label.\n\n Returns\n -------\n Series or DataFrame\n New Series or DataFrame with updated labels.\n\n See Also\n --------\n Series.add_prefix: Prefix row labels with string `prefix`.\n DataFrame.add_prefix: Prefix column labels with string `prefix`.\n\n Examples\n --------\n >>> s = pd.Series([1, 2, 3, 4])\n >>> s\n 0 1\n 1 2\n 2 3\n 3 4\n dtype: int64\n\n >>> 
s.add_suffix('_item')\n 0_item 1\n 1_item 2\n 2_item 3\n 3_item 4\n dtype: int64\n\n >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})\n >>> df\n A B\n 0 1 3\n 1 2 4\n 2 3 5\n 3 4 6\n\n >>> df.add_suffix('_col')\n A_col B_col\n 0 1 3\n 1 2 4\n 2 3 5\n 3 4 6\n \"\"\"\n f = functools.partial('{}{suffix}'.format, suffix=suffix)\n\n mapper = {self._info_axis_name: f}\n return self.rename(**mapper)"} {"instruction": "How would you implement a function in Python 3 that\nsorts the values of the log entry in a single or multi - column order.", "response": "def sort_values(self, by=None, axis=0, ascending=True, inplace=False,\n kind='quicksort', na_position='last'):\n \"\"\"\n Sort by the values along either axis.\n\n Parameters\n ----------%(optional_by)s\n axis : %(axes_single_arg)s, default 0\n Axis to be sorted.\n ascending : bool or list of bool, default True\n Sort ascending vs. descending. Specify list for multiple sort\n orders. If this is a list of bools, must match the length of\n the by.\n inplace : bool, default False\n If True, perform operation in-place.\n kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'\n Choice of sorting algorithm. See also ndarray.np.sort for more\n information. `mergesort` is the only stable algorithm. For\n DataFrames, this option is only applied when sorting on a single\n column or label.\n na_position : {'first', 'last'}, default 'last'\n Puts NaNs at the beginning if `first`; `last` puts NaNs at the\n end.\n\n Returns\n -------\n sorted_obj : DataFrame or None\n DataFrame with sorted values if inplace=False, None otherwise.\n\n Examples\n --------\n >>> df = pd.DataFrame({\n ... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'],\n ... 'col2': [2, 1, 9, 8, 7, 4],\n ... 'col3': [0, 1, 9, 4, 2, 3],\n ... 
})\n >>> df\n col1 col2 col3\n 0 A 2 0\n 1 A 1 1\n 2 B 9 9\n 3 NaN 8 4\n 4 D 7 2\n 5 C 4 3\n\n Sort by col1\n\n >>> df.sort_values(by=['col1'])\n col1 col2 col3\n 0 A 2 0\n 1 A 1 1\n 2 B 9 9\n 5 C 4 3\n 4 D 7 2\n 3 NaN 8 4\n\n Sort by multiple columns\n\n >>> df.sort_values(by=['col1', 'col2'])\n col1 col2 col3\n 1 A 1 1\n 0 A 2 0\n 2 B 9 9\n 5 C 4 3\n 4 D 7 2\n 3 NaN 8 4\n\n Sort Descending\n\n >>> df.sort_values(by='col1', ascending=False)\n col1 col2 col3\n 4 D 7 2\n 5 C 4 3\n 2 B 9 9\n 0 A 2 0\n 1 A 1 1\n 3 NaN 8 4\n\n Putting NAs first\n\n >>> df.sort_values(by='col1', ascending=False, na_position='first')\n col1 col2 col3\n 3 NaN 8 4\n 4 D 7 2\n 5 C 4 3\n 2 B 9 9\n 0 A 2 0\n 1 A 1 1\n \"\"\"\n raise NotImplementedError(\"sort_values has not been implemented \"\n \"on Panel or Panel4D objects.\")"} {"instruction": "Write a Python 3 function for\nsorting the DataFrame by index.", "response": "def sort_index(self, axis=0, level=None, ascending=True, inplace=False,\n kind='quicksort', na_position='last', sort_remaining=True):\n \"\"\"\n Sort object by labels (along an axis).\n\n Parameters\n ----------\n axis : {0 or 'index', 1 or 'columns'}, default 0\n The axis along which to sort. The value 0 identifies the rows,\n and 1 identifies the columns.\n level : int or level name or list of ints or list of level names\n If not None, sort on values in specified index level(s).\n ascending : bool, default True\n Sort ascending vs. descending.\n inplace : bool, default False\n If True, perform operation in-place.\n kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'\n Choice of sorting algorithm. See also ndarray.np.sort for more\n information. `mergesort` is the only stable algorithm. 
For\n DataFrames, this option is only applied when sorting on a single\n column or label.\n na_position : {'first', 'last'}, default 'last'\n Puts NaNs at the beginning if `first`; `last` puts NaNs at the end.\n Not implemented for MultiIndex.\n sort_remaining : bool, default True\n If True and sorting by level and index is multilevel, sort by other\n levels too (in order) after sorting by specified level.\n\n Returns\n -------\n sorted_obj : DataFrame or None\n DataFrame with sorted index if inplace=False, None otherwise.\n \"\"\"\n inplace = validate_bool_kwarg(inplace, 'inplace')\n axis = self._get_axis_number(axis)\n axis_name = self._get_axis_name(axis)\n labels = self._get_axis(axis)\n\n if level is not None:\n raise NotImplementedError(\"level is not implemented\")\n if inplace:\n raise NotImplementedError(\"inplace is not implemented\")\n\n sort_index = labels.argsort()\n if not ascending:\n sort_index = sort_index[::-1]\n\n new_axis = labels.take(sort_index)\n return self.reindex(**{axis_name: new_axis})"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef reindex(self, *args, **kwargs):\n # TODO: Decide if we care about having different examples for different\n # kinds\n\n # construct the args\n axes, kwargs = self._construct_axes_from_arguments(args, kwargs)\n method = missing.clean_reindex_fill_method(kwargs.pop('method', None))\n level = kwargs.pop('level', None)\n copy = kwargs.pop('copy', True)\n limit = kwargs.pop('limit', None)\n tolerance = kwargs.pop('tolerance', None)\n fill_value = kwargs.pop('fill_value', None)\n\n # Series.reindex doesn't use / need the axis kwarg\n # We pop and ignore it here, to make writing Series/Frame generic code\n # easier\n kwargs.pop(\"axis\", None)\n\n if kwargs:\n raise TypeError('reindex() got an unexpected keyword '\n 'argument \"{0}\"'.format(list(kwargs.keys())[0]))\n\n self._consolidate_inplace()\n\n # if all axes that are requested to reindex are equal, then only 
copy\n # if indicated must have index names equal here as well as values\n if all(self._get_axis(axis).identical(ax)\n for axis, ax in axes.items() if ax is not None):\n if copy:\n return self.copy()\n return self\n\n # check if we are a multi reindex\n if self._needs_reindex_multi(axes, method, level):\n try:\n return self._reindex_multi(axes, copy, fill_value)\n except Exception:\n pass\n\n # perform the reindex on the axes\n return self._reindex_axes(axes, level, limit, tolerance, method,\n fill_value, copy).__finalize__(self)", "response": "Reindex the most\n in the current index."} {"instruction": "Can you write a function in Python 3 where it\nperforms the reindex for all the axes.", "response": "def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,\n copy):\n \"\"\"Perform the reindex for all the axes.\"\"\"\n obj = self\n for a in self._AXIS_ORDERS:\n labels = axes[a]\n if labels is None:\n continue\n\n ax = self._get_axis(a)\n new_index, indexer = ax.reindex(labels, level=level, limit=limit,\n tolerance=tolerance, method=method)\n\n axis = self._get_axis_number(a)\n obj = obj._reindex_with_indexers({axis: [new_index, indexer]},\n fill_value=fill_value,\n copy=copy, allow_dups=False)\n\n return obj"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _needs_reindex_multi(self, axes, method, level):\n return ((com.count_not_none(*axes.values()) == self._AXIS_LEN) and\n method is None and level is None and not self._is_mixed_type)", "response": "Check if we need to reindex a multi - level record."} {"instruction": "Implement a Python 3 function for\nfiltering the items of the DataFrame according to the labels in the specified index.", "response": "def filter(self, items=None, like=None, regex=None, axis=None):\n \"\"\"\n Subset rows or columns of dataframe according to labels in\n the specified index.\n\n Note that this routine does not filter a dataframe on its\n contents. 
The filter is applied to the labels of the index.\n\n Parameters\n ----------\n items : list-like\n Keep labels from axis which are in items.\n like : string\n Keep labels from axis for which \"like in label == True\".\n regex : string (regular expression)\n Keep labels from axis for which re.search(regex, label) == True.\n axis : int or string axis name\n The axis to filter on. By default this is the info axis,\n 'index' for Series, 'columns' for DataFrame.\n\n Returns\n -------\n same type as input object\n\n See Also\n --------\n DataFrame.loc\n\n Notes\n -----\n The ``items``, ``like``, and ``regex`` parameters are\n enforced to be mutually exclusive.\n\n ``axis`` defaults to the info axis that is used when indexing\n with ``[]``.\n\n Examples\n --------\n >>> df = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])),\n ... index=['mouse', 'rabbit'],\n ... columns=['one', 'two', 'three'])\n\n >>> # select columns by name\n >>> df.filter(items=['one', 'three'])\n one three\n mouse 1 3\n rabbit 4 6\n\n >>> # select columns by regular expression\n >>> df.filter(regex='e$', axis=1)\n one three\n mouse 1 3\n rabbit 4 6\n\n >>> # select rows containing 'bbi'\n >>> df.filter(like='bbi', axis=0)\n one two three\n rabbit 4 5 6\n \"\"\"\n import re\n\n nkw = com.count_not_none(items, like, regex)\n if nkw > 1:\n raise TypeError('Keyword arguments `items`, `like`, or `regex` '\n 'are mutually exclusive')\n\n if axis is None:\n axis = self._info_axis_name\n labels = self._get_axis(axis)\n\n if items is not None:\n name = self._get_axis_name(axis)\n return self.reindex(\n **{name: [r for r in items if r in labels]})\n elif like:\n def f(x):\n return like in to_str(x)\n values = labels.map(f)\n return self.loc(axis=axis)[values]\n elif regex:\n def f(x):\n return matcher.search(to_str(x)) is not None\n matcher = re.compile(regex)\n values = labels.map(f)\n return self.loc(axis=axis)[values]\n else:\n raise TypeError('Must pass either `items`, `like`, or `regex`')"} {"instruction": 
"Explain what the following Python 3 code does\ndef sample(self, n=None, frac=None, replace=False, weights=None,\n random_state=None, axis=None):\n \"\"\"\n Return a random sample of items from an axis of object.\n\n You can use `random_state` for reproducibility.\n\n Parameters\n ----------\n n : int, optional\n Number of items from axis to return. Cannot be used with `frac`.\n Default = 1 if `frac` = None.\n frac : float, optional\n Fraction of axis items to return. Cannot be used with `n`.\n replace : bool, default False\n Sample with or without replacement.\n weights : str or ndarray-like, optional\n Default 'None' results in equal probability weighting.\n If passed a Series, will align with target object on index. Index\n values in weights not found in sampled object will be ignored and\n index values in sampled object not in weights will be assigned\n weights of zero.\n If called on a DataFrame, will accept the name of a column\n when axis = 0.\n Unless weights are a Series, weights must be same length as axis\n being sampled.\n If weights do not sum to 1, they will be normalized to sum to 1.\n Missing values in the weights column will be treated as zero.\n Infinite values not allowed.\n random_state : int or numpy.random.RandomState, optional\n Seed for the random number generator (if int), or numpy RandomState\n object.\n axis : int or string, optional\n Axis to sample. Accepts axis number or name. Default is stat axis\n for given data type (0 for Series and DataFrames, 1 for Panels).\n\n Returns\n -------\n Series or DataFrame\n A new object of same type as caller containing `n` items randomly\n sampled from the caller object.\n\n See Also\n --------\n numpy.random.choice: Generates a random sample from a given 1-D numpy\n array.\n\n Examples\n --------\n >>> df = pd.DataFrame({'num_legs': [2, 4, 8, 0],\n ... 'num_wings': [2, 0, 0, 0],\n ... 'num_specimen_seen': [10, 2, 1, 8]},\n ... 
index=['falcon', 'dog', 'spider', 'fish'])\n >>> df\n num_legs num_wings num_specimen_seen\n falcon 2 2 10\n dog 4 0 2\n spider 8 0 1\n fish 0 0 8\n\n Extract 3 random elements from the ``Series`` ``df['num_legs']``:\n Note that we use `random_state` to ensure the reproducibility of\n the examples.\n\n >>> df['num_legs'].sample(n=3, random_state=1)\n fish 0\n spider 8\n falcon 2\n Name: num_legs, dtype: int64\n\n A random 50% sample of the ``DataFrame`` with replacement:\n\n >>> df.sample(frac=0.5, replace=True, random_state=1)\n num_legs num_wings num_specimen_seen\n dog 4 0 2\n fish 0 0 8\n\n Using a DataFrame column as weights. Rows with larger value in the\n `num_specimen_seen` column are more likely to be sampled.\n\n >>> df.sample(n=2, weights='num_specimen_seen', random_state=1)\n num_legs num_wings num_specimen_seen\n falcon 2 2 10\n fish 0 0 8\n \"\"\"\n\n if axis is None:\n axis = self._stat_axis_number\n\n axis = self._get_axis_number(axis)\n axis_length = self.shape[axis]\n\n # Process random_state argument\n rs = com.random_state(random_state)\n\n # Check weights for compliance\n if weights is not None:\n\n # If a series, align with frame\n if isinstance(weights, pd.Series):\n weights = weights.reindex(self.axes[axis])\n\n # Strings acceptable if a dataframe and axis = 0\n if isinstance(weights, str):\n if isinstance(self, pd.DataFrame):\n if axis == 0:\n try:\n weights = self[weights]\n except KeyError:\n raise KeyError(\"String passed to weights not a \"\n \"valid column\")\n else:\n raise ValueError(\"Strings can only be passed to \"\n \"weights when sampling from rows on \"\n \"a DataFrame\")\n else:\n raise ValueError(\"Strings cannot be passed as weights \"\n \"when sampling from a Series or Panel.\")\n\n weights = pd.Series(weights, dtype='float64')\n\n if len(weights) != axis_length:\n raise ValueError(\"Weights and axis to be sampled must be of \"\n \"same length\")\n\n if (weights == np.inf).any() or (weights == -np.inf).any():\n raise 
ValueError(\"weight vector may not include `inf` values\")\n\n if (weights < 0).any():\n raise ValueError(\"weight vector many not include negative \"\n \"values\")\n\n # If has nan, set to zero.\n weights = weights.fillna(0)\n\n # Renormalize if don't sum to 1\n if weights.sum() != 1:\n if weights.sum() != 0:\n weights = weights / weights.sum()\n else:\n raise ValueError(\"Invalid weights: weights sum to zero\")\n\n weights = weights.values\n\n # If no frac or n, default to n=1.\n if n is None and frac is None:\n n = 1\n elif n is not None and frac is None and n % 1 != 0:\n raise ValueError(\"Only integers accepted as `n` values\")\n elif n is None and frac is not None:\n n = int(round(frac * axis_length))\n elif n is not None and frac is not None:\n raise ValueError('Please enter a value for `frac` OR `n`, not '\n 'both')\n\n # Check for negative sizes\n if n < 0:\n raise ValueError(\"A negative number of rows requested. Please \"\n \"provide positive value.\")\n\n locs = rs.choice(axis_length, size=n, replace=replace, p=weights)\n return self.take(locs, axis=axis, is_copy=False)", "response": "Return a random sample of items from a given axis of object."} {"instruction": "Given the following Python 3 function, write the documentation\ndef _dir_additions(self):\n additions = {c for c in self._info_axis.unique(level=0)[:100]\n if isinstance(c, str) and c.isidentifier()}\n return super()._dir_additions().union(additions)", "response": "Add the string - like attributes from the info_axis."} {"instruction": "Create a Python 3 function to\nprotect the consolidated data", "response": "def _protect_consolidate(self, f):\n \"\"\"Consolidate _data -- if the blocks have changed, then clear the\n cache\n \"\"\"\n blocks_before = len(self._data.blocks)\n result = f()\n if len(self._data.blocks) != blocks_before:\n self._clear_item_cache()\n return result"} {"instruction": "Explain what the following Python 3 code does\ndef _consolidate_inplace(self):\n\n def f():\n 
self._data = self._data.consolidate()\n\n self._protect_consolidate(f)", "response": "Consolidate data in place and return None"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _consolidate(self, inplace=False):\n inplace = validate_bool_kwarg(inplace, 'inplace')\n if inplace:\n self._consolidate_inplace()\n else:\n f = lambda: self._data.consolidate()\n cons_data = self._protect_consolidate(f)\n return self._constructor(cons_data).__finalize__(self)", "response": "Compute the consolidated internals of the NDFrame."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _check_inplace_setting(self, value):\n\n if self._is_mixed_type:\n if not self._is_numeric_mixed_type:\n\n # allow an actual np.nan thru\n try:\n if np.isnan(value):\n return True\n except Exception:\n pass\n\n raise TypeError('Cannot do inplace boolean setting on '\n 'mixed-types with a non np.nan value')\n\n return True", "response": "check whether we allow in - place setting with this type of value"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef as_matrix(self, columns=None):\n warnings.warn(\"Method .as_matrix will be removed in a future version. \"\n \"Use .values instead.\", FutureWarning, stacklevel=2)\n self._consolidate_inplace()\n return self._data.as_array(transpose=self._AXIS_REVERSED,\n items=columns)", "response": "Convert the DataFrame to a Numpy - matrix representation."} {"instruction": "Can you create a Python 3 function that\nreturns a Numpy representation of the DataFrame.", "response": "def values(self):\n \"\"\"\n Return a Numpy representation of the DataFrame.\n\n .. 
warning::\n\n We recommend using :meth:`DataFrame.to_numpy` instead.\n\n Only the values in the DataFrame will be returned, the axes labels\n will be removed.\n\n Returns\n -------\n numpy.ndarray\n The values of the DataFrame.\n\n See Also\n --------\n DataFrame.to_numpy : Recommended alternative to this method.\n DataFrame.index : Retrieve the index labels.\n DataFrame.columns : Retrieving the column names.\n\n Notes\n -----\n The dtype will be a lower-common-denominator dtype (implicit\n upcasting); that is to say if the dtypes (even of numeric types)\n are mixed, the one that accommodates all will be chosen. Use this\n with care if you are not dealing with the blocks.\n\n e.g. If the dtypes are float16 and float32, dtype will be upcast to\n float32. If dtypes are int32 and uint8, dtype will be upcast to\n int32. By :func:`numpy.find_common_type` convention, mixing int64\n and uint64 will result in a float64 dtype.\n\n Examples\n --------\n A DataFrame where all columns are the same type (e.g., int64) results\n in an array of the same type.\n\n >>> df = pd.DataFrame({'age': [ 3, 29],\n ... 'height': [94, 170],\n ... 'weight': [31, 115]})\n >>> df\n age height weight\n 0 3 94 31\n 1 29 170 115\n >>> df.dtypes\n age int64\n height int64\n weight int64\n dtype: object\n >>> df.values\n array([[ 3, 94, 31],\n [ 29, 170, 115]], dtype=int64)\n\n A DataFrame with mixed type columns(e.g., str/object, int64, float32)\n results in an ndarray of the broadest type that accommodates these\n mixed types (e.g., object).\n\n >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'),\n ... ('lion', 80.5, 1),\n ... ('monkey', np.nan, None)],\n ... 
columns=('name', 'max_speed', 'rank'))\n >>> df2.dtypes\n name object\n max_speed float64\n rank object\n dtype: object\n >>> df2.values\n array([['parrot', 24.0, 'second'],\n ['lion', 80.5, 1],\n ['monkey', nan, None]], dtype=object)\n \"\"\"\n self._consolidate_inplace()\n return self._data.as_array(transpose=self._AXIS_REVERSED)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef get_ftype_counts(self):\n warnings.warn(\"get_ftype_counts is deprecated and will \"\n \"be removed in a future version\",\n FutureWarning, stacklevel=2)\n\n from pandas import Series\n return Series(self._data.get_ftype_counts())", "response": "Return counts of unique ftypes in this object."} {"instruction": "Write a Python 3 script for\nreturning the dtypes in the DataFrame.", "response": "def dtypes(self):\n \"\"\"\n Return the dtypes in the DataFrame.\n\n This returns a Series with the data type of each column.\n The result's index is the original DataFrame's columns. Columns\n with mixed types are stored with the ``object`` dtype. See\n :ref:`the User Guide ` for more.\n\n Returns\n -------\n pandas.Series\n The data type of each column.\n\n See Also\n --------\n DataFrame.ftypes : Dtype and sparsity information.\n\n Examples\n --------\n >>> df = pd.DataFrame({'float': [1.0],\n ... 'int': [1],\n ... 'datetime': [pd.Timestamp('20180310')],\n ... 
'string': ['foo']})\n >>> df.dtypes\n float float64\n int int64\n datetime datetime64[ns]\n string object\n dtype: object\n \"\"\"\n from pandas import Series\n return Series(self._data.get_dtypes(), index=self._info_axis,\n dtype=np.object_)"} {"instruction": "Make a summary of the following Python 3 code\ndef ftypes(self):\n from pandas import Series\n return Series(self._data.get_ftypes(), index=self._info_axis,\n dtype=np.object_)", "response": "Return the ftypes of each column in the original DataFrame."} {"instruction": "Create a Python 3 function to\nconvert the frame to a dict of types that each has has a homogeneous dtype.", "response": "def as_blocks(self, copy=True):\n \"\"\"\n Convert the frame to a dict of dtype -> Constructor Types that each has\n a homogeneous dtype.\n\n .. deprecated:: 0.21.0\n\n NOTE: the dtypes of the blocks WILL BE PRESERVED HERE (unlike in\n as_matrix)\n\n Parameters\n ----------\n copy : boolean, default True\n\n Returns\n -------\n values : a dict of dtype -> Constructor Types\n \"\"\"\n warnings.warn(\"as_blocks is deprecated and will \"\n \"be removed in a future version\",\n FutureWarning, stacklevel=2)\n return self._to_dict_of_blocks(copy=copy)"} {"instruction": "Can you write a function in Python 3 where it\nreturns a dict of dtype - > constructor Types that are homogeneous", "response": "def _to_dict_of_blocks(self, copy=True):\n \"\"\"\n Return a dict of dtype -> Constructor Types that\n each is a homogeneous dtype.\n\n Internal ONLY\n \"\"\"\n return {k: self._constructor(v).__finalize__(self)\n for k, v, in self._data.to_dict(copy=copy).items()}"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef astype(self, dtype, copy=True, errors='raise', **kwargs):\n if is_dict_like(dtype):\n if self.ndim == 1: # i.e. 
Series\n if len(dtype) > 1 or self.name not in dtype:\n raise KeyError('Only the Series name can be used for '\n 'the key in Series dtype mappings.')\n new_type = dtype[self.name]\n return self.astype(new_type, copy, errors, **kwargs)\n elif self.ndim > 2:\n raise NotImplementedError(\n 'astype() only accepts a dtype arg of type dict when '\n 'invoked on Series and DataFrames. A single dtype must be '\n 'specified when invoked on a Panel.'\n )\n for col_name in dtype.keys():\n if col_name not in self:\n raise KeyError('Only a column name can be used for the '\n 'key in a dtype mappings argument.')\n results = []\n for col_name, col in self.iteritems():\n if col_name in dtype:\n results.append(col.astype(dtype=dtype[col_name], copy=copy,\n errors=errors, **kwargs))\n else:\n results.append(results.append(col.copy() if copy else col))\n\n elif is_extension_array_dtype(dtype) and self.ndim > 1:\n # GH 18099/22869: columnwise conversion to extension dtype\n # GH 24704: use iloc to handle duplicate column names\n results = (self.iloc[:, i].astype(dtype, copy=copy)\n for i in range(len(self.columns)))\n\n else:\n # else, only a single dtype is given\n new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,\n **kwargs)\n return self._constructor(new_data).__finalize__(self)\n\n # GH 19920: retain column metadata after concat\n result = pd.concat(results, axis=1, copy=False)\n result.columns = self.columns\n return result", "response": "Cast a pandas object to a specified type."} {"instruction": "Given the following Python 3 function, write the documentation\ndef copy(self, deep=True):\n data = self._data.copy(deep=deep)\n return self._constructor(data).__finalize__(self)", "response": "Make a deep copy of the object s data and indices and data."} {"instruction": "Write a Python 3 function for\nconverting the object to the correct type.", "response": "def _convert(self, datetime=False, numeric=False, timedelta=False,\n coerce=False, copy=True):\n \"\"\"\n 
Attempt to infer better dtype for object columns\n\n Parameters\n ----------\n datetime : boolean, default False\n If True, convert to date where possible.\n numeric : boolean, default False\n If True, attempt to convert to numbers (including strings), with\n unconvertible values becoming NaN.\n timedelta : boolean, default False\n If True, convert to timedelta where possible.\n coerce : boolean, default False\n If True, force conversion with unconvertible values converted to\n nulls (NaN or NaT)\n copy : boolean, default True\n If True, return a copy even if no copy is necessary (e.g. no\n conversion was done). Note: This is meant for internal use, and\n should not be confused with inplace.\n\n Returns\n -------\n converted : same as input object\n \"\"\"\n return self._constructor(\n self._data.convert(datetime=datetime, numeric=numeric,\n timedelta=timedelta, coerce=coerce,\n copy=copy)).__finalize__(self)"} {"instruction": "Create a Python 3 function for\nconverting the object columns of the object to datetime timedelta or NaT.", "response": "def convert_objects(self, convert_dates=True, convert_numeric=False,\n convert_timedeltas=True, copy=True):\n \"\"\"\n Attempt to infer better dtype for object columns.\n\n .. deprecated:: 0.21.0\n\n Parameters\n ----------\n convert_dates : boolean, default True\n If True, convert to date where possible. If 'coerce', force\n conversion, with unconvertible values becoming NaT.\n convert_numeric : boolean, default False\n If True, attempt to coerce to numbers (including strings), with\n unconvertible values becoming NaN.\n convert_timedeltas : boolean, default True\n If True, convert to timedelta where possible. If 'coerce', force\n conversion, with unconvertible values becoming NaT.\n copy : boolean, default True\n If True, return a copy even if no copy is necessary (e.g. no\n conversion was done). 
Note: This is meant for internal use, and\n should not be confused with inplace.\n\n Returns\n -------\n converted : same as input object\n\n See Also\n --------\n to_datetime : Convert argument to datetime.\n to_timedelta : Convert argument to timedelta.\n to_numeric : Convert argument to numeric type.\n \"\"\"\n msg = (\"convert_objects is deprecated. To re-infer data dtypes for \"\n \"object columns, use {klass}.infer_objects()\\nFor all \"\n \"other conversions use the data-type specific converters \"\n \"pd.to_datetime, pd.to_timedelta and pd.to_numeric.\"\n ).format(klass=self.__class__.__name__)\n warnings.warn(msg, FutureWarning, stacklevel=2)\n\n return self._constructor(\n self._data.convert(convert_dates=convert_dates,\n convert_numeric=convert_numeric,\n convert_timedeltas=convert_timedeltas,\n copy=copy)).__finalize__(self)"} {"instruction": "Write a Python 3 script for\ninferring objects for the given object - dtyped columns.", "response": "def infer_objects(self):\n \"\"\"\n Attempt to infer better dtypes for object columns.\n\n Attempts soft conversion of object-dtyped\n columns, leaving non-object and unconvertible\n columns unchanged. The inference rules are the\n same as during normal Series/DataFrame construction.\n\n .. 
versionadded:: 0.21.0\n\n Returns\n -------\n converted : same type as input object\n\n See Also\n --------\n to_datetime : Convert argument to datetime.\n to_timedelta : Convert argument to timedelta.\n to_numeric : Convert argument to numeric type.\n\n Examples\n --------\n >>> df = pd.DataFrame({\"A\": [\"a\", 1, 2, 3]})\n >>> df = df.iloc[1:]\n >>> df\n A\n 1 1\n 2 2\n 3 3\n\n >>> df.dtypes\n A object\n dtype: object\n\n >>> df.infer_objects().dtypes\n A int64\n dtype: object\n \"\"\"\n # numeric=False necessary to only soft convert;\n # python objects will still be converted to\n # native numpy numeric types\n return self._constructor(\n self._data.convert(datetime=True, numeric=False,\n timedelta=True, coerce=False,\n copy=True)).__finalize__(self)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef fillna(self, value=None, method=None, axis=None, inplace=False,\n limit=None, downcast=None):\n \"\"\"\n Fill NA/NaN values using the specified method.\n\n Parameters\n ----------\n value : scalar, dict, Series, or DataFrame\n Value to use to fill holes (e.g. 0), alternately a\n dict/Series/DataFrame of values specifying which value to use for\n each index (for a Series) or column (for a DataFrame). Values not\n in the dict/Series/DataFrame will not be filled. This value cannot\n be a list.\n method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None\n Method to use for filling holes in reindexed Series\n pad / ffill: propagate last valid observation forward to next valid\n backfill / bfill: use next valid observation to fill gap.\n axis : %(axes_single_arg)s\n Axis along which to fill missing values.\n inplace : bool, default False\n If True, fill in-place. Note: this will modify any\n other views on this object (e.g., a no-copy slice for a column in a\n DataFrame).\n limit : int, default None\n If method is specified, this is the maximum number of consecutive\n NaN values to forward/backward fill. 
In other words, if there is\n a gap with more than this number of consecutive NaNs, it will only\n be partially filled. If method is not specified, this is the\n maximum number of entries along the entire axis where NaNs will be\n filled. Must be greater than 0 if not None.\n downcast : dict, default is None\n A dict of item->dtype of what to downcast if possible,\n or the string 'infer' which will try to downcast to an appropriate\n equal type (e.g. float64 to int64 if possible).\n\n Returns\n -------\n %(klass)s\n Object with missing values filled.\n\n See Also\n --------\n interpolate : Fill NaN values using interpolation.\n reindex : Conform object to new index.\n asfreq : Convert TimeSeries to specified frequency.\n\n Examples\n --------\n >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0],\n ... [3, 4, np.nan, 1],\n ... [np.nan, np.nan, np.nan, 5],\n ... [np.nan, 3, np.nan, 4]],\n ... columns=list('ABCD'))\n >>> df\n A B C D\n 0 NaN 2.0 NaN 0\n 1 3.0 4.0 NaN 1\n 2 NaN NaN NaN 5\n 3 NaN 3.0 NaN 4\n\n Replace all NaN elements with 0s.\n\n >>> df.fillna(0)\n A B C D\n 0 0.0 2.0 0.0 0\n 1 3.0 4.0 0.0 1\n 2 0.0 0.0 0.0 5\n 3 0.0 3.0 0.0 4\n\n We can also propagate non-null values forward or backward.\n\n >>> df.fillna(method='ffill')\n A B C D\n 0 NaN 2.0 NaN 0\n 1 3.0 4.0 NaN 1\n 2 3.0 4.0 NaN 5\n 3 3.0 3.0 NaN 4\n\n Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1,\n 2, and 3 respectively.\n\n >>> values = {'A': 0, 'B': 1, 'C': 2, 'D': 3}\n >>> df.fillna(value=values)\n A B C D\n 0 0.0 2.0 2.0 0\n 1 3.0 4.0 2.0 1\n 2 0.0 1.0 2.0 5\n 3 0.0 3.0 2.0 4\n\n Only replace the first NaN element.\n\n >>> df.fillna(value=values, limit=1)\n A B C D\n 0 0.0 2.0 2.0 0\n 1 3.0 4.0 NaN 1\n 2 NaN 1.0 NaN 5\n 3 NaN 3.0 NaN 4\n \"\"\"\n inplace = validate_bool_kwarg(inplace, 'inplace')\n value, method = validate_fillna_kwargs(value, method)\n\n self._consolidate_inplace()\n\n # set the default here, so functions examining the signaure\n # can detect if something 
was set (e.g. in groupby) (GH9221)\n if axis is None:\n axis = 0\n axis = self._get_axis_number(axis)\n\n from pandas import DataFrame\n if value is None:\n\n if self._is_mixed_type and axis == 1:\n if inplace:\n raise NotImplementedError()\n result = self.T.fillna(method=method, limit=limit).T\n\n # need to downcast here because of all of the transposes\n result._data = result._data.downcast()\n\n return result\n\n # > 3d\n if self.ndim > 3:\n raise NotImplementedError('Cannot fillna with a method for > '\n '3dims')\n\n # 3d\n elif self.ndim == 3:\n # fill in 2d chunks\n result = {col: s.fillna(method=method, value=value)\n for col, s in self.iteritems()}\n prelim_obj = self._constructor.from_dict(result)\n new_obj = prelim_obj.__finalize__(self)\n new_data = new_obj._data\n\n else:\n # 2d or less\n new_data = self._data.interpolate(method=method, axis=axis,\n limit=limit, inplace=inplace,\n coerce=True,\n downcast=downcast)\n else:\n if len(self._get_axis(axis)) == 0:\n return self\n\n if self.ndim == 1:\n if isinstance(value, (dict, ABCSeries)):\n from pandas import Series\n value = Series(value)\n elif not is_list_like(value):\n pass\n else:\n raise TypeError('\"value\" parameter must be a scalar, dict '\n 'or Series, but you passed a '\n '\"{0}\"'.format(type(value).__name__))\n\n new_data = self._data.fillna(value=value, limit=limit,\n inplace=inplace,\n downcast=downcast)\n\n elif isinstance(value, (dict, ABCSeries)):\n if axis == 1:\n raise NotImplementedError('Currently only can fill '\n 'with dict/Series column '\n 'by column')\n\n result = self if inplace else self.copy()\n for k, v in value.items():\n if k not in result:\n continue\n obj = result[k]\n obj.fillna(v, limit=limit, inplace=True, downcast=downcast)\n return result if not inplace else None\n\n elif not is_list_like(value):\n new_data = self._data.fillna(value=value, limit=limit,\n inplace=inplace,\n downcast=downcast)\n elif isinstance(value, DataFrame) and self.ndim == 2:\n new_data = 
self.where(self.notna(), value)\n else:\n raise ValueError(\"invalid fill value with a %s\" % type(value))\n\n if inplace:\n self._update_inplace(new_data)\n else:\n return self._constructor(new_data).__finalize__(self)", "response": "Fill missing values with NaN values."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef interpolate(self, method='linear', axis=0, limit=None, inplace=False,\n limit_direction='forward', limit_area=None,\n downcast=None, **kwargs):\n \"\"\"\n Interpolate values according to different methods.\n \"\"\"\n inplace = validate_bool_kwarg(inplace, 'inplace')\n\n if self.ndim > 2:\n raise NotImplementedError(\"Interpolate has not been implemented \"\n \"on Panel and Panel 4D objects.\")\n\n if axis == 0:\n ax = self._info_axis_name\n _maybe_transposed_self = self\n elif axis == 1:\n _maybe_transposed_self = self.T\n ax = 1\n else:\n _maybe_transposed_self = self\n ax = _maybe_transposed_self._get_axis_number(ax)\n\n if _maybe_transposed_self.ndim == 2:\n alt_ax = 1 - ax\n else:\n alt_ax = ax\n\n if (isinstance(_maybe_transposed_self.index, MultiIndex) and\n method != 'linear'):\n raise ValueError(\"Only `method=linear` interpolation is supported \"\n \"on MultiIndexes.\")\n\n if _maybe_transposed_self._data.get_dtype_counts().get(\n 'object') == len(_maybe_transposed_self.T):\n raise TypeError(\"Cannot interpolate with all object-dtype columns \"\n \"in the DataFrame. 
Try setting at least one \"\n \"column to a numeric dtype.\")\n\n # create/use the index\n if method == 'linear':\n # prior default\n index = np.arange(len(_maybe_transposed_self._get_axis(alt_ax)))\n else:\n index = _maybe_transposed_self._get_axis(alt_ax)\n methods = {\"index\", \"values\", \"nearest\", \"time\"}\n is_numeric_or_datetime = (\n is_numeric_dtype(index) or\n is_datetime64_dtype(index) or\n is_timedelta64_dtype(index)\n )\n if method not in methods and not is_numeric_or_datetime:\n raise ValueError(\n \"Index column must be numeric or datetime type when \"\n \"using {method} method other than linear. \"\n \"Try setting a numeric or datetime index column before \"\n \"interpolating.\".format(method=method))\n\n if isna(index).any():\n raise NotImplementedError(\"Interpolation with NaNs in the index \"\n \"has not been implemented. Try filling \"\n \"those NaNs before interpolating.\")\n data = _maybe_transposed_self._data\n new_data = data.interpolate(method=method, axis=ax, index=index,\n values=_maybe_transposed_self, limit=limit,\n limit_direction=limit_direction,\n limit_area=limit_area,\n inplace=inplace, downcast=downcast,\n **kwargs)\n\n if inplace:\n if axis == 1:\n new_data = self._constructor(new_data).T._data\n self._update_inplace(new_data)\n else:\n res = self._constructor(new_data).__finalize__(self)\n if axis == 1:\n res = res.T\n return res", "response": "Interpolate values according to different methods."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef asof(self, where, subset=None):\n if isinstance(where, str):\n from pandas import to_datetime\n where = to_datetime(where)\n\n if not self.index.is_monotonic:\n raise ValueError(\"asof requires a sorted index\")\n\n is_series = isinstance(self, ABCSeries)\n if is_series:\n if subset is not None:\n raise ValueError(\"subset is not valid for Series\")\n elif self.ndim > 2:\n raise NotImplementedError(\"asof is not implemented \"\n \"for 
{type}\".format(type=type(self)))\n else:\n if subset is None:\n subset = self.columns\n if not is_list_like(subset):\n subset = [subset]\n\n is_list = is_list_like(where)\n if not is_list:\n start = self.index[0]\n if isinstance(self.index, PeriodIndex):\n where = Period(where, freq=self.index.freq).ordinal\n start = start.ordinal\n\n if where < start:\n if not is_series:\n from pandas import Series\n return Series(index=self.columns, name=where)\n return np.nan\n\n # It's always much faster to use a *while* loop here for\n # Series than pre-computing all the NAs. However a\n # *while* loop is extremely expensive for DataFrame\n # so we later pre-compute all the NAs and use the same\n # code path whether *where* is a scalar or list.\n # See PR: https://github.com/pandas-dev/pandas/pull/14476\n if is_series:\n loc = self.index.searchsorted(where, side='right')\n if loc > 0:\n loc -= 1\n\n values = self._values\n while loc > 0 and isna(values[loc]):\n loc -= 1\n return values[loc]\n\n if not isinstance(where, Index):\n where = Index(where) if is_list else Index([where])\n\n nulls = self.isna() if is_series else self[subset].isna().any(1)\n if nulls.all():\n if is_series:\n return self._constructor(np.nan, index=where, name=self.name)\n elif is_list:\n from pandas import DataFrame\n return DataFrame(np.nan, index=where, columns=self.columns)\n else:\n from pandas import Series\n return Series(np.nan, index=self.columns, name=where[0])\n\n locs = self.index.asof_locs(where, ~(nulls.values))\n\n # mask the missing\n missing = locs == -1\n data = self.take(locs, is_copy=False)\n data.index = where\n data.loc[missing] = np.nan\n return data if is_list else data.iloc[-1]", "response": "Return the last row without any NaNs before where."} {"instruction": "Implement a Python 3 function for\nclipping the values of a specific column of the data.", "response": "def clip(self, lower=None, upper=None, axis=None, inplace=False,\n *args, **kwargs):\n \"\"\"\n Trim values at input 
threshold(s).\n\n Assigns values outside boundary to boundary values. Thresholds\n can be singular values or array like, and in the latter case\n the clipping is performed element-wise in the specified axis.\n\n Parameters\n ----------\n lower : float or array_like, default None\n Minimum threshold value. All values below this\n threshold will be set to it.\n upper : float or array_like, default None\n Maximum threshold value. All values above this\n threshold will be set to it.\n axis : int or str axis name, optional\n Align object with lower and upper along the given axis.\n inplace : bool, default False\n Whether to perform the operation in place on the data.\n\n .. versionadded:: 0.21.0\n *args, **kwargs\n Additional keywords have no effect but might be accepted\n for compatibility with numpy.\n\n Returns\n -------\n Series or DataFrame\n Same type as calling object with the values outside the\n clip boundaries replaced.\n\n Examples\n --------\n >>> data = {'col_0': [9, -3, 0, -1, 5], 'col_1': [-2, -7, 6, 8, -5]}\n >>> df = pd.DataFrame(data)\n >>> df\n col_0 col_1\n 0 9 -2\n 1 -3 -7\n 2 0 6\n 3 -1 8\n 4 5 -5\n\n Clips per column using lower and upper thresholds:\n\n >>> df.clip(-4, 6)\n col_0 col_1\n 0 6 -2\n 1 -3 -4\n 2 0 6\n 3 -1 6\n 4 5 -4\n\n Clips using specific lower and upper thresholds per column element:\n\n >>> t = pd.Series([2, -4, -1, 6, 3])\n >>> t\n 0 2\n 1 -4\n 2 -1\n 3 6\n 4 3\n dtype: int64\n\n >>> df.clip(t, t + 4, axis=0)\n col_0 col_1\n 0 6 2\n 1 -3 -4\n 2 0 3\n 3 6 8\n 4 5 3\n \"\"\"\n if isinstance(self, ABCPanel):\n raise NotImplementedError(\"clip is not supported yet for panels\")\n\n inplace = validate_bool_kwarg(inplace, 'inplace')\n\n axis = nv.validate_clip_with_axis(axis, args, kwargs)\n if axis is not None:\n axis = self._get_axis_number(axis)\n\n # GH 17276\n # numpy doesn't like NaN as a clip value\n # so ignore\n # GH 19992\n # numpy doesn't drop a list-like bound containing NaN\n if not is_list_like(lower) and 
np.any(pd.isnull(lower)):\n lower = None\n if not is_list_like(upper) and np.any(pd.isnull(upper)):\n upper = None\n\n # GH 2747 (arguments were reversed)\n if lower is not None and upper is not None:\n if is_scalar(lower) and is_scalar(upper):\n lower, upper = min(lower, upper), max(lower, upper)\n\n # fast-path for scalars\n if ((lower is None or (is_scalar(lower) and is_number(lower))) and\n (upper is None or (is_scalar(upper) and is_number(upper)))):\n return self._clip_with_scalar(lower, upper, inplace=inplace)\n\n result = self\n if lower is not None:\n result = result._clip_with_one_bound(lower, method=self.ge,\n axis=axis, inplace=inplace)\n if upper is not None:\n if inplace:\n result = self\n result = result._clip_with_one_bound(upper, method=self.le,\n axis=axis, inplace=inplace)\n\n return result"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef clip_upper(self, threshold, axis=None, inplace=False):\n warnings.warn('clip_upper(threshold) is deprecated, '\n 'use clip(upper=threshold) instead',\n FutureWarning, stacklevel=2)\n return self._clip_with_one_bound(threshold, method=self.le,\n axis=axis, inplace=inplace)", "response": "Trim values below a given threshold."} {"instruction": "Write a Python 3 script for\ntrimming values below a given threshold.", "response": "def clip_lower(self, threshold, axis=None, inplace=False):\n \"\"\"\n Trim values below a given threshold.\n\n .. deprecated:: 0.24.0\n Use clip(lower=threshold) instead.\n\n Elements below the `threshold` will be changed to match the\n `threshold` value(s). Threshold can be a single value or an array,\n in the latter case it performs the truncation element-wise.\n\n Parameters\n ----------\n threshold : numeric or array-like\n Minimum value allowed. All values below threshold will be set to\n this value.\n\n * float : every value is compared to `threshold`.\n * array-like : The shape of `threshold` should match the object\n it's compared to. 
When `self` is a Series, `threshold` should be\n the length. When `self` is a DataFrame, `threshold` should 2-D\n and the same shape as `self` for ``axis=None``, or 1-D and the\n same length as the axis being compared.\n\n axis : {0 or 'index', 1 or 'columns'}, default 0\n Align `self` with `threshold` along the given axis.\n\n inplace : bool, default False\n Whether to perform the operation in place on the data.\n\n .. versionadded:: 0.21.0\n\n Returns\n -------\n Series or DataFrame\n Original data with values trimmed.\n\n See Also\n --------\n Series.clip : General purpose method to trim Series values to given\n threshold(s).\n DataFrame.clip : General purpose method to trim DataFrame values to\n given threshold(s).\n\n Examples\n --------\n\n Series single threshold clipping:\n\n >>> s = pd.Series([5, 6, 7, 8, 9])\n >>> s.clip(lower=8)\n 0 8\n 1 8\n 2 8\n 3 8\n 4 9\n dtype: int64\n\n Series clipping element-wise using an array of thresholds. `threshold`\n should be the same length as the Series.\n\n >>> elemwise_thresholds = [4, 8, 7, 2, 5]\n >>> s.clip(lower=elemwise_thresholds)\n 0 5\n 1 8\n 2 7\n 3 8\n 4 9\n dtype: int64\n\n DataFrames can be compared to a scalar.\n\n >>> df = pd.DataFrame({\"A\": [1, 3, 5], \"B\": [2, 4, 6]})\n >>> df\n A B\n 0 1 2\n 1 3 4\n 2 5 6\n\n >>> df.clip(lower=3)\n A B\n 0 3 3\n 1 3 4\n 2 5 6\n\n Or to an array of values. By default, `threshold` should be the same\n shape as the DataFrame.\n\n >>> df.clip(lower=np.array([[3, 4], [2, 2], [6, 2]]))\n A B\n 0 3 4\n 1 3 4\n 2 6 6\n\n Control how `threshold` is broadcast with `axis`. 
In this case\n `threshold` should be the same length as the axis specified by\n `axis`.\n\n >>> df.clip(lower=[3, 3, 5], axis='index')\n A B\n 0 3 3\n 1 3 4\n 2 5 6\n\n >>> df.clip(lower=[4, 5], axis='columns')\n A B\n 0 4 5\n 1 4 5\n 2 5 6\n \"\"\"\n warnings.warn('clip_lower(threshold) is deprecated, '\n 'use clip(lower=threshold) instead',\n FutureWarning, stacklevel=2)\n return self._clip_with_one_bound(threshold, method=self.ge,\n axis=axis, inplace=inplace)"} {"instruction": "Write a Python 3 function that can\ngroup by a DataFrame or Series or Series.", "response": "def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,\n group_keys=True, squeeze=False, observed=False, **kwargs):\n \"\"\"\n Group DataFrame or Series using a mapper or by a Series of columns.\n\n A groupby operation involves some combination of splitting the\n object, applying a function, and combining the results. This can be\n used to group large amounts of data and compute operations on these\n groups.\n\n Parameters\n ----------\n by : mapping, function, label, or list of labels\n Used to determine the groups for the groupby.\n If ``by`` is a function, it's called on each value of the object's\n index. If a dict or Series is passed, the Series or dict VALUES\n will be used to determine the groups (the Series' values are first\n aligned; see ``.align()`` method). If an ndarray is passed, the\n values are used as-is determine the groups. A label or list of\n labels may be passed to group by the columns in ``self``. Notice\n that a tuple is interpreted a (single) key.\n axis : {0 or 'index', 1 or 'columns'}, default 0\n Split along rows (0) or columns (1).\n level : int, level name, or sequence of such, default None\n If the axis is a MultiIndex (hierarchical), group by a particular\n level or levels.\n as_index : bool, default True\n For aggregated output, return object with group labels as the\n index. Only relevant for DataFrame input. 
as_index=False is\n effectively \"SQL-style\" grouped output.\n sort : bool, default True\n Sort group keys. Get better performance by turning this off.\n Note this does not influence the order of observations within each\n group. Groupby preserves the order of rows within each group.\n group_keys : bool, default True\n When calling apply, add group keys to index to identify pieces.\n squeeze : bool, default False\n Reduce the dimensionality of the return type if possible,\n otherwise return a consistent type.\n observed : bool, default False\n This only applies if any of the groupers are Categoricals.\n If True: only show observed values for categorical groupers.\n If False: show all values for categorical groupers.\n\n .. versionadded:: 0.23.0\n\n **kwargs\n Optional, only accepts keyword argument 'mutated' and is passed\n to groupby.\n\n Returns\n -------\n DataFrameGroupBy or SeriesGroupBy\n Depends on the calling object and returns groupby object that\n contains information about the groups.\n\n See Also\n --------\n resample : Convenience method for frequency conversion and resampling\n of time series.\n\n Notes\n -----\n See the `user guide\n `_ for more.\n\n Examples\n --------\n >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',\n ... 'Parrot', 'Parrot'],\n ... 'Max Speed': [380., 370., 24., 26.]})\n >>> df\n Animal Max Speed\n 0 Falcon 380.0\n 1 Falcon 370.0\n 2 Parrot 24.0\n 3 Parrot 26.0\n >>> df.groupby(['Animal']).mean()\n Max Speed\n Animal\n Falcon 375.0\n Parrot 25.0\n\n **Hierarchical Indexes**\n\n We can groupby different levels of a hierarchical index\n using the `level` parameter:\n\n >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],\n ... ['Captive', 'Wild', 'Captive', 'Wild']]\n >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))\n >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]},\n ... 
index=index)\n >>> df\n Max Speed\n Animal Type\n Falcon Captive 390.0\n Wild 350.0\n Parrot Captive 30.0\n Wild 20.0\n >>> df.groupby(level=0).mean()\n Max Speed\n Animal\n Falcon 370.0\n Parrot 25.0\n >>> df.groupby(level=1).mean()\n Max Speed\n Type\n Captive 210.0\n Wild 185.0\n \"\"\"\n from pandas.core.groupby.groupby import groupby\n\n if level is None and by is None:\n raise TypeError(\"You have to supply one of 'by' and 'level'\")\n axis = self._get_axis_number(axis)\n return groupby(self, by=by, axis=axis, level=level, as_index=as_index,\n sort=sort, group_keys=group_keys, squeeze=squeeze,\n observed=observed, **kwargs)"} {"instruction": "How would you explain what the following Python 3 function does\ndef asfreq(self, freq, method=None, how=None, normalize=False,\n fill_value=None):\n \"\"\"\n Convert TimeSeries to specified frequency.\n\n Optionally provide filling method to pad/backfill missing values.\n\n Returns the original data conformed to a new index with the specified\n frequency. ``resample`` is more appropriate if an operation, such as\n summarization, is necessary to represent the data at the new frequency.\n\n Parameters\n ----------\n freq : DateOffset object, or string\n method : {'backfill'/'bfill', 'pad'/'ffill'}, default None\n Method to use for filling holes in reindexed Series (note this\n does not fill NaNs that already were present):\n\n * 'pad' / 'ffill': propagate last valid observation forward to next\n valid\n * 'backfill' / 'bfill': use NEXT valid observation to fill\n how : {'start', 'end'}, default end\n For PeriodIndex only, see PeriodIndex.asfreq\n normalize : bool, default False\n Whether to reset output index to midnight\n fill_value : scalar, optional\n Value to use for missing values, applied during upsampling (note\n this does not fill NaNs that already were present).\n\n .. 
versionadded:: 0.20.0\n\n Returns\n -------\n converted : same type as caller\n\n See Also\n --------\n reindex\n\n Notes\n -----\n To learn more about the frequency strings, please see `this link\n `__.\n\n Examples\n --------\n\n Start by creating a series with 4 one minute timestamps.\n\n >>> index = pd.date_range('1/1/2000', periods=4, freq='T')\n >>> series = pd.Series([0.0, None, 2.0, 3.0], index=index)\n >>> df = pd.DataFrame({'s':series})\n >>> df\n s\n 2000-01-01 00:00:00 0.0\n 2000-01-01 00:01:00 NaN\n 2000-01-01 00:02:00 2.0\n 2000-01-01 00:03:00 3.0\n\n Upsample the series into 30 second bins.\n\n >>> df.asfreq(freq='30S')\n s\n 2000-01-01 00:00:00 0.0\n 2000-01-01 00:00:30 NaN\n 2000-01-01 00:01:00 NaN\n 2000-01-01 00:01:30 NaN\n 2000-01-01 00:02:00 2.0\n 2000-01-01 00:02:30 NaN\n 2000-01-01 00:03:00 3.0\n\n Upsample again, providing a ``fill value``.\n\n >>> df.asfreq(freq='30S', fill_value=9.0)\n s\n 2000-01-01 00:00:00 0.0\n 2000-01-01 00:00:30 9.0\n 2000-01-01 00:01:00 NaN\n 2000-01-01 00:01:30 9.0\n 2000-01-01 00:02:00 2.0\n 2000-01-01 00:02:30 9.0\n 2000-01-01 00:03:00 3.0\n\n Upsample again, providing a ``method``.\n\n >>> df.asfreq(freq='30S', method='bfill')\n s\n 2000-01-01 00:00:00 0.0\n 2000-01-01 00:00:30 NaN\n 2000-01-01 00:01:00 NaN\n 2000-01-01 00:01:30 2.0\n 2000-01-01 00:02:00 2.0\n 2000-01-01 00:02:30 3.0\n 2000-01-01 00:03:00 3.0\n \"\"\"\n from pandas.core.resample import asfreq\n return asfreq(self, freq, method=method, how=how, normalize=normalize,\n fill_value=fill_value)", "response": "Convert the TimeSeries to a specified frequency."} {"instruction": "Explain what the following Python 3 code does\ndef at_time(self, time, asof=False, axis=None):\n if axis is None:\n axis = self._stat_axis_number\n axis = self._get_axis_number(axis)\n\n index = self._get_axis(axis)\n try:\n indexer = index.indexer_at_time(time, asof=asof)\n except AttributeError:\n raise TypeError('Index must be DatetimeIndex')\n\n return self._take(indexer, 
axis=axis)", "response": "Return a new Series with the values at particular time of the day."} {"instruction": "Write a Python 3 function for\nreturning a new Series containing the entries in the log that are in the given time range.", "response": "def between_time(self, start_time, end_time, include_start=True,\n include_end=True, axis=None):\n \"\"\"\n Select values between particular times of the day (e.g., 9:00-9:30 AM).\n\n By setting ``start_time`` to be later than ``end_time``,\n you can get the times that are *not* between the two times.\n\n Parameters\n ----------\n start_time : datetime.time or str\n end_time : datetime.time or str\n include_start : bool, default True\n include_end : bool, default True\n axis : {0 or 'index', 1 or 'columns'}, default 0\n\n .. versionadded:: 0.24.0\n\n Returns\n -------\n Series or DataFrame\n\n Raises\n ------\n TypeError\n If the index is not a :class:`DatetimeIndex`\n\n See Also\n --------\n at_time : Select values at a particular time of the day.\n first : Select initial periods of time series based on a date offset.\n last : Select final periods of time series based on a date offset.\n DatetimeIndex.indexer_between_time : Get just the index locations for\n values between particular times of the day.\n\n Examples\n --------\n >>> i = pd.date_range('2018-04-09', periods=4, freq='1D20min')\n >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)\n >>> ts\n A\n 2018-04-09 00:00:00 1\n 2018-04-10 00:20:00 2\n 2018-04-11 00:40:00 3\n 2018-04-12 01:00:00 4\n\n >>> ts.between_time('0:15', '0:45')\n A\n 2018-04-10 00:20:00 2\n 2018-04-11 00:40:00 3\n\n You get the times that are *not* between two times by setting\n ``start_time`` later than ``end_time``:\n\n >>> ts.between_time('0:45', '0:15')\n A\n 2018-04-09 00:00:00 1\n 2018-04-12 01:00:00 4\n \"\"\"\n if axis is None:\n axis = self._stat_axis_number\n axis = self._get_axis_number(axis)\n\n index = self._get_axis(axis)\n try:\n indexer = index.indexer_between_time(\n 
start_time, end_time, include_start=include_start,\n include_end=include_end)\n except AttributeError:\n raise TypeError('Index must be DatetimeIndex')\n\n return self._take(indexer, axis=axis)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef resample(self, rule, how=None, axis=0, fill_method=None, closed=None,\n label=None, convention='start', kind=None, loffset=None,\n limit=None, base=0, on=None, level=None):\n \"\"\"\n Resample time-series data.\n\n Convenience method for frequency conversion and resampling of time\n series. Object must have a datetime-like index (`DatetimeIndex`,\n `PeriodIndex`, or `TimedeltaIndex`), or pass datetime-like values\n to the `on` or `level` keyword.\n\n Parameters\n ----------\n rule : str\n The offset string or object representing target conversion.\n how : str\n Method for down/re-sampling, default to 'mean' for downsampling.\n\n .. deprecated:: 0.18.0\n The new syntax is ``.resample(...).mean()``, or\n ``.resample(...).apply()``\n axis : {0 or 'index', 1 or 'columns'}, default 0\n Which axis to use for up- or down-sampling. For `Series` this\n will default to 0, i.e. along the rows. Must be\n `DatetimeIndex`, `TimedeltaIndex` or `PeriodIndex`.\n fill_method : str, default None\n Filling method for upsampling.\n\n .. deprecated:: 0.18.0\n The new syntax is ``.resample(...).()``,\n e.g. ``.resample(...).pad()``\n closed : {'right', 'left'}, default None\n Which side of bin interval is closed. The default is 'left'\n for all frequency offsets except for 'M', 'A', 'Q', 'BM',\n 'BA', 'BQ', and 'W' which all have a default of 'right'.\n label : {'right', 'left'}, default None\n Which bin edge label to label bucket with. 
The default is 'left'\n for all frequency offsets except for 'M', 'A', 'Q', 'BM',\n 'BA', 'BQ', and 'W' which all have a default of 'right'.\n convention : {'start', 'end', 's', 'e'}, default 'start'\n For `PeriodIndex` only, controls whether to use the start or\n end of `rule`.\n kind : {'timestamp', 'period'}, optional, default None\n Pass 'timestamp' to convert the resulting index to a\n `DateTimeIndex` or 'period' to convert it to a `PeriodIndex`.\n By default the input representation is retained.\n loffset : timedelta, default None\n Adjust the resampled time labels.\n limit : int, default None\n Maximum size gap when reindexing with `fill_method`.\n\n .. deprecated:: 0.18.0\n base : int, default 0\n For frequencies that evenly subdivide 1 day, the \"origin\" of the\n aggregated intervals. For example, for '5min' frequency, base could\n range from 0 through 4. Defaults to 0.\n on : str, optional\n For a DataFrame, column to use instead of index for resampling.\n Column must be datetime-like.\n\n .. versionadded:: 0.19.0\n\n level : str or int, optional\n For a MultiIndex, level (name or number) to use for\n resampling. `level` must be datetime-like.\n\n .. 
versionadded:: 0.19.0\n\n Returns\n -------\n Resampler object\n\n See Also\n --------\n groupby : Group by mapping, function, label, or list of labels.\n Series.resample : Resample a Series.\n DataFrame.resample: Resample a DataFrame.\n\n Notes\n -----\n See the `user guide\n `_\n for more.\n\n To learn more about the offset strings, please see `this link\n `__.\n\n Examples\n --------\n\n Start by creating a series with 9 one minute timestamps.\n\n >>> index = pd.date_range('1/1/2000', periods=9, freq='T')\n >>> series = pd.Series(range(9), index=index)\n >>> series\n 2000-01-01 00:00:00 0\n 2000-01-01 00:01:00 1\n 2000-01-01 00:02:00 2\n 2000-01-01 00:03:00 3\n 2000-01-01 00:04:00 4\n 2000-01-01 00:05:00 5\n 2000-01-01 00:06:00 6\n 2000-01-01 00:07:00 7\n 2000-01-01 00:08:00 8\n Freq: T, dtype: int64\n\n Downsample the series into 3 minute bins and sum the values\n of the timestamps falling into a bin.\n\n >>> series.resample('3T').sum()\n 2000-01-01 00:00:00 3\n 2000-01-01 00:03:00 12\n 2000-01-01 00:06:00 21\n Freq: 3T, dtype: int64\n\n Downsample the series into 3 minute bins as above, but label each\n bin using the right edge instead of the left. Please note that the\n value in the bucket used as the label is not included in the bucket,\n which it labels. 
For example, in the original series the\n bucket ``2000-01-01 00:03:00`` contains the value 3, but the summed\n value in the resampled bucket with the label ``2000-01-01 00:03:00``\n does not include 3 (if it did, the summed value would be 6, not 3).\n To include this value close the right side of the bin interval as\n illustrated in the example below this one.\n\n >>> series.resample('3T', label='right').sum()\n 2000-01-01 00:03:00 3\n 2000-01-01 00:06:00 12\n 2000-01-01 00:09:00 21\n Freq: 3T, dtype: int64\n\n Downsample the series into 3 minute bins as above, but close the right\n side of the bin interval.\n\n >>> series.resample('3T', label='right', closed='right').sum()\n 2000-01-01 00:00:00 0\n 2000-01-01 00:03:00 6\n 2000-01-01 00:06:00 15\n 2000-01-01 00:09:00 15\n Freq: 3T, dtype: int64\n\n Upsample the series into 30 second bins.\n\n >>> series.resample('30S').asfreq()[0:5] # Select first 5 rows\n 2000-01-01 00:00:00 0.0\n 2000-01-01 00:00:30 NaN\n 2000-01-01 00:01:00 1.0\n 2000-01-01 00:01:30 NaN\n 2000-01-01 00:02:00 2.0\n Freq: 30S, dtype: float64\n\n Upsample the series into 30 second bins and fill the ``NaN``\n values using the ``pad`` method.\n\n >>> series.resample('30S').pad()[0:5]\n 2000-01-01 00:00:00 0\n 2000-01-01 00:00:30 0\n 2000-01-01 00:01:00 1\n 2000-01-01 00:01:30 1\n 2000-01-01 00:02:00 2\n Freq: 30S, dtype: int64\n\n Upsample the series into 30 second bins and fill the\n ``NaN`` values using the ``bfill`` method.\n\n >>> series.resample('30S').bfill()[0:5]\n 2000-01-01 00:00:00 0\n 2000-01-01 00:00:30 1\n 2000-01-01 00:01:00 1\n 2000-01-01 00:01:30 2\n 2000-01-01 00:02:00 2\n Freq: 30S, dtype: int64\n\n Pass a custom function via ``apply``\n\n >>> def custom_resampler(array_like):\n ... 
return np.sum(array_like) + 5\n ...\n >>> series.resample('3T').apply(custom_resampler)\n 2000-01-01 00:00:00 8\n 2000-01-01 00:03:00 17\n 2000-01-01 00:06:00 26\n Freq: 3T, dtype: int64\n\n For a Series with a PeriodIndex, the keyword `convention` can be\n used to control whether to use the start or end of `rule`.\n\n Resample a year by quarter using 'start' `convention`. Values are\n assigned to the first quarter of the period.\n\n >>> s = pd.Series([1, 2], index=pd.period_range('2012-01-01',\n ... freq='A',\n ... periods=2))\n >>> s\n 2012 1\n 2013 2\n Freq: A-DEC, dtype: int64\n >>> s.resample('Q', convention='start').asfreq()\n 2012Q1 1.0\n 2012Q2 NaN\n 2012Q3 NaN\n 2012Q4 NaN\n 2013Q1 2.0\n 2013Q2 NaN\n 2013Q3 NaN\n 2013Q4 NaN\n Freq: Q-DEC, dtype: float64\n\n Resample quarters by month using 'end' `convention`. Values are\n assigned to the last month of the period.\n\n >>> q = pd.Series([1, 2, 3, 4], index=pd.period_range('2018-01-01',\n ... freq='Q',\n ... periods=4))\n >>> q\n 2018Q1 1\n 2018Q2 2\n 2018Q3 3\n 2018Q4 4\n Freq: Q-DEC, dtype: int64\n >>> q.resample('M', convention='end').asfreq()\n 2018-03 1.0\n 2018-04 NaN\n 2018-05 NaN\n 2018-06 2.0\n 2018-07 NaN\n 2018-08 NaN\n 2018-09 3.0\n 2018-10 NaN\n 2018-11 NaN\n 2018-12 4.0\n Freq: M, dtype: float64\n\n For DataFrame objects, the keyword `on` can be used to specify the\n column instead of the index for resampling.\n\n >>> d = dict({'price': [10, 11, 9, 13, 14, 18, 17, 19],\n ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]})\n >>> df = pd.DataFrame(d)\n >>> df['week_starting'] = pd.date_range('01/01/2018',\n ... periods=8,\n ... 
freq='W')\n >>> df\n price volume week_starting\n 0 10 50 2018-01-07\n 1 11 60 2018-01-14\n 2 9 40 2018-01-21\n 3 13 100 2018-01-28\n 4 14 50 2018-02-04\n 5 18 100 2018-02-11\n 6 17 40 2018-02-18\n 7 19 50 2018-02-25\n >>> df.resample('M', on='week_starting').mean()\n price volume\n week_starting\n 2018-01-31 10.75 62.5\n 2018-02-28 17.00 60.0\n\n For a DataFrame with MultiIndex, the keyword `level` can be used to\n specify on which level the resampling needs to take place.\n\n >>> days = pd.date_range('1/1/2000', periods=4, freq='D')\n >>> d2 = dict({'price': [10, 11, 9, 13, 14, 18, 17, 19],\n ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]})\n >>> df2 = pd.DataFrame(d2,\n ... index=pd.MultiIndex.from_product([days,\n ... ['morning',\n ... 'afternoon']]\n ... ))\n >>> df2\n price volume\n 2000-01-01 morning 10 50\n afternoon 11 60\n 2000-01-02 morning 9 40\n afternoon 13 100\n 2000-01-03 morning 14 50\n afternoon 18 100\n 2000-01-04 morning 17 40\n afternoon 19 50\n >>> df2.resample('D', level=0).sum()\n price volume\n 2000-01-01 21 110\n 2000-01-02 22 140\n 2000-01-03 32 150\n 2000-01-04 36 90\n \"\"\"\n\n from pandas.core.resample import (resample,\n _maybe_process_deprecations)\n axis = self._get_axis_number(axis)\n r = resample(self, freq=rule, label=label, closed=closed,\n axis=axis, kind=kind, loffset=loffset,\n convention=convention,\n base=base, key=on, level=level)\n return _maybe_process_deprecations(r,\n how=how,\n fill_method=fill_method,\n limit=limit)", "response": "Resample the time - series data."} {"instruction": "How would you implement a function in Python 3 that\nreturns the first set of values for the time series at a particular offset.", "response": "def first(self, offset):\n \"\"\"\n Convenience method for subsetting initial periods of time series data\n based on a date offset.\n\n Parameters\n ----------\n offset : string, DateOffset, dateutil.relativedelta\n\n Returns\n -------\n subset : same type as caller\n\n Raises\n ------\n 
TypeError\n If the index is not a :class:`DatetimeIndex`\n\n See Also\n --------\n last : Select final periods of time series based on a date offset.\n at_time : Select values at a particular time of the day.\n between_time : Select values between particular times of the day.\n\n Examples\n --------\n >>> i = pd.date_range('2018-04-09', periods=4, freq='2D')\n >>> ts = pd.DataFrame({'A': [1,2,3,4]}, index=i)\n >>> ts\n A\n 2018-04-09 1\n 2018-04-11 2\n 2018-04-13 3\n 2018-04-15 4\n\n Get the rows for the first 3 days:\n\n >>> ts.first('3D')\n A\n 2018-04-09 1\n 2018-04-11 2\n\n Notice the data for 3 first calender days were returned, not the first\n 3 days observed in the dataset, and therefore data for 2018-04-13 was\n not returned.\n \"\"\"\n if not isinstance(self.index, DatetimeIndex):\n raise TypeError(\"'first' only supports a DatetimeIndex index\")\n\n if len(self.index) == 0:\n return self\n\n offset = to_offset(offset)\n end_date = end = self.index[0] + offset\n\n # Tick-like, e.g. 
3 weeks\n if not offset.isAnchored() and hasattr(offset, '_inc'):\n if end_date in self.index:\n end = self.index.searchsorted(end_date, side='left')\n return self.iloc[:end]\n\n return self.loc[:end]"} {"instruction": "Create a Python 3 function to\nreturn the last set of periods of time series entries in the index.", "response": "def last(self, offset):\n \"\"\"\n Convenience method for subsetting final periods of time series data\n based on a date offset.\n\n Parameters\n ----------\n offset : string, DateOffset, dateutil.relativedelta\n\n Returns\n -------\n subset : same type as caller\n\n Raises\n ------\n TypeError\n If the index is not a :class:`DatetimeIndex`\n\n See Also\n --------\n first : Select initial periods of time series based on a date offset.\n at_time : Select values at a particular time of the day.\n between_time : Select values between particular times of the day.\n\n Examples\n --------\n >>> i = pd.date_range('2018-04-09', periods=4, freq='2D')\n >>> ts = pd.DataFrame({'A': [1,2,3,4]}, index=i)\n >>> ts\n A\n 2018-04-09 1\n 2018-04-11 2\n 2018-04-13 3\n 2018-04-15 4\n\n Get the rows for the last 3 days:\n\n >>> ts.last('3D')\n A\n 2018-04-13 3\n 2018-04-15 4\n\n Notice the data for 3 last calender days were returned, not the last\n 3 observed days in the dataset, and therefore data for 2018-04-11 was\n not returned.\n \"\"\"\n if not isinstance(self.index, DatetimeIndex):\n raise TypeError(\"'last' only supports a DatetimeIndex index\")\n\n if len(self.index) == 0:\n return self\n\n offset = to_offset(offset)\n\n start_date = self.index[-1] - offset\n start = self.index.searchsorted(start_date, side='right')\n return self.iloc[start:]"} {"instruction": "Explain what the following Python 3 code does\ndef rank(self, axis=0, method='average', numeric_only=None,\n na_option='keep', ascending=True, pct=False):\n \"\"\"\n Compute numerical data ranks (1 through n) along axis. 
Equal values are\n assigned a rank that is the average of the ranks of those values.\n\n Parameters\n ----------\n axis : {0 or 'index', 1 or 'columns'}, default 0\n index to direct ranking\n method : {'average', 'min', 'max', 'first', 'dense'}\n * average: average rank of group\n * min: lowest rank in group\n * max: highest rank in group\n * first: ranks assigned in order they appear in the array\n * dense: like 'min', but rank always increases by 1 between groups\n numeric_only : boolean, default None\n Include only float, int, boolean data. Valid only for DataFrame or\n Panel objects\n na_option : {'keep', 'top', 'bottom'}\n * keep: leave NA values where they are\n * top: smallest rank if ascending\n * bottom: smallest rank if descending\n ascending : boolean, default True\n False for ranks by high (1) to low (N)\n pct : boolean, default False\n Computes percentage rank of data\n\n Returns\n -------\n ranks : same type as caller\n \"\"\"\n axis = self._get_axis_number(axis)\n\n if self.ndim > 2:\n msg = \"rank does not make sense when ndim > 2\"\n raise NotImplementedError(msg)\n\n if na_option not in {'keep', 'top', 'bottom'}:\n msg = \"na_option must be one of 'keep', 'top', or 'bottom'\"\n raise ValueError(msg)\n\n def ranker(data):\n ranks = algos.rank(data.values, axis=axis, method=method,\n ascending=ascending, na_option=na_option,\n pct=pct)\n ranks = self._constructor(ranks, **data._construct_axes_dict())\n return ranks.__finalize__(self)\n\n # if numeric_only is None, and we can't get anything, we try with\n # numeric_only=True\n if numeric_only is None:\n try:\n return ranker(self)\n except TypeError:\n numeric_only = True\n\n if numeric_only:\n data = self._get_numeric_data()\n else:\n data = self\n\n return ranker(data)", "response": "Compute the rank of the array of items in an axis."} {"instruction": "Explain what the following Python 3 code does\ndef _where(self, cond, other=np.nan, inplace=False, axis=None, level=None,\n errors='raise', 
try_cast=False):\n \"\"\"\n Equivalent to public method `where`, except that `other` is not\n applied as a function even if callable. Used in __setitem__.\n \"\"\"\n inplace = validate_bool_kwarg(inplace, 'inplace')\n\n # align the cond to same shape as myself\n cond = com.apply_if_callable(cond, self)\n if isinstance(cond, NDFrame):\n cond, _ = cond.align(self, join='right', broadcast_axis=1)\n else:\n if not hasattr(cond, 'shape'):\n cond = np.asanyarray(cond)\n if cond.shape != self.shape:\n raise ValueError('Array conditional must be same shape as '\n 'self')\n cond = self._constructor(cond, **self._construct_axes_dict())\n\n # make sure we are boolean\n fill_value = bool(inplace)\n cond = cond.fillna(fill_value)\n\n msg = \"Boolean array expected for the condition, not {dtype}\"\n\n if not isinstance(cond, pd.DataFrame):\n # This is a single-dimensional object.\n if not is_bool_dtype(cond):\n raise ValueError(msg.format(dtype=cond.dtype))\n elif not cond.empty:\n for dt in cond.dtypes:\n if not is_bool_dtype(dt):\n raise ValueError(msg.format(dtype=dt))\n\n cond = -cond if inplace else cond\n\n # try to align with other\n try_quick = True\n if hasattr(other, 'align'):\n\n # align with me\n if other.ndim <= self.ndim:\n\n _, other = self.align(other, join='left', axis=axis,\n level=level, fill_value=np.nan)\n\n # if we are NOT aligned, raise as we cannot where index\n if (axis is None and\n not all(other._get_axis(i).equals(ax)\n for i, ax in enumerate(self.axes))):\n raise InvalidIndexError\n\n # slice me out of the other\n else:\n raise NotImplementedError(\"cannot align with a higher \"\n \"dimensional NDFrame\")\n\n if isinstance(other, np.ndarray):\n\n if other.shape != self.shape:\n\n if self.ndim == 1:\n\n icond = cond.values\n\n # GH 2745 / GH 4192\n # treat like a scalar\n if len(other) == 1:\n other = np.array(other[0])\n\n # GH 3235\n # match True cond to other\n elif len(cond[icond]) == len(other):\n\n # try to not change dtype at first (if 
try_quick)\n if try_quick:\n\n try:\n new_other = com.values_from_object(self)\n new_other = new_other.copy()\n new_other[icond] = other\n other = new_other\n except Exception:\n try_quick = False\n\n # let's create a new (if we failed at the above\n # or not try_quick\n if not try_quick:\n\n dtype, fill_value = maybe_promote(other.dtype)\n new_other = np.empty(len(icond), dtype=dtype)\n new_other.fill(fill_value)\n maybe_upcast_putmask(new_other, icond, other)\n other = new_other\n\n else:\n raise ValueError('Length of replacements must equal '\n 'series length')\n\n else:\n raise ValueError('other must be the same shape as self '\n 'when an ndarray')\n\n # we are the same shape, so create an actual object for alignment\n else:\n other = self._constructor(other, **self._construct_axes_dict())\n\n if axis is None:\n axis = 0\n\n if self.ndim == getattr(other, 'ndim', 0):\n align = True\n else:\n align = (self._get_axis_number(axis) == 1)\n\n block_axis = self._get_block_manager_axis(axis)\n\n if inplace:\n # we may have different type blocks come out of putmask, so\n # reconstruct the block manager\n\n self._check_inplace_setting(other)\n new_data = self._data.putmask(mask=cond, new=other, align=align,\n inplace=True, axis=block_axis,\n transpose=self._AXIS_REVERSED)\n self._update_inplace(new_data)\n\n else:\n new_data = self._data.where(other=other, cond=cond, align=align,\n errors=errors,\n try_cast=try_cast, axis=block_axis,\n transpose=self._AXIS_REVERSED)\n\n return self._constructor(new_data).__finalize__(self)", "response": "Internal method to align a set of items with another."} {"instruction": "How would you explain what the following Python 3 function does\ndef slice_shift(self, periods=1, axis=0):\n if periods == 0:\n return self\n\n if periods > 0:\n vslicer = slice(None, -periods)\n islicer = slice(periods, None)\n else:\n vslicer = slice(-periods, None)\n islicer = slice(None, periods)\n\n new_obj = self._slice(vslicer, axis=axis)\n shifted_axis = 
self._get_axis(axis)[islicer]\n new_obj.set_axis(shifted_axis, axis=axis, inplace=True)\n\n return new_obj.__finalize__(self)", "response": "Shift the internal data by the specified number of periods."} {"instruction": "Can you write a function in Python 3 where it\nshifts the time index using the index s frequency if available.", "response": "def tshift(self, periods=1, freq=None, axis=0):\n \"\"\"\n Shift the time index, using the index's frequency if available.\n\n Parameters\n ----------\n periods : int\n Number of periods to move, can be positive or negative\n freq : DateOffset, timedelta, or time rule string, default None\n Increment to use from the tseries module or time rule (e.g. 'EOM')\n axis : int or basestring\n Corresponds to the axis that contains the Index\n\n Returns\n -------\n shifted : NDFrame\n\n Notes\n -----\n If freq is not specified then tries to use the freq or inferred_freq\n attributes of the index. If neither of those attributes exist, a\n ValueError is thrown\n \"\"\"\n\n index = self._get_axis(axis)\n if freq is None:\n freq = getattr(index, 'freq', None)\n\n if freq is None:\n freq = getattr(index, 'inferred_freq', None)\n\n if freq is None:\n msg = 'Freq was not given and was not set in the index'\n raise ValueError(msg)\n\n if periods == 0:\n return self\n\n if isinstance(freq, str):\n freq = to_offset(freq)\n\n block_axis = self._get_block_manager_axis(axis)\n if isinstance(index, PeriodIndex):\n orig_freq = to_offset(index.freq)\n if freq == orig_freq:\n new_data = self._data.copy()\n new_data.axes[block_axis] = index.shift(periods)\n else:\n msg = ('Given freq %s does not match PeriodIndex freq %s' %\n (freq.rule_code, orig_freq.rule_code))\n raise ValueError(msg)\n else:\n new_data = self._data.copy()\n new_data.axes[block_axis] = index.shift(periods, freq)\n\n return self._constructor(new_data).__finalize__(self)"} {"instruction": "How would you code a function in Python 3 to\ntruncate a Series or DataFrame to a certain 
value.", "response": "def truncate(self, before=None, after=None, axis=None, copy=True):\n \"\"\"\n Truncate a Series or DataFrame before and after some index value.\n\n This is a useful shorthand for boolean indexing based on index\n values above or below certain thresholds.\n\n Parameters\n ----------\n before : date, string, int\n Truncate all rows before this index value.\n after : date, string, int\n Truncate all rows after this index value.\n axis : {0 or 'index', 1 or 'columns'}, optional\n Axis to truncate. Truncates the index (rows) by default.\n copy : boolean, default is True,\n Return a copy of the truncated section.\n\n Returns\n -------\n type of caller\n The truncated Series or DataFrame.\n\n See Also\n --------\n DataFrame.loc : Select a subset of a DataFrame by label.\n DataFrame.iloc : Select a subset of a DataFrame by position.\n\n Notes\n -----\n If the index being truncated contains only datetime values,\n `before` and `after` may be specified as strings instead of\n Timestamps.\n\n Examples\n --------\n >>> df = pd.DataFrame({'A': ['a', 'b', 'c', 'd', 'e'],\n ... 'B': ['f', 'g', 'h', 'i', 'j'],\n ... 'C': ['k', 'l', 'm', 'n', 'o']},\n ... 
index=[1, 2, 3, 4, 5])\n >>> df\n A B C\n 1 a f k\n 2 b g l\n 3 c h m\n 4 d i n\n 5 e j o\n\n >>> df.truncate(before=2, after=4)\n A B C\n 2 b g l\n 3 c h m\n 4 d i n\n\n The columns of a DataFrame can be truncated.\n\n >>> df.truncate(before=\"A\", after=\"B\", axis=\"columns\")\n A B\n 1 a f\n 2 b g\n 3 c h\n 4 d i\n 5 e j\n\n For Series, only rows can be truncated.\n\n >>> df['A'].truncate(before=2, after=4)\n 2 b\n 3 c\n 4 d\n Name: A, dtype: object\n\n The index values in ``truncate`` can be datetimes or string\n dates.\n\n >>> dates = pd.date_range('2016-01-01', '2016-02-01', freq='s')\n >>> df = pd.DataFrame(index=dates, data={'A': 1})\n >>> df.tail()\n A\n 2016-01-31 23:59:56 1\n 2016-01-31 23:59:57 1\n 2016-01-31 23:59:58 1\n 2016-01-31 23:59:59 1\n 2016-02-01 00:00:00 1\n\n >>> df.truncate(before=pd.Timestamp('2016-01-05'),\n ... after=pd.Timestamp('2016-01-10')).tail()\n A\n 2016-01-09 23:59:56 1\n 2016-01-09 23:59:57 1\n 2016-01-09 23:59:58 1\n 2016-01-09 23:59:59 1\n 2016-01-10 00:00:00 1\n\n Because the index is a DatetimeIndex containing only dates, we can\n specify `before` and `after` as strings. They will be coerced to\n Timestamps before truncation.\n\n >>> df.truncate('2016-01-05', '2016-01-10').tail()\n A\n 2016-01-09 23:59:56 1\n 2016-01-09 23:59:57 1\n 2016-01-09 23:59:58 1\n 2016-01-09 23:59:59 1\n 2016-01-10 00:00:00 1\n\n Note that ``truncate`` assumes a 0 value for any unspecified time\n component (midnight). 
This differs from partial string slicing, which\n returns any partially matching dates.\n\n >>> df.loc['2016-01-05':'2016-01-10', :].tail()\n A\n 2016-01-10 23:59:55 1\n 2016-01-10 23:59:56 1\n 2016-01-10 23:59:57 1\n 2016-01-10 23:59:58 1\n 2016-01-10 23:59:59 1\n \"\"\"\n\n if axis is None:\n axis = self._stat_axis_number\n axis = self._get_axis_number(axis)\n ax = self._get_axis(axis)\n\n # GH 17935\n # Check that index is sorted\n if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing:\n raise ValueError(\"truncate requires a sorted index\")\n\n # if we have a date index, convert to dates, otherwise\n # treat like a slice\n if ax.is_all_dates:\n from pandas.core.tools.datetimes import to_datetime\n before = to_datetime(before)\n after = to_datetime(after)\n\n if before is not None and after is not None:\n if before > after:\n raise ValueError('Truncate: %s must be after %s' %\n (after, before))\n\n slicer = [slice(None, None)] * self._AXIS_LEN\n slicer[axis] = slice(before, after)\n result = self.loc[tuple(slicer)]\n\n if isinstance(ax, MultiIndex):\n setattr(result, self._get_axis_name(axis),\n ax.truncate(before, after))\n\n if copy:\n result = result.copy()\n\n return result"} {"instruction": "Implement a Python 3 function for\nconverting the current object to a specific time zone.", "response": "def tz_convert(self, tz, axis=0, level=None, copy=True):\n \"\"\"\n Convert tz-aware axis to target time zone.\n\n Parameters\n ----------\n tz : string or pytz.timezone object\n axis : the axis to convert\n level : int, str, default None\n If axis ia a MultiIndex, convert a specific level. 
Otherwise\n must be None\n copy : boolean, default True\n Also make a copy of the underlying data\n\n Returns\n -------\n\n Raises\n ------\n TypeError\n If the axis is tz-naive.\n \"\"\"\n axis = self._get_axis_number(axis)\n ax = self._get_axis(axis)\n\n def _tz_convert(ax, tz):\n if not hasattr(ax, 'tz_convert'):\n if len(ax) > 0:\n ax_name = self._get_axis_name(axis)\n raise TypeError('%s is not a valid DatetimeIndex or '\n 'PeriodIndex' % ax_name)\n else:\n ax = DatetimeIndex([], tz=tz)\n else:\n ax = ax.tz_convert(tz)\n return ax\n\n # if a level is given it must be a MultiIndex level or\n # equivalent to the axis name\n if isinstance(ax, MultiIndex):\n level = ax._get_level_number(level)\n new_level = _tz_convert(ax.levels[level], tz)\n ax = ax.set_levels(new_level, level=level)\n else:\n if level not in (None, 0, ax.name):\n raise ValueError(\"The level {0} is not valid\".format(level))\n ax = _tz_convert(ax, tz)\n\n result = self._constructor(self._data, copy=copy)\n result = result.set_axis(ax, axis=axis, inplace=False)\n return result.__finalize__(self)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef tz_localize(self, tz, axis=0, level=None, copy=True,\n ambiguous='raise', nonexistent='raise'):\n \"\"\"\n Localize tz-naive index of a Series or DataFrame to target time zone.\n\n This operation localizes the Index. To localize the values in a\n timezone-naive Series, use :meth:`Series.dt.tz_localize`.\n\n Parameters\n ----------\n tz : string or pytz.timezone object\n axis : the axis to localize\n level : int, str, default None\n If axis ia a MultiIndex, localize a specific level. 
Otherwise\n must be None\n copy : boolean, default True\n Also make a copy of the underlying data\n ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise'\n When clocks moved backward due to DST, ambiguous times may arise.\n For example in Central European Time (UTC+01), when going from\n 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at\n 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the\n `ambiguous` parameter dictates how ambiguous times should be\n handled.\n\n - 'infer' will attempt to infer fall dst-transition hours based on\n order\n - bool-ndarray where True signifies a DST time, False designates\n a non-DST time (note that this flag is only applicable for\n ambiguous times)\n - 'NaT' will return NaT where there are ambiguous times\n - 'raise' will raise an AmbiguousTimeError if there are ambiguous\n times\n nonexistent : str, default 'raise'\n A nonexistent time does not exist in a particular timezone\n where clocks moved forward due to DST. Valid values are:\n\n - 'shift_forward' will shift the nonexistent time forward to the\n closest existing time\n - 'shift_backward' will shift the nonexistent time backward to the\n closest existing time\n - 'NaT' will return NaT where there are nonexistent times\n - timedelta objects will shift nonexistent times by the timedelta\n - 'raise' will raise an NonExistentTimeError if there are\n nonexistent times\n\n .. versionadded:: 0.24.0\n\n Returns\n -------\n Series or DataFrame\n Same type as the input.\n\n Raises\n ------\n TypeError\n If the TimeSeries is tz-aware and tz is not None.\n\n Examples\n --------\n\n Localize local times:\n\n >>> s = pd.Series([1],\n ... index=pd.DatetimeIndex(['2018-09-15 01:30:00']))\n >>> s.tz_localize('CET')\n 2018-09-15 01:30:00+02:00 1\n dtype: int64\n\n Be careful with DST changes. When there is sequential data, pandas\n can infer the DST time:\n\n >>> s = pd.Series(range(7), index=pd.DatetimeIndex([\n ... '2018-10-28 01:30:00',\n ... 
'2018-10-28 02:00:00',\n ... '2018-10-28 02:30:00',\n ... '2018-10-28 02:00:00',\n ... '2018-10-28 02:30:00',\n ... '2018-10-28 03:00:00',\n ... '2018-10-28 03:30:00']))\n >>> s.tz_localize('CET', ambiguous='infer')\n 2018-10-28 01:30:00+02:00 0\n 2018-10-28 02:00:00+02:00 1\n 2018-10-28 02:30:00+02:00 2\n 2018-10-28 02:00:00+01:00 3\n 2018-10-28 02:30:00+01:00 4\n 2018-10-28 03:00:00+01:00 5\n 2018-10-28 03:30:00+01:00 6\n dtype: int64\n\n In some cases, inferring the DST is impossible. In such cases, you can\n pass an ndarray to the ambiguous parameter to set the DST explicitly\n\n >>> s = pd.Series(range(3), index=pd.DatetimeIndex([\n ... '2018-10-28 01:20:00',\n ... '2018-10-28 02:36:00',\n ... '2018-10-28 03:46:00']))\n >>> s.tz_localize('CET', ambiguous=np.array([True, True, False]))\n 2018-10-28 01:20:00+02:00 0\n 2018-10-28 02:36:00+02:00 1\n 2018-10-28 03:46:00+01:00 2\n dtype: int64\n\n If the DST transition causes nonexistent times, you can shift these\n dates forward or backwards with a timedelta object or `'shift_forward'`\n or `'shift_backwards'`.\n >>> s = pd.Series(range(2), index=pd.DatetimeIndex([\n ... '2015-03-29 02:30:00',\n ... 
'2015-03-29 03:30:00']))\n >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_forward')\n 2015-03-29 03:00:00+02:00 0\n 2015-03-29 03:30:00+02:00 1\n dtype: int64\n >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_backward')\n 2015-03-29 01:59:59.999999999+01:00 0\n 2015-03-29 03:30:00+02:00 1\n dtype: int64\n >>> s.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1H'))\n 2015-03-29 03:30:00+02:00 0\n 2015-03-29 03:30:00+02:00 1\n dtype: int64\n \"\"\"\n nonexistent_options = ('raise', 'NaT', 'shift_forward',\n 'shift_backward')\n if nonexistent not in nonexistent_options and not isinstance(\n nonexistent, timedelta):\n raise ValueError(\"The nonexistent argument must be one of 'raise',\"\n \" 'NaT', 'shift_forward', 'shift_backward' or\"\n \" a timedelta object\")\n\n axis = self._get_axis_number(axis)\n ax = self._get_axis(axis)\n\n def _tz_localize(ax, tz, ambiguous, nonexistent):\n if not hasattr(ax, 'tz_localize'):\n if len(ax) > 0:\n ax_name = self._get_axis_name(axis)\n raise TypeError('%s is not a valid DatetimeIndex or '\n 'PeriodIndex' % ax_name)\n else:\n ax = DatetimeIndex([], tz=tz)\n else:\n ax = ax.tz_localize(\n tz, ambiguous=ambiguous, nonexistent=nonexistent\n )\n return ax\n\n # if a level is given it must be a MultiIndex level or\n # equivalent to the axis name\n if isinstance(ax, MultiIndex):\n level = ax._get_level_number(level)\n new_level = _tz_localize(\n ax.levels[level], tz, ambiguous, nonexistent\n )\n ax = ax.set_levels(new_level, level=level)\n else:\n if level not in (None, 0, ax.name):\n raise ValueError(\"The level {0} is not valid\".format(level))\n ax = _tz_localize(ax, tz, ambiguous, nonexistent)\n\n result = self._constructor(self._data, copy=copy)\n result = result.set_axis(ax, axis=axis, inplace=False)\n return result.__finalize__(self)", "response": "Localize the Series or DataFrame to target time zone."} {"instruction": "Write a Python 3 script for\ngenerating descriptive statistics that summarize the central 
tendency dispersion shape and shape of a dataset.", "response": "def describe(self, percentiles=None, include=None, exclude=None):\n \"\"\"\n Generate descriptive statistics that summarize the central tendency,\n dispersion and shape of a dataset's distribution, excluding\n ``NaN`` values.\n\n Analyzes both numeric and object series, as well\n as ``DataFrame`` column sets of mixed data types. The output\n will vary depending on what is provided. Refer to the notes\n below for more detail.\n\n Parameters\n ----------\n percentiles : list-like of numbers, optional\n The percentiles to include in the output. All should\n fall between 0 and 1. The default is\n ``[.25, .5, .75]``, which returns the 25th, 50th, and\n 75th percentiles.\n include : 'all', list-like of dtypes or None (default), optional\n A white list of data types to include in the result. Ignored\n for ``Series``. Here are the options:\n\n - 'all' : All columns of the input will be included in the output.\n - A list-like of dtypes : Limits the results to the\n provided data types.\n To limit the result to numeric types submit\n ``numpy.number``. To limit it instead to object columns submit\n the ``numpy.object`` data type. Strings\n can also be used in the style of\n ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To\n select pandas categorical columns, use ``'category'``\n - None (default) : The result will include all numeric columns.\n exclude : list-like of dtypes or None (default), optional,\n A black list of data types to omit from the result. Ignored\n for ``Series``. Here are the options:\n\n - A list-like of dtypes : Excludes the provided data types\n from the result. To exclude numeric types submit\n ``numpy.number``. To exclude object columns submit the data\n type ``numpy.object``. Strings can also be used in the style of\n ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). 
To\n exclude pandas categorical columns, use ``'category'``\n - None (default) : The result will exclude nothing.\n\n Returns\n -------\n Series or DataFrame\n Summary statistics of the Series or Dataframe provided.\n\n See Also\n --------\n DataFrame.count: Count number of non-NA/null observations.\n DataFrame.max: Maximum of the values in the object.\n DataFrame.min: Minimum of the values in the object.\n DataFrame.mean: Mean of the values.\n DataFrame.std: Standard deviation of the obersvations.\n DataFrame.select_dtypes: Subset of a DataFrame including/excluding\n columns based on their dtype.\n\n Notes\n -----\n For numeric data, the result's index will include ``count``,\n ``mean``, ``std``, ``min``, ``max`` as well as lower, ``50`` and\n upper percentiles. By default the lower percentile is ``25`` and the\n upper percentile is ``75``. The ``50`` percentile is the\n same as the median.\n\n For object data (e.g. strings or timestamps), the result's index\n will include ``count``, ``unique``, ``top``, and ``freq``. The ``top``\n is the most common value. The ``freq`` is the most common value's\n frequency. Timestamps also include the ``first`` and ``last`` items.\n\n If multiple object values have the highest count, then the\n ``count`` and ``top`` results will be arbitrarily chosen from\n among those with the highest count.\n\n For mixed data types provided via a ``DataFrame``, the default is to\n return only an analysis of numeric columns. If the dataframe consists\n only of object and categorical data without any numeric columns, the\n default is to return an analysis of both the object and categorical\n columns. 
If ``include='all'`` is provided as an option, the result\n will include a union of attributes of each type.\n\n The `include` and `exclude` parameters can be used to limit\n which columns in a ``DataFrame`` are analyzed for the output.\n The parameters are ignored when analyzing a ``Series``.\n\n Examples\n --------\n Describing a numeric ``Series``.\n\n >>> s = pd.Series([1, 2, 3])\n >>> s.describe()\n count 3.0\n mean 2.0\n std 1.0\n min 1.0\n 25% 1.5\n 50% 2.0\n 75% 2.5\n max 3.0\n dtype: float64\n\n Describing a categorical ``Series``.\n\n >>> s = pd.Series(['a', 'a', 'b', 'c'])\n >>> s.describe()\n count 4\n unique 3\n top a\n freq 2\n dtype: object\n\n Describing a timestamp ``Series``.\n\n >>> s = pd.Series([\n ... np.datetime64(\"2000-01-01\"),\n ... np.datetime64(\"2010-01-01\"),\n ... np.datetime64(\"2010-01-01\")\n ... ])\n >>> s.describe()\n count 3\n unique 2\n top 2010-01-01 00:00:00\n freq 2\n first 2000-01-01 00:00:00\n last 2010-01-01 00:00:00\n dtype: object\n\n Describing a ``DataFrame``. By default only numeric fields\n are returned.\n\n >>> df = pd.DataFrame({'categorical': pd.Categorical(['d','e','f']),\n ... 'numeric': [1, 2, 3],\n ... 'object': ['a', 'b', 'c']\n ... 
})\n >>> df.describe()\n numeric\n count 3.0\n mean 2.0\n std 1.0\n min 1.0\n 25% 1.5\n 50% 2.0\n 75% 2.5\n max 3.0\n\n Describing all columns of a ``DataFrame`` regardless of data type.\n\n >>> df.describe(include='all')\n categorical numeric object\n count 3 3.0 3\n unique 3 NaN 3\n top f NaN c\n freq 1 NaN 1\n mean NaN 2.0 NaN\n std NaN 1.0 NaN\n min NaN 1.0 NaN\n 25% NaN 1.5 NaN\n 50% NaN 2.0 NaN\n 75% NaN 2.5 NaN\n max NaN 3.0 NaN\n\n Describing a column from a ``DataFrame`` by accessing it as\n an attribute.\n\n >>> df.numeric.describe()\n count 3.0\n mean 2.0\n std 1.0\n min 1.0\n 25% 1.5\n 50% 2.0\n 75% 2.5\n max 3.0\n Name: numeric, dtype: float64\n\n Including only numeric columns in a ``DataFrame`` description.\n\n >>> df.describe(include=[np.number])\n numeric\n count 3.0\n mean 2.0\n std 1.0\n min 1.0\n 25% 1.5\n 50% 2.0\n 75% 2.5\n max 3.0\n\n Including only string columns in a ``DataFrame`` description.\n\n >>> df.describe(include=[np.object])\n object\n count 3\n unique 3\n top c\n freq 1\n\n Including only categorical columns from a ``DataFrame`` description.\n\n >>> df.describe(include=['category'])\n categorical\n count 3\n unique 3\n top f\n freq 1\n\n Excluding numeric columns from a ``DataFrame`` description.\n\n >>> df.describe(exclude=[np.number])\n categorical object\n count 3 3\n unique 3 3\n top f c\n freq 1 1\n\n Excluding object columns from a ``DataFrame`` description.\n\n >>> df.describe(exclude=[np.object])\n categorical numeric\n count 3 3.0\n unique 3 NaN\n top f NaN\n freq 1 NaN\n mean NaN 2.0\n std NaN 1.0\n min NaN 1.0\n 25% NaN 1.5\n 50% NaN 2.0\n 75% NaN 2.5\n max NaN 3.0\n \"\"\"\n if self.ndim >= 3:\n msg = \"describe is not implemented on Panel objects.\"\n raise NotImplementedError(msg)\n elif self.ndim == 2 and self.columns.size == 0:\n raise ValueError(\"Cannot describe a DataFrame without columns\")\n\n if percentiles is not None:\n # explicit conversion of `percentiles` to list\n percentiles = list(percentiles)\n\n # 
get them all to be in [0, 1]\n self._check_percentile(percentiles)\n\n # median should always be included\n if 0.5 not in percentiles:\n percentiles.append(0.5)\n percentiles = np.asarray(percentiles)\n else:\n percentiles = np.array([0.25, 0.5, 0.75])\n\n # sort and check for duplicates\n unique_pcts = np.unique(percentiles)\n if len(unique_pcts) < len(percentiles):\n raise ValueError(\"percentiles cannot contain duplicates\")\n percentiles = unique_pcts\n\n formatted_percentiles = format_percentiles(percentiles)\n\n def describe_numeric_1d(series):\n stat_index = (['count', 'mean', 'std', 'min'] +\n formatted_percentiles + ['max'])\n d = ([series.count(), series.mean(), series.std(), series.min()] +\n series.quantile(percentiles).tolist() + [series.max()])\n return pd.Series(d, index=stat_index, name=series.name)\n\n def describe_categorical_1d(data):\n names = ['count', 'unique']\n objcounts = data.value_counts()\n count_unique = len(objcounts[objcounts != 0])\n result = [data.count(), count_unique]\n if result[1] > 0:\n top, freq = objcounts.index[0], objcounts.iloc[0]\n\n if is_datetime64_any_dtype(data):\n tz = data.dt.tz\n asint = data.dropna().values.view('i8')\n top = Timestamp(top)\n if top.tzinfo is not None and tz is not None:\n # Don't tz_localize(None) if key is already tz-aware\n top = top.tz_convert(tz)\n else:\n top = top.tz_localize(tz)\n names += ['top', 'freq', 'first', 'last']\n result += [top, freq,\n Timestamp(asint.min(), tz=tz),\n Timestamp(asint.max(), tz=tz)]\n else:\n names += ['top', 'freq']\n result += [top, freq]\n\n return pd.Series(result, index=names, name=data.name)\n\n def describe_1d(data):\n if is_bool_dtype(data):\n return describe_categorical_1d(data)\n elif is_numeric_dtype(data):\n return describe_numeric_1d(data)\n elif is_timedelta64_dtype(data):\n return describe_numeric_1d(data)\n else:\n return describe_categorical_1d(data)\n\n if self.ndim == 1:\n return describe_1d(self)\n elif (include is None) and (exclude is 
None):\n # when some numerics are found, keep only numerics\n data = self.select_dtypes(include=[np.number])\n if len(data.columns) == 0:\n data = self\n elif include == 'all':\n if exclude is not None:\n msg = \"exclude must be None when include is 'all'\"\n raise ValueError(msg)\n data = self\n else:\n data = self.select_dtypes(include=include, exclude=exclude)\n\n ldesc = [describe_1d(s) for _, s in data.iteritems()]\n # set a convenient order for rows\n names = []\n ldesc_indexes = sorted((x.index for x in ldesc), key=len)\n for idxnames in ldesc_indexes:\n for name in idxnames:\n if name not in names:\n names.append(name)\n\n d = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)\n d.columns = data.columns.copy()\n return d"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _check_percentile(self, q):\n\n msg = (\"percentiles should all be in the interval [0, 1]. \"\n \"Try {0} instead.\")\n q = np.asarray(q)\n if q.ndim == 0:\n if not 0 <= q <= 1:\n raise ValueError(msg.format(q / 100.0))\n else:\n if not all(0 <= qs <= 1 for qs in q):\n raise ValueError(msg.format(q / 100.0))\n return q", "response": "Validate the percentile of the log entry."} {"instruction": "Make a summary of the following Python 3 code\ndef _add_numeric_operations(cls):\n\n axis_descr, name, name2 = _doc_parms(cls)\n\n cls.any = _make_logical_function(\n cls, 'any', name, name2, axis_descr, _any_desc, nanops.nanany,\n _any_see_also, _any_examples, empty_value=False)\n cls.all = _make_logical_function(\n cls, 'all', name, name2, axis_descr, _all_desc, nanops.nanall,\n _all_see_also, _all_examples, empty_value=True)\n\n @Substitution(desc=\"Return the mean absolute deviation of the values \"\n \"for the requested axis.\",\n name1=name, name2=name2, axis_descr=axis_descr,\n min_count='', see_also='', examples='')\n @Appender(_num_doc)\n def mad(self, axis=None, skipna=None, level=None):\n if skipna is None:\n skipna = True\n if axis is None:\n 
axis = self._stat_axis_number\n if level is not None:\n return self._agg_by_level('mad', axis=axis, level=level,\n skipna=skipna)\n\n data = self._get_numeric_data()\n if axis == 0:\n demeaned = data - data.mean(axis=0)\n else:\n demeaned = data.sub(data.mean(axis=1), axis=0)\n return np.abs(demeaned).mean(axis=axis, skipna=skipna)\n\n cls.mad = mad\n\n cls.sem = _make_stat_function_ddof(\n cls, 'sem', name, name2, axis_descr,\n \"Return unbiased standard error of the mean over requested \"\n \"axis.\\n\\nNormalized by N-1 by default. This can be changed \"\n \"using the ddof argument\",\n nanops.nansem)\n cls.var = _make_stat_function_ddof(\n cls, 'var', name, name2, axis_descr,\n \"Return unbiased variance over requested axis.\\n\\nNormalized by \"\n \"N-1 by default. This can be changed using the ddof argument\",\n nanops.nanvar)\n cls.std = _make_stat_function_ddof(\n cls, 'std', name, name2, axis_descr,\n \"Return sample standard deviation over requested axis.\"\n \"\\n\\nNormalized by N-1 by default. 
This can be changed using the \"\n \"ddof argument\",\n nanops.nanstd)\n\n @Substitution(desc=\"Return the compound percentage of the values for \"\n \"the requested axis.\", name1=name, name2=name2,\n axis_descr=axis_descr,\n min_count='', see_also='', examples='')\n @Appender(_num_doc)\n def compound(self, axis=None, skipna=None, level=None):\n if skipna is None:\n skipna = True\n return (1 + self).prod(axis=axis, skipna=skipna, level=level) - 1\n\n cls.compound = compound\n\n cls.cummin = _make_cum_function(\n cls, 'cummin', name, name2, axis_descr, \"minimum\",\n lambda y, axis: np.minimum.accumulate(y, axis), \"min\",\n np.inf, np.nan, _cummin_examples)\n cls.cumsum = _make_cum_function(\n cls, 'cumsum', name, name2, axis_descr, \"sum\",\n lambda y, axis: y.cumsum(axis), \"sum\", 0.,\n np.nan, _cumsum_examples)\n cls.cumprod = _make_cum_function(\n cls, 'cumprod', name, name2, axis_descr, \"product\",\n lambda y, axis: y.cumprod(axis), \"prod\", 1.,\n np.nan, _cumprod_examples)\n cls.cummax = _make_cum_function(\n cls, 'cummax', name, name2, axis_descr, \"maximum\",\n lambda y, axis: np.maximum.accumulate(y, axis), \"max\",\n -np.inf, np.nan, _cummax_examples)\n\n cls.sum = _make_min_count_stat_function(\n cls, 'sum', name, name2, axis_descr,\n \"\"\"Return the sum of the values for the requested axis.\\n\n This is equivalent to the method ``numpy.sum``.\"\"\",\n nanops.nansum, _stat_func_see_also, _sum_examples)\n cls.mean = _make_stat_function(\n cls, 'mean', name, name2, axis_descr,\n 'Return the mean of the values for the requested axis.',\n nanops.nanmean)\n cls.skew = _make_stat_function(\n cls, 'skew', name, name2, axis_descr,\n 'Return unbiased skew over requested axis\\nNormalized by N-1.',\n nanops.nanskew)\n cls.kurt = _make_stat_function(\n cls, 'kurt', name, name2, axis_descr,\n \"Return unbiased kurtosis over requested axis using Fisher's \"\n \"definition of\\nkurtosis (kurtosis of normal == 0.0). 
Normalized \"\n \"by N-1.\",\n nanops.nankurt)\n cls.kurtosis = cls.kurt\n cls.prod = _make_min_count_stat_function(\n cls, 'prod', name, name2, axis_descr,\n 'Return the product of the values for the requested axis.',\n nanops.nanprod, examples=_prod_examples)\n cls.product = cls.prod\n cls.median = _make_stat_function(\n cls, 'median', name, name2, axis_descr,\n 'Return the median of the values for the requested axis.',\n nanops.nanmedian)\n cls.max = _make_stat_function(\n cls, 'max', name, name2, axis_descr,\n \"\"\"Return the maximum of the values for the requested axis.\\n\n If you want the *index* of the maximum, use ``idxmax``. This is\n the equivalent of the ``numpy.ndarray`` method ``argmax``.\"\"\",\n nanops.nanmax, _stat_func_see_also, _max_examples)\n cls.min = _make_stat_function(\n cls, 'min', name, name2, axis_descr,\n \"\"\"Return the minimum of the values for the requested axis.\\n\n If you want the *index* of the minimum, use ``idxmin``. This is\n the equivalent of the ``numpy.ndarray`` method ``argmin``.\"\"\",\n nanops.nanmin, _stat_func_see_also, _min_examples)", "response": "Add the numeric operations to the class."} {"instruction": "Can you write a function in Python 3 where it\nadds the series only operations to the class.", "response": "def _add_series_only_operations(cls):\n \"\"\"\n Add the series only operations to the cls; evaluate the doc\n strings again.\n \"\"\"\n\n axis_descr, name, name2 = _doc_parms(cls)\n\n def nanptp(values, axis=0, skipna=True):\n nmax = nanops.nanmax(values, axis, skipna)\n nmin = nanops.nanmin(values, axis, skipna)\n warnings.warn(\"Method .ptp is deprecated and will be removed \"\n \"in a future version. Use numpy.ptp instead.\",\n FutureWarning, stacklevel=4)\n return nmax - nmin\n\n cls.ptp = _make_stat_function(\n cls, 'ptp', name, name2, axis_descr,\n \"\"\"Return the difference between the maximum value and the\n minimum value in the object. 
This is the equivalent of the\n ``numpy.ndarray`` method ``ptp``.\\n\\n.. deprecated:: 0.24.0\n Use numpy.ptp instead\"\"\",\n nanptp)"} {"instruction": "Make a summary of the following Python 3 code\ndef _add_series_or_dataframe_operations(cls):\n\n from pandas.core import window as rwindow\n\n @Appender(rwindow.rolling.__doc__)\n def rolling(self, window, min_periods=None, center=False,\n win_type=None, on=None, axis=0, closed=None):\n axis = self._get_axis_number(axis)\n return rwindow.rolling(self, window=window,\n min_periods=min_periods,\n center=center, win_type=win_type,\n on=on, axis=axis, closed=closed)\n\n cls.rolling = rolling\n\n @Appender(rwindow.expanding.__doc__)\n def expanding(self, min_periods=1, center=False, axis=0):\n axis = self._get_axis_number(axis)\n return rwindow.expanding(self, min_periods=min_periods,\n center=center, axis=axis)\n\n cls.expanding = expanding\n\n @Appender(rwindow.ewm.__doc__)\n def ewm(self, com=None, span=None, halflife=None, alpha=None,\n min_periods=0, adjust=True, ignore_na=False,\n axis=0):\n axis = self._get_axis_number(axis)\n return rwindow.ewm(self, com=com, span=span, halflife=halflife,\n alpha=alpha, min_periods=min_periods,\n adjust=adjust, ignore_na=ignore_na, axis=axis)\n\n cls.ewm = ewm", "response": "Add the series or dataframe only operations to the cls ; evaluate\n the doc strings again."} {"instruction": "Can you create a Python 3 function that\nreturns the index of the first valid value in the log entry.", "response": "def _find_valid_index(self, how):\n \"\"\"\n Retrieves the index of the first valid value.\n\n Parameters\n ----------\n how : {'first', 'last'}\n Use this parameter to change between the first or last valid index.\n\n Returns\n -------\n idx_first_valid : type of index\n \"\"\"\n assert how in ['first', 'last']\n\n if len(self) == 0: # early stop\n return None\n is_valid = ~self.isna()\n\n if self.ndim == 2:\n is_valid = is_valid.any(1) # reduce axis 1\n\n if how == 'first':\n idxpos 
= is_valid.values[::].argmax()\n\n if how == 'last':\n idxpos = len(self) - 1 - is_valid.values[::-1].argmax()\n\n chk_notna = is_valid.iat[idxpos]\n idx = self.index[idxpos]\n\n if not chk_notna:\n return None\n return idx"} {"instruction": "Create a Python 3 function to\nreset cached properties. If ``key`` is passed, only clears that key.", "response": "def _reset_cache(self, key=None):\n \"\"\"\n Reset cached properties. If ``key`` is passed, only clears that key.\n \"\"\"\n if getattr(self, '_cache', None) is None:\n return\n if key is None:\n self._cache.clear()\n else:\n self._cache.pop(key, None)"} {"instruction": "Create a Python 3 function to\ntry to operate on a string function or attribute on ourselves.", "response": "def _try_aggregate_string_function(self, arg, *args, **kwargs):\n \"\"\"\n if arg is a string, then try to operate on it:\n - try to find a function (or attribute) on ourselves\n - try to find a numpy function\n - raise\n\n \"\"\"\n assert isinstance(arg, str)\n\n f = getattr(self, arg, None)\n if f is not None:\n if callable(f):\n return f(*args, **kwargs)\n\n # people may try to aggregate on a non-callable attribute\n # but don't let them think they can pass args to it\n assert len(args) == 0\n assert len([kwarg for kwarg in kwargs\n if kwarg not in ['axis', '_level']]) == 0\n return f\n\n f = getattr(np, arg, None)\n if f is not None:\n return f(self, *args, **kwargs)\n\n raise ValueError(\"{arg} is an unknown string function\".format(arg=arg))"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _aggregate(self, arg, *args, **kwargs):\n is_aggregator = lambda x: isinstance(x, (list, tuple, dict))\n is_nested_renamer = False\n\n _axis = kwargs.pop('_axis', None)\n if _axis is None:\n _axis = getattr(self, 'axis', 0)\n _level = kwargs.pop('_level', None)\n\n if isinstance(arg, str):\n return self._try_aggregate_string_function(arg, *args,\n **kwargs), None\n\n if isinstance(arg, dict):\n\n # aggregate based 
on the passed dict\n if _axis != 0: # pragma: no cover\n raise ValueError('Can only pass dict with axis=0')\n\n obj = self._selected_obj\n\n def nested_renaming_depr(level=4):\n # deprecation of nested renaming\n # GH 15931\n warnings.warn(\n (\"using a dict with renaming \"\n \"is deprecated and will be removed in a future \"\n \"version\"),\n FutureWarning, stacklevel=level)\n\n # if we have a dict of any non-scalars\n # eg. {'A' : ['mean']}, normalize all to\n # be list-likes\n if any(is_aggregator(x) for x in arg.values()):\n new_arg = OrderedDict()\n for k, v in arg.items():\n if not isinstance(v, (tuple, list, dict)):\n new_arg[k] = [v]\n else:\n new_arg[k] = v\n\n # the keys must be in the columns\n # for ndim=2, or renamers for ndim=1\n\n # ok for now, but deprecated\n # {'A': { 'ra': 'mean' }}\n # {'A': { 'ra': ['mean'] }}\n # {'ra': ['mean']}\n\n # not ok\n # {'ra' : { 'A' : 'mean' }}\n if isinstance(v, dict):\n is_nested_renamer = True\n\n if k not in obj.columns:\n msg = ('cannot perform renaming for {key} with a '\n 'nested dictionary').format(key=k)\n raise SpecificationError(msg)\n nested_renaming_depr(4 + (_level or 0))\n\n elif isinstance(obj, ABCSeries):\n nested_renaming_depr()\n elif (isinstance(obj, ABCDataFrame) and\n k not in obj.columns):\n raise KeyError(\n \"Column '{col}' does not exist!\".format(col=k))\n\n arg = new_arg\n\n else:\n # deprecation of renaming keys\n # GH 15931\n keys = list(arg.keys())\n if (isinstance(obj, ABCDataFrame) and\n len(obj.columns.intersection(keys)) != len(keys)):\n nested_renaming_depr()\n\n from pandas.core.reshape.concat import concat\n\n def _agg_1dim(name, how, subset=None):\n \"\"\"\n aggregate a 1-dim with how\n \"\"\"\n colg = self._gotitem(name, ndim=1, subset=subset)\n if colg.ndim != 1:\n raise SpecificationError(\"nested dictionary is ambiguous \"\n \"in aggregation\")\n return colg.aggregate(how, _level=(_level or 0) + 1)\n\n def _agg_2dim(name, how):\n \"\"\"\n aggregate a 2-dim with how\n 
\"\"\"\n colg = self._gotitem(self._selection, ndim=2,\n subset=obj)\n return colg.aggregate(how, _level=None)\n\n def _agg(arg, func):\n \"\"\"\n run the aggregations over the arg with func\n return an OrderedDict\n \"\"\"\n result = OrderedDict()\n for fname, agg_how in arg.items():\n result[fname] = func(fname, agg_how)\n return result\n\n # set the final keys\n keys = list(arg.keys())\n result = OrderedDict()\n\n # nested renamer\n if is_nested_renamer:\n result = list(_agg(arg, _agg_1dim).values())\n\n if all(isinstance(r, dict) for r in result):\n\n result, results = OrderedDict(), result\n for r in results:\n result.update(r)\n keys = list(result.keys())\n\n else:\n\n if self._selection is not None:\n keys = None\n\n # some selection on the object\n elif self._selection is not None:\n\n sl = set(self._selection_list)\n\n # we are a Series like object,\n # but may have multiple aggregations\n if len(sl) == 1:\n\n result = _agg(arg, lambda fname,\n agg_how: _agg_1dim(self._selection, agg_how))\n\n # we are selecting the same set as we are aggregating\n elif not len(sl - set(keys)):\n\n result = _agg(arg, _agg_1dim)\n\n # we are a DataFrame, with possibly multiple aggregations\n else:\n\n result = _agg(arg, _agg_2dim)\n\n # no selection\n else:\n\n try:\n result = _agg(arg, _agg_1dim)\n except SpecificationError:\n\n # we are aggregating expecting all 1d-returns\n # but we have 2d\n result = _agg(arg, _agg_2dim)\n\n # combine results\n\n def is_any_series():\n # return a boolean if we have *any* nested series\n return any(isinstance(r, ABCSeries) for r in result.values())\n\n def is_any_frame():\n # return a boolean if we have *any* nested series\n return any(isinstance(r, ABCDataFrame)\n for r in result.values())\n\n if isinstance(result, list):\n return concat(result, keys=keys, axis=1, sort=True), True\n\n elif is_any_frame():\n # we have a dict of DataFrames\n # return a MI DataFrame\n\n return concat([result[k] for k in keys],\n keys=keys, axis=1), 
True\n\n elif isinstance(self, ABCSeries) and is_any_series():\n\n # we have a dict of Series\n # return a MI Series\n try:\n result = concat(result)\n except TypeError:\n # we want to give a nice error here if\n # we have non-same sized objects, so\n # we don't automatically broadcast\n\n raise ValueError(\"cannot perform both aggregation \"\n \"and transformation operations \"\n \"simultaneously\")\n\n return result, True\n\n # fall thru\n from pandas import DataFrame, Series\n try:\n result = DataFrame(result)\n except ValueError:\n\n # we have a dict of scalars\n result = Series(result,\n name=getattr(self, 'name', None))\n\n return result, True\n elif is_list_like(arg):\n # we require a list, but not an 'str'\n return self._aggregate_multiple_funcs(arg,\n _level=_level,\n _axis=_axis), None\n else:\n result = None\n\n f = self._is_cython_func(arg)\n if f and not args and not kwargs:\n return getattr(self, f)(), None\n\n # caller can react\n return result, True", "response": "This function is used to aggregate the keys of the object and returns the result of the function_aggregate_function."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _shallow_copy(self, obj=None, obj_type=None, **kwargs):\n if obj is None:\n obj = self._selected_obj.copy()\n if obj_type is None:\n obj_type = self._constructor\n if isinstance(obj, obj_type):\n obj = obj.obj\n for attr in self._attributes:\n if attr not in kwargs:\n kwargs[attr] = getattr(self, attr)\n return obj_type(obj, **kwargs)", "response": "Returns a shallow copy of the object with the replacement attributes"} {"instruction": "Can you create a Python 3 function that\nreturns the size of the underlying data.", "response": "def itemsize(self):\n \"\"\"\n Return the size of the dtype of the item of the underlying data.\n\n .. 
deprecated:: 0.23.0\n \"\"\"\n warnings.warn(\"{obj}.itemsize is deprecated and will be removed \"\n \"in a future version\".format(obj=type(self).__name__),\n FutureWarning, stacklevel=2)\n return self._ndarray_values.itemsize"} {"instruction": "Create a Python 3 function for\nreturning the base object of the underlying data.", "response": "def base(self):\n \"\"\"\n Return the base object if the memory of the underlying data is shared.\n\n .. deprecated:: 0.23.0\n \"\"\"\n warnings.warn(\"{obj}.base is deprecated and will be removed \"\n \"in a future version\".format(obj=type(self).__name__),\n FutureWarning, stacklevel=2)\n return self.values.base"} {"instruction": "Write a Python 3 function that can\nreturn an array of the values stored within this Series or Index.", "response": "def array(self) -> ExtensionArray:\n \"\"\"\n The ExtensionArray of the data backing this Series or Index.\n\n .. versionadded:: 0.24.0\n\n Returns\n -------\n ExtensionArray\n An ExtensionArray of the values stored within. For extension\n types, this is the actual array. 
For NumPy native types, this\n is a thin (no copy) wrapper around :class:`numpy.ndarray`.\n\n ``.array`` differs ``.values`` which may require converting the\n data to a different form.\n\n See Also\n --------\n Index.to_numpy : Similar method that always returns a NumPy array.\n Series.to_numpy : Similar method that always returns a NumPy array.\n\n Notes\n -----\n This table lays out the different array types for each extension\n dtype within pandas.\n\n ================== =============================\n dtype array type\n ================== =============================\n category Categorical\n period PeriodArray\n interval IntervalArray\n IntegerNA IntegerArray\n datetime64[ns, tz] DatetimeArray\n ================== =============================\n\n For any 3rd-party extension types, the array type will be an\n ExtensionArray.\n\n For all remaining dtypes ``.array`` will be a\n :class:`arrays.NumpyExtensionArray` wrapping the actual ndarray\n stored within. If you absolutely need a NumPy array (possibly with\n copying / coercing data), then use :meth:`Series.to_numpy` instead.\n\n Examples\n --------\n\n For regular NumPy types like int, and float, a PandasArray\n is returned.\n\n >>> pd.Series([1, 2, 3]).array\n \n [1, 2, 3]\n Length: 3, dtype: int64\n\n For extension types, like Categorical, the actual ExtensionArray\n is returned\n\n >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))\n >>> ser.array\n [a, b, a]\n Categories (2, object): [a, b]\n \"\"\"\n result = self._values\n\n if is_datetime64_ns_dtype(result.dtype):\n from pandas.arrays import DatetimeArray\n result = DatetimeArray(result)\n elif is_timedelta64_ns_dtype(result.dtype):\n from pandas.arrays import TimedeltaArray\n result = TimedeltaArray(result)\n\n elif not is_extension_array_dtype(result.dtype):\n from pandas.core.arrays.numpy_ import PandasArray\n result = PandasArray(result)\n\n return result"} {"instruction": "Can you generate the documentation for the following Python 3 
function\ndef to_numpy(self, dtype=None, copy=False):\n if is_datetime64tz_dtype(self.dtype) and dtype is None:\n # note: this is going to change very soon.\n # I have a WIP PR making this unnecessary, but it's\n # a bit out of scope for the DatetimeArray PR.\n dtype = \"object\"\n\n result = np.asarray(self._values, dtype=dtype)\n # TODO(GH-24345): Avoid potential double copy\n if copy:\n result = result.copy()\n return result", "response": "Returns a NumPy ndarray representing the values in this Series or Index."} {"instruction": "How would you explain what the following Python 3 function does\ndef _ndarray_values(self) -> np.ndarray:\n if is_extension_array_dtype(self):\n return self.array._ndarray_values\n return self.values", "response": "The data as an ndarray possibly losing information."} {"instruction": "Write a Python 3 script for\nreturning the maximum value of the Index or Series.", "response": "def max(self, axis=None, skipna=True):\n \"\"\"\n Return the maximum value of the Index.\n\n Parameters\n ----------\n axis : int, optional\n For compatibility with NumPy. 
Only 0 or None are allowed.\n skipna : bool, default True\n\n Returns\n -------\n scalar\n Maximum value.\n\n See Also\n --------\n Index.min : Return the minimum value in an Index.\n Series.max : Return the maximum value in a Series.\n DataFrame.max : Return the maximum values in a DataFrame.\n\n Examples\n --------\n >>> idx = pd.Index([3, 2, 1])\n >>> idx.max()\n 3\n\n >>> idx = pd.Index(['c', 'b', 'a'])\n >>> idx.max()\n 'c'\n\n For a MultiIndex, the maximum is determined lexicographically.\n\n >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)])\n >>> idx.max()\n ('b', 2)\n \"\"\"\n nv.validate_minmax_axis(axis)\n return nanops.nanmax(self._values, skipna=skipna)"} {"instruction": "Can you implement a function in Python 3 that\nreturns an ndarray of the maximum argument indexer.", "response": "def argmax(self, axis=None, skipna=True):\n \"\"\"\n Return an ndarray of the maximum argument indexer.\n\n Parameters\n ----------\n axis : {None}\n Dummy argument for consistency with Series\n skipna : bool, default True\n\n See Also\n --------\n numpy.ndarray.argmax\n \"\"\"\n nv.validate_minmax_axis(axis)\n return nanops.nanargmax(self._values, skipna=skipna)"} {"instruction": "Make a summary of the following Python 3 code\ndef min(self, axis=None, skipna=True):\n nv.validate_minmax_axis(axis)\n return nanops.nanmin(self._values, skipna=skipna)", "response": "Return the minimum value of the Index or DataFrame."} {"instruction": "Can you create a Python 3 function that\nreturns a ndarray of the minimum argument indexer.", "response": "def argmin(self, axis=None, skipna=True):\n \"\"\"\n Return a ndarray of the minimum argument indexer.\n\n Parameters\n ----------\n axis : {None}\n Dummy argument for consistency with Series\n skipna : bool, default True\n\n Returns\n -------\n numpy.ndarray\n\n See Also\n --------\n numpy.ndarray.argmin\n \"\"\"\n nv.validate_minmax_axis(axis)\n return nanops.nanargmin(self._values, skipna=skipna)"} {"instruction": "Here you have 
a function in Python 3, explain what it does\ndef tolist(self):\n if is_datetimelike(self._values):\n return [com.maybe_box_datetimelike(x) for x in self._values]\n elif is_extension_array_dtype(self._values):\n return list(self._values)\n else:\n return self._values.tolist()", "response": "Return a list of the values."} {"instruction": "How would you explain what the following Python 3 function does\ndef _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,\n filter_type=None, **kwds):\n \"\"\" perform the reduction type operation if we can \"\"\"\n func = getattr(self, name, None)\n if func is None:\n raise TypeError(\"{klass} cannot perform the operation {op}\".format(\n klass=self.__class__.__name__, op=name))\n return func(skipna=skipna, **kwds)", "response": "perform the reduction operation if we can"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _map_values(self, mapper, na_action=None):\n\n # we can fastpath dict/Series to an efficient map\n # as we know that we are not going to have to yield\n # python types\n if isinstance(mapper, dict):\n if hasattr(mapper, '__missing__'):\n # If a dictionary subclass defines a default value method,\n # convert mapper to a lookup function (GH #15999).\n dict_with_default = mapper\n mapper = lambda x: dict_with_default[x]\n else:\n # Dictionary does not have a default. 
Thus it's safe to\n # convert to an Series for efficiency.\n # we specify the keys here to handle the\n # possibility that they are tuples\n from pandas import Series\n mapper = Series(mapper)\n\n if isinstance(mapper, ABCSeries):\n # Since values were input this means we came from either\n # a dict or a series and mapper should be an index\n if is_extension_type(self.dtype):\n values = self._values\n else:\n values = self.values\n\n indexer = mapper.index.get_indexer(values)\n new_values = algorithms.take_1d(mapper._values, indexer)\n\n return new_values\n\n # we must convert to python types\n if is_extension_type(self.dtype):\n values = self._values\n if na_action is not None:\n raise NotImplementedError\n map_f = lambda values, f: values.map(f)\n else:\n values = self.astype(object)\n values = getattr(values, 'values', values)\n if na_action == 'ignore':\n def map_f(values, f):\n return lib.map_infer_mask(values, f,\n isna(values).view(np.uint8))\n else:\n map_f = lib.map_infer\n\n # mapper is a function\n new_values = map_f(values, mapper)\n\n return new_values", "response": "Internal function that maps values using the input correspondence object."} {"instruction": "Can you implement a function in Python 3 that\nreturns a Series containing counts of unique values for the specified entry.", "response": "def value_counts(self, normalize=False, sort=True, ascending=False,\n bins=None, dropna=True):\n \"\"\"\n Return a Series containing counts of unique values.\n\n The resulting object will be in descending order so that the\n first element is the most frequently-occurring element.\n Excludes NA values by default.\n\n Parameters\n ----------\n normalize : boolean, default False\n If True then the object returned will contain the relative\n frequencies of the unique values.\n sort : boolean, default True\n Sort by frequencies.\n ascending : boolean, default False\n Sort in ascending order.\n bins : integer, optional\n Rather than count values, group them into 
half-open bins,\n a convenience for ``pd.cut``, only works with numeric data.\n dropna : boolean, default True\n Don't include counts of NaN.\n\n Returns\n -------\n Series\n\n See Also\n --------\n Series.count: Number of non-NA elements in a Series.\n DataFrame.count: Number of non-NA elements in a DataFrame.\n\n Examples\n --------\n >>> index = pd.Index([3, 1, 2, 3, 4, np.nan])\n >>> index.value_counts()\n 3.0 2\n 4.0 1\n 2.0 1\n 1.0 1\n dtype: int64\n\n With `normalize` set to `True`, returns the relative frequency by\n dividing all values by the sum of values.\n\n >>> s = pd.Series([3, 1, 2, 3, 4, np.nan])\n >>> s.value_counts(normalize=True)\n 3.0 0.4\n 4.0 0.2\n 2.0 0.2\n 1.0 0.2\n dtype: float64\n\n **bins**\n\n Bins can be useful for going from a continuous variable to a\n categorical variable; instead of counting unique\n apparitions of values, divide the index in the specified\n number of half-open bins.\n\n >>> s.value_counts(bins=3)\n (2.0, 3.0] 2\n (0.996, 2.0] 2\n (3.0, 4.0] 1\n dtype: int64\n\n **dropna**\n\n With `dropna` set to `False` we can also see NaN index values.\n\n >>> s.value_counts(dropna=False)\n 3.0 2\n NaN 1\n 4.0 1\n 2.0 1\n 1.0 1\n dtype: int64\n \"\"\"\n from pandas.core.algorithms import value_counts\n result = value_counts(self, sort=sort, ascending=ascending,\n normalize=normalize, bins=bins, dropna=dropna)\n return result"} {"instruction": "Can you implement a function in Python 3 that\nreturns the number of unique elements in the object.", "response": "def nunique(self, dropna=True):\n \"\"\"\n Return number of unique elements in the object.\n\n Excludes NA values by default.\n\n Parameters\n ----------\n dropna : bool, default True\n Don't include NaN in the count.\n\n Returns\n -------\n int\n\n See Also\n --------\n DataFrame.nunique: Method nunique for DataFrame.\n Series.count: Count non-NA/null observations in the Series.\n\n Examples\n --------\n >>> s = pd.Series([1, 3, 5, 7, 7])\n >>> s\n 0 1\n 1 3\n 2 5\n 3 7\n 4 
7\n dtype: int64\n\n >>> s.nunique()\n 4\n \"\"\"\n uniqs = self.unique()\n n = len(uniqs)\n if dropna and isna(uniqs).any():\n n -= 1\n return n"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef memory_usage(self, deep=False):\n if hasattr(self.array, 'memory_usage'):\n return self.array.memory_usage(deep=deep)\n\n v = self.array.nbytes\n if deep and is_object_dtype(self) and not PYPY:\n v += lib.memory_usage_of_objects(self.array)\n return v", "response": "Returns the memory usage of the values in the array."} {"instruction": "Can you implement a function in Python 3 that\nexpands user home directory.", "response": "def _expand_user(filepath_or_buffer):\n \"\"\"Return the argument with an initial component of ~ or ~user\n replaced by that user's home directory.\n\n Parameters\n ----------\n filepath_or_buffer : object to be converted if possible\n\n Returns\n -------\n expanded_filepath_or_buffer : an expanded filepath or the\n input if not expandable\n \"\"\"\n if isinstance(filepath_or_buffer, str):\n return os.path.expanduser(filepath_or_buffer)\n return filepath_or_buffer"} {"instruction": "Make a summary of the following Python 3 code\ndef _stringify_path(filepath_or_buffer):\n try:\n import pathlib\n _PATHLIB_INSTALLED = True\n except ImportError:\n _PATHLIB_INSTALLED = False\n\n try:\n from py.path import local as LocalPath\n _PY_PATH_INSTALLED = True\n except ImportError:\n _PY_PATH_INSTALLED = False\n\n if hasattr(filepath_or_buffer, '__fspath__'):\n return filepath_or_buffer.__fspath__()\n if _PATHLIB_INSTALLED and isinstance(filepath_or_buffer, pathlib.Path):\n return str(filepath_or_buffer)\n if _PY_PATH_INSTALLED and isinstance(filepath_or_buffer, LocalPath):\n return filepath_or_buffer.strpath\n return _expand_user(filepath_or_buffer)", "response": "Attempt to convert a path - like object to a string."} {"instruction": "Write a Python 3 script for\ntranslating and return the filepath_or_buffer.", "response": "def 
get_filepath_or_buffer(filepath_or_buffer, encoding=None,\n compression=None, mode=None):\n \"\"\"\n If the filepath_or_buffer is a url, translate and return the buffer.\n Otherwise passthrough.\n\n Parameters\n ----------\n filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),\n or buffer\n compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional\n encoding : the encoding to use to decode bytes, default is 'utf-8'\n mode : str, optional\n\n Returns\n -------\n tuple of ({a filepath_ or buffer or S3File instance},\n encoding, str,\n compression, str,\n should_close, bool)\n \"\"\"\n filepath_or_buffer = _stringify_path(filepath_or_buffer)\n\n if _is_url(filepath_or_buffer):\n req = urlopen(filepath_or_buffer)\n content_encoding = req.headers.get('Content-Encoding', None)\n if content_encoding == 'gzip':\n # Override compression based on Content-Encoding header\n compression = 'gzip'\n reader = BytesIO(req.read())\n req.close()\n return reader, encoding, compression, True\n\n if is_s3_url(filepath_or_buffer):\n from pandas.io import s3\n return s3.get_filepath_or_buffer(filepath_or_buffer,\n encoding=encoding,\n compression=compression,\n mode=mode)\n\n if is_gcs_url(filepath_or_buffer):\n from pandas.io import gcs\n return gcs.get_filepath_or_buffer(filepath_or_buffer,\n encoding=encoding,\n compression=compression,\n mode=mode)\n\n if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)):\n return _expand_user(filepath_or_buffer), None, compression, False\n\n if not is_file_like(filepath_or_buffer):\n msg = \"Invalid file path or buffer object type: {_type}\"\n raise ValueError(msg.format(_type=type(filepath_or_buffer)))\n\n return filepath_or_buffer, None, compression, False"} {"instruction": "Explain what the following Python 3 code does\ndef _infer_compression(filepath_or_buffer, compression):\n\n # No compression has been explicitly specified\n if compression is None:\n return None\n\n # Infer compression\n if compression == 
'infer':\n # Convert all path types (e.g. pathlib.Path) to strings\n filepath_or_buffer = _stringify_path(filepath_or_buffer)\n if not isinstance(filepath_or_buffer, str):\n # Cannot infer compression of a buffer, assume no compression\n return None\n\n # Infer compression from the filename/URL extension\n for compression, extension in _compression_to_extension.items():\n if filepath_or_buffer.endswith(extension):\n return compression\n return None\n\n # Compression has been specified. Check that it's valid\n if compression in _compression_to_extension:\n return compression\n\n msg = 'Unrecognized compression type: {}'.format(compression)\n valid = ['infer', None] + sorted(_compression_to_extension)\n msg += '\\nValid compression types are {}'.format(valid)\n raise ValueError(msg)", "response": "Infer the compression method for filepath_or_buffer."} {"instruction": "Write a Python 3 script to\nreturn a file - like object that can be used to read and write the log file for the given path or buffer and mode.", "response": "def _get_handle(path_or_buf, mode, encoding=None, compression=None,\n memory_map=False, is_text=True):\n \"\"\"\n Get file handle for given path/buffer and mode.\n\n Parameters\n ----------\n path_or_buf :\n a path (str) or buffer\n mode : str\n mode to open path_or_buf with\n encoding : str or None\n compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None\n If 'infer' and `filepath_or_buffer` is path-like, then detect\n compression from the following extensions: '.gz', '.bz2', '.zip',\n or '.xz' (otherwise no compression).\n memory_map : boolean, default False\n See parsers._parser_params for more information.\n is_text : boolean, default True\n whether file/buffer is in text format (csv, json, etc.), or in binary\n mode (pickle, etc.)\n\n Returns\n -------\n f : file-like\n A file-like object\n handles : list of file-like objects\n A list of file-like object that were opened in this function.\n \"\"\"\n try:\n from s3fs import 
S3File\n need_text_wrapping = (BytesIO, S3File)\n except ImportError:\n need_text_wrapping = (BytesIO,)\n\n handles = list()\n f = path_or_buf\n\n # Convert pathlib.Path/py.path.local or string\n path_or_buf = _stringify_path(path_or_buf)\n is_path = isinstance(path_or_buf, str)\n\n if is_path:\n compression = _infer_compression(path_or_buf, compression)\n\n if compression:\n\n # GZ Compression\n if compression == 'gzip':\n if is_path:\n f = gzip.open(path_or_buf, mode)\n else:\n f = gzip.GzipFile(fileobj=path_or_buf)\n\n # BZ Compression\n elif compression == 'bz2':\n if is_path:\n f = bz2.BZ2File(path_or_buf, mode)\n else:\n f = bz2.BZ2File(path_or_buf)\n\n # ZIP Compression\n elif compression == 'zip':\n zf = BytesZipFile(path_or_buf, mode)\n # Ensure the container is closed as well.\n handles.append(zf)\n if zf.mode == 'w':\n f = zf\n elif zf.mode == 'r':\n zip_names = zf.namelist()\n if len(zip_names) == 1:\n f = zf.open(zip_names.pop())\n elif len(zip_names) == 0:\n raise ValueError('Zero files found in ZIP file {}'\n .format(path_or_buf))\n else:\n raise ValueError('Multiple files found in ZIP file.'\n ' Only one file per ZIP: {}'\n .format(zip_names))\n\n # XZ Compression\n elif compression == 'xz':\n f = lzma.LZMAFile(path_or_buf, mode)\n\n # Unrecognized Compression\n else:\n msg = 'Unrecognized compression type: {}'.format(compression)\n raise ValueError(msg)\n\n handles.append(f)\n\n elif is_path:\n if encoding:\n # Encoding\n f = open(path_or_buf, mode, encoding=encoding, newline=\"\")\n elif is_text:\n # No explicit encoding\n f = open(path_or_buf, mode, errors='replace', newline=\"\")\n else:\n # Binary mode\n f = open(path_or_buf, mode)\n handles.append(f)\n\n # Convert BytesIO or file objects passed with an encoding\n if is_text and (compression or isinstance(f, need_text_wrapping)):\n from io import TextIOWrapper\n f = TextIOWrapper(f, encoding=encoding, newline='')\n handles.append(f)\n\n if memory_map and hasattr(f, 'fileno'):\n try:\n g = 
MMapWrapper(f)\n f.close()\n f = g\n except Exception:\n # we catch any errors that may have occurred\n # because that is consistent with the lower-level\n # functionality of the C engine (pd.read_csv), so\n # leave the file handler as is then\n pass\n\n return f, handles"} {"instruction": "Write a Python 3 script to\nwrap comparison operations to convert timedelta - like to timedelta64", "response": "def _td_array_cmp(cls, op):\n \"\"\"\n Wrap comparison operations to convert timedelta-like to timedelta64\n \"\"\"\n opname = '__{name}__'.format(name=op.__name__)\n nat_result = opname == '__ne__'\n\n def wrapper(self, other):\n if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):\n return NotImplemented\n\n if _is_convertible_to_td(other) or other is NaT:\n try:\n other = Timedelta(other)\n except ValueError:\n # failed to parse as timedelta\n return ops.invalid_comparison(self, other, op)\n\n result = op(self.view('i8'), other.value)\n if isna(other):\n result.fill(nat_result)\n\n elif not is_list_like(other):\n return ops.invalid_comparison(self, other, op)\n\n elif len(other) != len(self):\n raise ValueError(\"Lengths must match\")\n\n else:\n try:\n other = type(self)._from_sequence(other)._data\n except (ValueError, TypeError):\n return ops.invalid_comparison(self, other, op)\n\n result = op(self.view('i8'), other.view('i8'))\n result = com.values_from_object(result)\n\n o_mask = np.array(isna(other))\n if o_mask.any():\n result[o_mask] = nat_result\n\n if self._hasnans:\n result[self._isnan] = nat_result\n\n return result\n\n return compat.set_function_name(wrapper, opname, cls)"} {"instruction": "How would you code a function in Python 3 to\nconvert a sequence of timedelta64 values into a single timedelta64 array.", "response": "def sequence_to_td64ns(data, copy=False, unit=\"ns\", errors=\"raise\"):\n \"\"\"\n Parameters\n ----------\n array : list-like\n copy : bool, default False\n unit : str, default \"ns\"\n The timedelta unit to treat 
integers as multiples of.\n errors : {\"raise\", \"coerce\", \"ignore\"}, default \"raise\"\n How to handle elements that cannot be converted to timedelta64[ns].\n See ``pandas.to_timedelta`` for details.\n\n Returns\n -------\n converted : numpy.ndarray\n The sequence converted to a numpy array with dtype ``timedelta64[ns]``.\n inferred_freq : Tick or None\n The inferred frequency of the sequence.\n\n Raises\n ------\n ValueError : Data cannot be converted to timedelta64[ns].\n\n Notes\n -----\n Unlike `pandas.to_timedelta`, if setting ``errors=ignore`` will not cause\n errors to be ignored; they are caught and subsequently ignored at a\n higher level.\n \"\"\"\n inferred_freq = None\n unit = parse_timedelta_unit(unit)\n\n # Unwrap whatever we have into a np.ndarray\n if not hasattr(data, 'dtype'):\n # e.g. list, tuple\n if np.ndim(data) == 0:\n # i.e. generator\n data = list(data)\n data = np.array(data, copy=False)\n elif isinstance(data, ABCSeries):\n data = data._values\n elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArray)):\n inferred_freq = data.freq\n data = data._data\n\n # Convert whatever we have into timedelta64[ns] dtype\n if is_object_dtype(data.dtype) or is_string_dtype(data.dtype):\n # no need to make a copy, need to convert if string-dtyped\n data = objects_to_td64ns(data, unit=unit, errors=errors)\n copy = False\n\n elif is_integer_dtype(data.dtype):\n # treat as multiples of the given unit\n data, copy_made = ints_to_td64ns(data, unit=unit)\n copy = copy and not copy_made\n\n elif is_float_dtype(data.dtype):\n # cast the unit, multiply base/frace separately\n # to avoid precision issues from float -> int\n mask = np.isnan(data)\n m, p = precision_from_unit(unit)\n base = data.astype(np.int64)\n frac = data - base\n if p:\n frac = np.round(frac, p)\n data = (base * m + (frac * m).astype(np.int64)).view('timedelta64[ns]')\n data[mask] = iNaT\n copy = False\n\n elif is_timedelta64_dtype(data.dtype):\n if data.dtype != _TD_DTYPE:\n # non-nano 
unit\n # TODO: watch out for overflows\n data = data.astype(_TD_DTYPE)\n copy = False\n\n elif is_datetime64_dtype(data):\n # GH#23539\n warnings.warn(\"Passing datetime64-dtype data to TimedeltaIndex is \"\n \"deprecated, will raise a TypeError in a future \"\n \"version\",\n FutureWarning, stacklevel=4)\n data = ensure_int64(data).view(_TD_DTYPE)\n\n else:\n raise TypeError(\"dtype {dtype} cannot be converted to timedelta64[ns]\"\n .format(dtype=data.dtype))\n\n data = np.array(data, copy=copy)\n if data.ndim != 1:\n raise ValueError(\"Only 1-dimensional input arrays are supported.\")\n\n assert data.dtype == 'm8[ns]', data\n return data, inferred_freq"} {"instruction": "Write a Python 3 script to\nconvert an ndarray with integer - dtype to timedelta64 [ ns ] dtype treating the integers as multiples of the given timedelta unit.", "response": "def ints_to_td64ns(data, unit=\"ns\"):\n \"\"\"\n Convert an ndarray with integer-dtype to timedelta64[ns] dtype, treating\n the integers as multiples of the given timedelta unit.\n\n Parameters\n ----------\n data : numpy.ndarray with integer-dtype\n unit : str, default \"ns\"\n The timedelta unit to treat integers as multiples of.\n\n Returns\n -------\n numpy.ndarray : timedelta64[ns] array converted from data\n bool : whether a copy was made\n \"\"\"\n copy_made = False\n unit = unit if unit is not None else \"ns\"\n\n if data.dtype != np.int64:\n # converting to int64 makes a copy, so we can avoid\n # re-copying later\n data = data.astype(np.int64)\n copy_made = True\n\n if unit != \"ns\":\n dtype_str = \"timedelta64[{unit}]\".format(unit=unit)\n data = data.view(dtype_str)\n\n # TODO: watch out for overflows when converting from lower-resolution\n data = data.astype(\"timedelta64[ns]\")\n # the astype conversion makes a copy, so we can avoid re-copying later\n copy_made = True\n\n else:\n data = data.view(\"timedelta64[ns]\")\n\n return data, copy_made"} {"instruction": "Make a summary of the following Python 3 
code\ndef objects_to_td64ns(data, unit=\"ns\", errors=\"raise\"):\n # coerce Index to np.ndarray, converting string-dtype if necessary\n values = np.array(data, dtype=np.object_, copy=False)\n\n result = array_to_timedelta64(values,\n unit=unit, errors=errors)\n return result.view('timedelta64[ns]')", "response": "Convert a object - dtyped array into a timedelta64 array."} {"instruction": "Explain what the following Python 3 code does\ndef _add_datetime_arraylike(self, other):\n if isinstance(other, np.ndarray):\n # At this point we have already checked that dtype is datetime64\n from pandas.core.arrays import DatetimeArray\n other = DatetimeArray(other)\n\n # defer to implementation in DatetimeArray\n return other + self", "response": "Add DatetimeArray or Index or ndarray to TimedeltaArray."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef components(self):\n from pandas import DataFrame\n\n columns = ['days', 'hours', 'minutes', 'seconds',\n 'milliseconds', 'microseconds', 'nanoseconds']\n hasnans = self._hasnans\n if hasnans:\n def f(x):\n if isna(x):\n return [np.nan] * len(columns)\n return x.components\n else:\n def f(x):\n return x.components\n\n result = DataFrame([f(x) for x in self], columns=columns)\n if not hasnans:\n result = result.astype('int64')\n return result", "response": "Returns a dataframe of the components of the Timedeltas."} {"instruction": "Make a summary of the following Python 3 code\ndef register_writer(klass):\n if not callable(klass):\n raise ValueError(\"Can only register callables as engines\")\n engine_name = klass.engine\n _writers[engine_name] = klass", "response": "Register a new ExcelWriter class to the excel writer registry. io. 
excel."} {"instruction": "Explain what the following Python 3 code does\ndef _excel2num(x):\n index = 0\n\n for c in x.upper().strip():\n cp = ord(c)\n\n if cp < ord(\"A\") or cp > ord(\"Z\"):\n raise ValueError(\"Invalid column name: {x}\".format(x=x))\n\n index = index * 26 + cp - ord(\"A\") + 1\n\n return index - 1", "response": "Convert Excel column name like A to 0 - based column index."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _range2cols(areas):\n cols = []\n\n for rng in areas.split(\",\"):\n if \":\" in rng:\n rng = rng.split(\":\")\n cols.extend(lrange(_excel2num(rng[0]), _excel2num(rng[1]) + 1))\n else:\n cols.append(_excel2num(rng))\n\n return cols", "response": "Convert comma separated list of column names and ranges to indices."} {"instruction": "Write a Python 3 script for\nconverting usecols into a compatible format for parsing in parsers. py.", "response": "def _maybe_convert_usecols(usecols):\n \"\"\"\n Convert `usecols` into a compatible format for parsing in `parsers.py`.\n\n Parameters\n ----------\n usecols : object\n The use-columns object to potentially convert.\n\n Returns\n -------\n converted : object\n The compatible format of `usecols`.\n \"\"\"\n if usecols is None:\n return usecols\n\n if is_integer(usecols):\n warnings.warn((\"Passing in an integer for `usecols` has been \"\n \"deprecated. 
Please pass in a list of int from \"\n \"0 to `usecols` inclusive instead.\"),\n FutureWarning, stacklevel=2)\n return lrange(usecols + 1)\n\n if isinstance(usecols, str):\n return _range2cols(usecols)\n\n return usecols"} {"instruction": "How would you explain what the following Python 3 function does\ndef _fill_mi_header(row, control_row):\n last = row[0]\n for i in range(1, len(row)):\n if not control_row[i]:\n last = row[i]\n\n if row[i] == '' or row[i] is None:\n row[i] = last\n else:\n control_row[i] = False\n last = row[i]\n\n return row, control_row", "response": "Forward fill blank entries in row but only inside the same parent index."} {"instruction": "Given the following Python 3 function, write the documentation\ndef _pop_header_name(row, index_col):\n # Pop out header name and fill w/blank.\n i = index_col if not is_list_like(index_col) else max(index_col)\n\n header_name = row[i]\n header_name = None if header_name == \"\" else header_name\n\n return header_name, row[:i] + [''] + row[i + 1:]", "response": "Pop the header name for MultiIndex parsing."} {"instruction": "Can you write a function in Python 3 where it\nensures that we are grabbing the correct scope.", "response": "def _ensure_scope(level, global_dict=None, local_dict=None, resolvers=(),\n target=None, **kwargs):\n \"\"\"Ensure that we are grabbing the correct scope.\"\"\"\n return Scope(level + 1, global_dict=global_dict, local_dict=local_dict,\n resolvers=resolvers, target=target)"} {"instruction": "How would you implement a function in Python 3 that\nreplaces a number with its hexadecimal representation. Used to tag temporary variables with their calling scope s id.", "response": "def _replacer(x):\n \"\"\"Replace a number with its hexadecimal representation. 
Used to tag\n temporary variables with their calling scope's id.\n \"\"\"\n # get the hex repr of the binary char and remove 0x and pad by pad_size\n # zeros\n try:\n hexin = ord(x)\n except TypeError:\n # bytes literals masquerade as ints when iterating in py3\n hexin = x\n\n return hex(hexin)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _raw_hex_id(obj):\n # interpret as a pointer since that's what really what id returns\n packed = struct.pack('@P', id(obj))\n return ''.join(map(_replacer, packed))", "response": "Return the padded hexadecimal id of obj."} {"instruction": "How would you explain what the following Python 3 function does\ndef _get_pretty_string(obj):\n sio = StringIO()\n pprint.pprint(obj, stream=sio)\n return sio.getvalue()", "response": "Return a prettier version of obj"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef resolve(self, key, is_local):\n try:\n # only look for locals in outer scope\n if is_local:\n return self.scope[key]\n\n # not a local variable so check in resolvers if we have them\n if self.has_resolvers:\n return self.resolvers[key]\n\n # if we're here that means that we have no locals and we also have\n # no resolvers\n assert not is_local and not self.has_resolvers\n return self.scope[key]\n except KeyError:\n try:\n # last ditch effort we look in temporaries\n # these are created when parsing indexing expressions\n # e.g., df[df > 0]\n return self.temps[key]\n except KeyError:\n raise compu.ops.UndefinedVariableError(key, is_local)", "response": "Resolves a variable name in a possibly local context."} {"instruction": "Write a Python 3 function that can\nreplace a variable name with a potentially new value.", "response": "def swapkey(self, old_key, new_key, new_value=None):\n \"\"\"Replace a variable name, with a potentially new value.\n\n Parameters\n ----------\n old_key : str\n Current variable name to replace\n new_key : str\n New variable 
name to replace `old_key` with\n new_value : object\n Value to be replaced along with the possible renaming\n \"\"\"\n if self.has_resolvers:\n maps = self.resolvers.maps + self.scope.maps\n else:\n maps = self.scope.maps\n\n maps.append(self.temps)\n\n for mapping in maps:\n if old_key in mapping:\n mapping[new_key] = new_value\n return"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _get_vars(self, stack, scopes):\n variables = itertools.product(scopes, stack)\n for scope, (frame, _, _, _, _, _) in variables:\n try:\n d = getattr(frame, 'f_' + scope)\n self.scope = self.scope.new_child(d)\n finally:\n # won't remove it, but DECREF it\n # in Py3 this probably isn't necessary since frame won't be\n # scope after the loop\n del frame", "response": "Get specifically scoped variables from a list of stack frames."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef update(self, level):\n sl = level + 1\n\n # add sl frames to the scope starting with the\n # most distant and overwriting with more current\n # makes sure that we can capture variable scope\n stack = inspect.stack()\n\n try:\n self._get_vars(stack[:sl], scopes=['locals'])\n finally:\n del stack[:], stack", "response": "Update the current scope by going back level levels."} {"instruction": "How would you explain what the following Python 3 function does\ndef add_tmp(self, value):\n name = '{name}_{num}_{hex_id}'.format(name=type(value).__name__,\n num=self.ntemps,\n hex_id=_raw_hex_id(self))\n\n # add to inner most scope\n assert name not in self.temps\n self.temps[name] = value\n assert name in self.temps\n\n # only increment if the variable gets put in the scope\n return name", "response": "Adds a temporary variable to the scope."} {"instruction": "Explain what the following Python 3 code does\ndef full_scope(self):\n maps = [self.temps] + self.resolvers.maps + self.scope.maps\n return DeepChainMap(*maps)", "response": "Return the 
full scope for use with passing to engines transparently\n as a mapping."} {"instruction": "Given the following Python 3 function, write the documentation\ndef read_sas(filepath_or_buffer, format=None, index=None, encoding=None,\n chunksize=None, iterator=False):\n \"\"\"\n Read SAS files stored as either XPORT or SAS7BDAT format files.\n\n Parameters\n ----------\n filepath_or_buffer : string or file-like object\n Path to the SAS file.\n format : string {'xport', 'sas7bdat'} or None\n If None, file format is inferred from file extension. If 'xport' or\n 'sas7bdat', uses the corresponding format.\n index : identifier of index column, defaults to None\n Identifier of column that should be used as index of the DataFrame.\n encoding : string, default is None\n Encoding for text data. If None, text data are stored as raw bytes.\n chunksize : int\n Read file `chunksize` lines at a time, returns iterator.\n iterator : bool, defaults to False\n If True, returns an iterator for reading the file incrementally.\n\n Returns\n -------\n DataFrame if iterator=False and chunksize=None, else SAS7BDATReader\n or XportReader\n \"\"\"\n if format is None:\n buffer_error_msg = (\"If this is a buffer object rather \"\n \"than a string name, you must specify \"\n \"a format string\")\n filepath_or_buffer = _stringify_path(filepath_or_buffer)\n if not isinstance(filepath_or_buffer, str):\n raise ValueError(buffer_error_msg)\n fname = filepath_or_buffer.lower()\n if fname.endswith(\".xpt\"):\n format = \"xport\"\n elif fname.endswith(\".sas7bdat\"):\n format = \"sas7bdat\"\n else:\n raise ValueError(\"unable to infer format of SAS file\")\n\n if format.lower() == 'xport':\n from pandas.io.sas.sas_xport import XportReader\n reader = XportReader(filepath_or_buffer, index=index,\n encoding=encoding,\n chunksize=chunksize)\n elif format.lower() == 'sas7bdat':\n from pandas.io.sas.sas7bdat import SAS7BDATReader\n reader = SAS7BDATReader(filepath_or_buffer, index=index,\n encoding=encoding,\n 
chunksize=chunksize)\n else:\n raise ValueError('unknown SAS format')\n\n if iterator or chunksize:\n return reader\n\n data = reader.read()\n reader.close()\n return data", "response": "Reads a SAS file into a DataFrame."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _coerce_method(converter):\n\n def wrapper(self):\n if len(self) == 1:\n return converter(self.iloc[0])\n raise TypeError(\"cannot convert the series to \"\n \"{0}\".format(str(converter)))\n\n wrapper.__name__ = \"__{name}__\".format(name=converter.__name__)\n return wrapper", "response": "Install the scalar coercion methods."} {"instruction": "Create a Python 3 function to\ninitialize the _data and index attributes of a new Series from a dictionary input.", "response": "def _init_dict(self, data, index=None, dtype=None):\n \"\"\"\n Derive the \"_data\" and \"index\" attributes of a new Series from a\n dictionary input.\n\n Parameters\n ----------\n data : dict or dict-like\n Data used to populate the new Series\n index : Index or index-like, default None\n index for the new Series: if None, use dict keys\n dtype : dtype, default None\n dtype for the new Series: if None, infer from data\n\n Returns\n -------\n _data : BlockManager for the new Series\n index : index for the new Series\n \"\"\"\n # Looking for NaN in dict doesn't work ({np.nan : 1}[float('nan')]\n # raises KeyError), so we iterate the entire dict, and align\n if data:\n keys, values = zip(*data.items())\n values = list(values)\n elif index is not None:\n # fastpath for Series(data=None). 
Just use broadcasting a scalar\n # instead of reindexing.\n values = na_value_for_dtype(dtype)\n keys = index\n else:\n keys, values = [], []\n\n # Input is now list-like, so rely on \"standard\" construction:\n s = Series(values, index=keys, dtype=dtype)\n\n # Now we just make sure the order is respected, if any\n if data and index is not None:\n s = s.reindex(index, copy=False)\n elif not PY36 and not isinstance(data, OrderedDict) and data:\n # Need the `and data` to avoid sorting Series(None, index=[...])\n # since that isn't really dict-like\n try:\n s = s.sort_index()\n except TypeError:\n pass\n return s._data, s.index"} {"instruction": "How would you explain what the following Python 3 function does\ndef from_array(cls, arr, index=None, name=None, dtype=None, copy=False,\n fastpath=False):\n \"\"\"\n Construct Series from array.\n\n .. deprecated :: 0.23.0\n Use pd.Series(..) constructor instead.\n \"\"\"\n warnings.warn(\"'from_array' is deprecated and will be removed in a \"\n \"future version. Please use the pd.Series(..) 
\"\n \"constructor instead.\", FutureWarning, stacklevel=2)\n if isinstance(arr, ABCSparseArray):\n from pandas.core.sparse.series import SparseSeries\n cls = SparseSeries\n return cls(arr, index=index, name=name, dtype=dtype,\n copy=copy, fastpath=fastpath)", "response": "Construct a Series from an array."} {"instruction": "Write a Python 3 function that can\noverride generic to set the _typ and _index attributes.", "response": "def _set_axis(self, axis, labels, fastpath=False):\n \"\"\"\n Override generic, we want to set the _typ here.\n \"\"\"\n\n if not fastpath:\n labels = ensure_index(labels)\n\n is_all_dates = labels.is_all_dates\n if is_all_dates:\n if not isinstance(labels,\n (DatetimeIndex, PeriodIndex, TimedeltaIndex)):\n try:\n labels = DatetimeIndex(labels)\n # need to set here because we changed the index\n if fastpath:\n self._data.set_axis(axis, labels)\n except (tslibs.OutOfBoundsDatetime, ValueError):\n # labels may exceeds datetime bounds,\n # or not be a DatetimeIndex\n pass\n\n self._set_subtyp(is_all_dates)\n\n object.__setattr__(self, '_index', labels)\n if not fastpath:\n self._data.set_axis(axis, labels)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef asobject(self):\n warnings.warn(\"'asobject' is deprecated. Use 'astype(object)'\"\n \" instead\", FutureWarning, stacklevel=2)\n return self.astype(object).values", "response": "Return the object Series which contains boxed values."} {"instruction": "How would you implement a function in Python 3 that\nreturns selected slices of an array along given axis as a Series.", "response": "def compress(self, condition, *args, **kwargs):\n \"\"\"\n Return selected slices of an array along given axis as a Series.\n\n .. deprecated:: 0.24.0\n\n See Also\n --------\n numpy.ndarray.compress\n \"\"\"\n msg = (\"Series.compress(condition) is deprecated. 
\"\n \"Use 'Series[condition]' or \"\n \"'np.asarray(series).compress(condition)' instead.\")\n warnings.warn(msg, FutureWarning, stacklevel=2)\n nv.validate_compress(args, kwargs)\n return self[condition]"} {"instruction": "Can you write a function in Python 3 where it\nreturns the integer indices of the elements that are non - zero.", "response": "def nonzero(self):\n \"\"\"\n Return the *integer* indices of the elements that are non-zero.\n\n .. deprecated:: 0.24.0\n Please use .to_numpy().nonzero() as a replacement.\n\n This method is equivalent to calling `numpy.nonzero` on the\n series data. For compatibility with NumPy, the return value is\n the same (a tuple with an array of indices for each dimension),\n but it will always be a one-item tuple because series only have\n one dimension.\n\n See Also\n --------\n numpy.nonzero\n\n Examples\n --------\n >>> s = pd.Series([0, 3, 0, 4])\n >>> s.nonzero()\n (array([1, 3]),)\n >>> s.iloc[s.nonzero()[0]]\n 1 3\n 3 4\n dtype: int64\n\n >>> s = pd.Series([0, 3, 0, 4], index=['a', 'b', 'c', 'd'])\n # same return although index of s is different\n >>> s.nonzero()\n (array([1, 3]),)\n >>> s.iloc[s.nonzero()[0]]\n b 3\n d 4\n dtype: int64\n \"\"\"\n msg = (\"Series.nonzero() is deprecated \"\n \"and will be removed in a future version.\"\n \"Use Series.to_numpy().nonzero() instead\")\n warnings.warn(msg, FutureWarning, stacklevel=2)\n return self._values.nonzero()"} {"instruction": "Can you write a function in Python 3 where it\ncreates a new view of the Series with the same underlying values in memory and optionally reinterpreted with a new data type.", "response": "def view(self, dtype=None):\n \"\"\"\n Create a new view of the Series.\n\n This function will return a new Series with a view of the same\n underlying values in memory, optionally reinterpreted with a new data\n type. 
The new data type must preserve the same size in bytes as to not\n cause index misalignment.\n\n Parameters\n ----------\n dtype : data type\n Data type object or one of their string representations.\n\n Returns\n -------\n Series\n A new Series object as a view of the same data in memory.\n\n See Also\n --------\n numpy.ndarray.view : Equivalent numpy function to create a new view of\n the same data in memory.\n\n Notes\n -----\n Series are instantiated with ``dtype=float64`` by default. While\n ``numpy.ndarray.view()`` will return a view with the same data type as\n the original array, ``Series.view()`` (without specified dtype)\n will try using ``float64`` and may fail if the original data type size\n in bytes is not the same.\n\n Examples\n --------\n >>> s = pd.Series([-2, -1, 0, 1, 2], dtype='int8')\n >>> s\n 0 -2\n 1 -1\n 2 0\n 3 1\n 4 2\n dtype: int8\n\n The 8 bit signed integer representation of `-1` is `0b11111111`, but\n the same bytes represent 255 if read as an 8 bit unsigned integer:\n\n >>> us = s.view('uint8')\n >>> us\n 0 254\n 1 255\n 2 0\n 3 1\n 4 2\n dtype: uint8\n\n The views share the same underlying values:\n\n >>> us[0] = 128\n >>> s\n 0 -128\n 1 -1\n 2 0\n 3 1\n 4 2\n dtype: int8\n \"\"\"\n return self._constructor(self._values.view(dtype),\n index=self.index).__finalize__(self)"} {"instruction": "Explain what the following Python 3 code does\ndef _ixs(self, i, axis=0):\n try:\n\n # dispatch to the values if we need\n values = self._values\n if isinstance(values, np.ndarray):\n return libindex.get_value_at(values, i)\n else:\n return values[i]\n except IndexError:\n raise\n except Exception:\n if isinstance(i, slice):\n indexer = self.index._convert_slice_indexer(i, kind='iloc')\n return self._get_values(indexer)\n else:\n label = self.index[i]\n if isinstance(label, Index):\n return self.take(i, axis=axis, convert=True)\n else:\n return libindex.get_value_at(self, i)", "response": "Return the i - th value or values in the Series by 
location."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef repeat(self, repeats, axis=None):\n nv.validate_repeat(tuple(), dict(axis=axis))\n new_index = self.index.repeat(repeats)\n new_values = self._values.repeat(repeats)\n return self._constructor(new_values,\n index=new_index).__finalize__(self)", "response": "Returns a new Series with repeated elements of the current Series."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef reset_index(self, level=None, drop=False, name=None, inplace=False):\n inplace = validate_bool_kwarg(inplace, 'inplace')\n if drop:\n new_index = ibase.default_index(len(self))\n if level is not None:\n if not isinstance(level, (tuple, list)):\n level = [level]\n level = [self.index._get_level_number(lev) for lev in level]\n if len(level) < self.index.nlevels:\n new_index = self.index.droplevel(level)\n\n if inplace:\n self.index = new_index\n # set name if it was passed, otherwise, keep the previous name\n self.name = name or self.name\n else:\n return self._constructor(self._values.copy(),\n index=new_index).__finalize__(self)\n elif inplace:\n raise TypeError('Cannot reset_index inplace on a Series '\n 'to create a DataFrame')\n else:\n df = self.to_frame(name)\n return df.reset_index(level=level, drop=drop)", "response": "This function will generate a new DataFrame or Series with the index reset."} {"instruction": "Given the following Python 3 function, write the documentation\ndef to_string(self, buf=None, na_rep='NaN', float_format=None, header=True,\n index=True, length=False, dtype=False, name=False,\n max_rows=None):\n \"\"\"\n Render a string representation of the Series.\n\n Parameters\n ----------\n buf : StringIO-like, optional\n Buffer to write to.\n na_rep : str, optional\n String representation of NaN to use, default 'NaN'.\n float_format : one-parameter function, optional\n Formatter function to apply to columns' elements if they are\n floats, default 
None.\n header : bool, default True\n Add the Series header (index name).\n index : bool, optional\n Add index (row) labels, default True.\n length : bool, default False\n Add the Series length.\n dtype : bool, default False\n Add the Series dtype.\n name : bool, default False\n Add the Series name if not None.\n max_rows : int, optional\n Maximum number of rows to show before truncating. If None, show\n all.\n\n Returns\n -------\n str or None\n String representation of Series if ``buf=None``, otherwise None.\n \"\"\"\n\n formatter = fmt.SeriesFormatter(self, name=name, length=length,\n header=header, index=index,\n dtype=dtype, na_rep=na_rep,\n float_format=float_format,\n max_rows=max_rows)\n result = formatter.to_string()\n\n # catch contract violations\n if not isinstance(result, str):\n raise AssertionError(\"result must be of type unicode, type\"\n \" of result is {0!r}\"\n \"\".format(result.__class__.__name__))\n\n if buf is None:\n return result\n else:\n try:\n buf.write(result)\n except AttributeError:\n with open(buf, 'w') as f:\n f.write(result)", "response": "Render a string representation of the Series."} {"instruction": "Make a summary of the following Python 3 code\ndef to_dict(self, into=dict):\n # GH16122\n into_c = com.standardize_mapping(into)\n return into_c(self.items())", "response": "Convert Series to dict or dict - like object."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef to_frame(self, name=None):\n if name is None:\n df = self._constructor_expanddim(self)\n else:\n df = self._constructor_expanddim({name: self})\n\n return df", "response": "Convert Series to DataFrame."} {"instruction": "Write a Python 3 function that can\nconvert the Series to a SparseSeries.", "response": "def to_sparse(self, kind='block', fill_value=None):\n \"\"\"\n Convert Series to SparseSeries.\n\n Parameters\n ----------\n kind : {'block', 'integer'}, default 'block'\n fill_value : float, defaults to NaN 
(missing)\n Value to use for filling NaN values.\n\n Returns\n -------\n SparseSeries\n Sparse representation of the Series.\n \"\"\"\n # TODO: deprecate\n from pandas.core.sparse.series import SparseSeries\n\n values = SparseArray(self, kind=kind, fill_value=fill_value)\n return SparseSeries(\n values, index=self.index, name=self.name\n ).__finalize__(self)"} {"instruction": "Write a Python 3 function for\nsetting the Series name.", "response": "def _set_name(self, name, inplace=False):\n \"\"\"\n Set the Series name.\n\n Parameters\n ----------\n name : str\n inplace : bool\n whether to modify `self` directly or return a copy\n \"\"\"\n inplace = validate_bool_kwarg(inplace, 'inplace')\n ser = self if inplace else self.copy()\n ser.name = name\n return ser"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef count(self, level=None):\n if level is None:\n return notna(com.values_from_object(self)).sum()\n\n if isinstance(level, str):\n level = self.index._get_level_number(level)\n\n lev = self.index.levels[level]\n level_codes = np.array(self.index.codes[level], subok=False, copy=True)\n\n mask = level_codes == -1\n if mask.any():\n level_codes[mask] = cnt = len(lev)\n lev = lev.insert(cnt, lev._na_value)\n\n obs = level_codes[notna(self.values)]\n out = np.bincount(obs, minlength=len(lev) or None)\n return self._constructor(out, index=lev,\n dtype='int64').__finalize__(self)", "response": "Count the number of non - NA observations in the Series."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef drop_duplicates(self, keep='first', inplace=False):\n return super().drop_duplicates(keep=keep, inplace=inplace)", "response": "Drop duplicates from the Series."} {"instruction": "Create a Python 3 function for\nreturning the index of the minimum value in the Series or a Series along a given axis.", "response": "def idxmin(self, axis=0, skipna=True, *args, **kwargs):\n \"\"\"\n Return the 
row label of the minimum value.\n\n If multiple values equal the minimum, the first row label with that\n value is returned.\n\n Parameters\n ----------\n skipna : bool, default True\n Exclude NA/null values. If the entire Series is NA, the result\n will be NA.\n axis : int, default 0\n For compatibility with DataFrame.idxmin. Redundant for application\n on Series.\n *args, **kwargs\n Additional keywords have no effect but might be accepted\n for compatibility with NumPy.\n\n Returns\n -------\n Index\n Label of the minimum value.\n\n Raises\n ------\n ValueError\n If the Series is empty.\n\n See Also\n --------\n numpy.argmin : Return indices of the minimum values\n along the given axis.\n DataFrame.idxmin : Return index of first occurrence of minimum\n over requested axis.\n Series.idxmax : Return index *label* of the first occurrence\n of maximum of values.\n\n Notes\n -----\n This method is the Series version of ``ndarray.argmin``. This method\n returns the label of the minimum, while ``ndarray.argmin`` returns\n the position. To get the position, use ``series.values.argmin()``.\n\n Examples\n --------\n >>> s = pd.Series(data=[1, None, 4, 1],\n ... 
index=['A', 'B', 'C', 'D'])\n >>> s\n A 1.0\n B NaN\n C 4.0\n D 1.0\n dtype: float64\n\n >>> s.idxmin()\n 'A'\n\n If `skipna` is False and there is an NA value in the data,\n the function returns ``nan``.\n\n >>> s.idxmin(skipna=False)\n nan\n \"\"\"\n skipna = nv.validate_argmin_with_skipna(skipna, args, kwargs)\n i = nanops.nanargmin(com.values_from_object(self), skipna=skipna)\n if i == -1:\n return np.nan\n return self.index[i]"} {"instruction": "Write a Python 3 script to\nreturn the index of the maximum value in the Series or a Series along a given axis.", "response": "def idxmax(self, axis=0, skipna=True, *args, **kwargs):\n \"\"\"\n Return the row label of the maximum value.\n\n If multiple values equal the maximum, the first row label with that\n value is returned.\n\n Parameters\n ----------\n skipna : bool, default True\n Exclude NA/null values. If the entire Series is NA, the result\n will be NA.\n axis : int, default 0\n For compatibility with DataFrame.idxmax. Redundant for application\n on Series.\n *args, **kwargs\n Additional keywords have no effect but might be accepted\n for compatibility with NumPy.\n\n Returns\n -------\n Index\n Label of the maximum value.\n\n Raises\n ------\n ValueError\n If the Series is empty.\n\n See Also\n --------\n numpy.argmax : Return indices of the maximum values\n along the given axis.\n DataFrame.idxmax : Return index of first occurrence of maximum\n over requested axis.\n Series.idxmin : Return index *label* of the first occurrence\n of minimum of values.\n\n Notes\n -----\n This method is the Series version of ``ndarray.argmax``. This method\n returns the label of the maximum, while ``ndarray.argmax`` returns\n the position. To get the position, use ``series.values.argmax()``.\n\n Examples\n --------\n >>> s = pd.Series(data=[1, None, 4, 3, 4],\n ... 
index=['A', 'B', 'C', 'D', 'E'])\n >>> s\n A 1.0\n B NaN\n C 4.0\n D 3.0\n E 4.0\n dtype: float64\n\n >>> s.idxmax()\n 'C'\n\n If `skipna` is False and there is an NA value in the data,\n the function returns ``nan``.\n\n >>> s.idxmax(skipna=False)\n nan\n \"\"\"\n skipna = nv.validate_argmax_with_skipna(skipna, args, kwargs)\n i = nanops.nanargmax(com.values_from_object(self), skipna=skipna)\n if i == -1:\n return np.nan\n return self.index[i]"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef round(self, decimals=0, *args, **kwargs):\n nv.validate_round(args, kwargs)\n result = com.values_from_object(self).round(decimals)\n result = self._constructor(result, index=self.index).__finalize__(self)\n\n return result", "response": "Round each value in a Series to the given number of decimals."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef quantile(self, q=0.5, interpolation='linear'):\n\n self._check_percentile(q)\n\n # We dispatch to DataFrame so that core.internals only has to worry\n # about 2D cases.\n df = self.to_frame()\n\n result = df.quantile(q=q, interpolation=interpolation,\n numeric_only=False)\n if result.ndim == 2:\n result = result.iloc[:, 0]\n\n if is_list_like(q):\n result.name = self.name\n return self._constructor(result,\n index=Float64Index(q),\n name=self.name)\n else:\n # scalar\n return result.iloc[0]", "response": "Return the value at the given quantile."} {"instruction": "Create a Python 3 function for\ncomputing the correlation between two Series objects.", "response": "def corr(self, other, method='pearson', min_periods=None):\n \"\"\"\n Compute correlation with `other` Series, excluding missing values.\n\n Parameters\n ----------\n other : Series\n Series with which to compute the correlation.\n method : {'pearson', 'kendall', 'spearman'} or callable\n * pearson : standard correlation coefficient\n * kendall : Kendall Tau correlation coefficient\n * spearman : 
Spearman rank correlation\n * callable: callable with input two 1d ndarrays\n and returning a float. Note that the returned matrix from corr\n will have 1 along the diagonals and will be symmetric\n regardless of the callable's behavior\n .. versionadded:: 0.24.0\n\n min_periods : int, optional\n Minimum number of observations needed to have a valid result.\n\n Returns\n -------\n float\n Correlation with other.\n\n Examples\n --------\n >>> def histogram_intersection(a, b):\n ... v = np.minimum(a, b).sum().round(decimals=1)\n ... return v\n >>> s1 = pd.Series([.2, .0, .6, .2])\n >>> s2 = pd.Series([.3, .6, .0, .1])\n >>> s1.corr(s2, method=histogram_intersection)\n 0.3\n \"\"\"\n this, other = self.align(other, join='inner', copy=False)\n if len(this) == 0:\n return np.nan\n\n if method in ['pearson', 'spearman', 'kendall'] or callable(method):\n return nanops.nancorr(this.values, other.values, method=method,\n min_periods=min_periods)\n\n raise ValueError(\"method must be either 'pearson', \"\n \"'spearman', 'kendall', or a callable, \"\n \"'{method}' was supplied\".format(method=method))"} {"instruction": "Explain what the following Python 3 code does\ndef cov(self, other, min_periods=None):\n this, other = self.align(other, join='inner', copy=False)\n if len(this) == 0:\n return np.nan\n return nanops.nancov(this.values, other.values,\n min_periods=min_periods)", "response": "Compute the covariance of two Series objects."} {"instruction": "How would you code a function in Python 3 to\nreturn the first discrete difference of an object.", "response": "def diff(self, periods=1):\n \"\"\"\n First discrete difference of element.\n\n Calculates the difference of a Series element compared with another\n element in the Series (default is element in previous row).\n\n Parameters\n ----------\n periods : int, default 1\n Periods to shift for calculating difference, accepts negative\n values.\n\n Returns\n -------\n Series\n First differences of the Series.\n\n See Also\n 
--------\n Series.pct_change: Percent change over given number of periods.\n Series.shift: Shift index by desired number of periods with an\n optional time freq.\n DataFrame.diff: First discrete difference of object.\n\n Examples\n --------\n Difference with previous row\n\n >>> s = pd.Series([1, 1, 2, 3, 5, 8])\n >>> s.diff()\n 0 NaN\n 1 0.0\n 2 1.0\n 3 1.0\n 4 2.0\n 5 3.0\n dtype: float64\n\n Difference with 3rd previous row\n\n >>> s.diff(periods=3)\n 0 NaN\n 1 NaN\n 2 NaN\n 3 2.0\n 4 4.0\n 5 6.0\n dtype: float64\n\n Difference with following row\n\n >>> s.diff(periods=-1)\n 0 0.0\n 1 -1.0\n 2 -1.0\n 3 -2.0\n 4 -3.0\n 5 NaN\n dtype: float64\n \"\"\"\n result = algorithms.diff(com.values_from_object(self), periods)\n return self._constructor(result, index=self.index).__finalize__(self)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef dot(self, other):\n from pandas.core.frame import DataFrame\n if isinstance(other, (Series, DataFrame)):\n common = self.index.union(other.index)\n if (len(common) > len(self.index) or\n len(common) > len(other.index)):\n raise ValueError('matrices are not aligned')\n\n left = self.reindex(index=common, copy=False)\n right = other.reindex(index=common, copy=False)\n lvals = left.values\n rvals = right.values\n else:\n lvals = self.values\n rvals = np.asarray(other)\n if lvals.shape[0] != rvals.shape[0]:\n raise Exception('Dot product shape mismatch, %s vs %s' %\n (lvals.shape, rvals.shape))\n\n if isinstance(other, DataFrame):\n return self._constructor(np.dot(lvals, rvals),\n index=other.columns).__finalize__(self)\n elif isinstance(other, Series):\n return np.dot(lvals, rvals)\n elif isinstance(rvals, np.ndarray):\n return np.dot(lvals, rvals)\n else: # pragma: no cover\n raise TypeError('unsupported type: %s' % type(other))", "response": "Compute the dot product of two Series or array - like objects."} {"instruction": "How would you implement a function in Python 3 that\nappends a Series or list 
of Series to the end of the current Series.", "response": "def append(self, to_append, ignore_index=False, verify_integrity=False):\n \"\"\"\n Concatenate two or more Series.\n\n Parameters\n ----------\n to_append : Series or list/tuple of Series\n Series to append with self.\n ignore_index : bool, default False\n If True, do not use the index labels.\n\n .. versionadded:: 0.19.0\n\n verify_integrity : bool, default False\n If True, raise Exception on creating index with duplicates.\n\n Returns\n -------\n Series\n Concatenated Series.\n\n See Also\n --------\n concat : General function to concatenate DataFrame, Series\n or Panel objects.\n\n Notes\n -----\n Iteratively appending to a Series can be more computationally intensive\n than a single concatenate. A better solution is to append values to a\n list and then concatenate the list with the original Series all at\n once.\n\n Examples\n --------\n >>> s1 = pd.Series([1, 2, 3])\n >>> s2 = pd.Series([4, 5, 6])\n >>> s3 = pd.Series([4, 5, 6], index=[3, 4, 5])\n >>> s1.append(s2)\n 0 1\n 1 2\n 2 3\n 0 4\n 1 5\n 2 6\n dtype: int64\n\n >>> s1.append(s3)\n 0 1\n 1 2\n 2 3\n 3 4\n 4 5\n 5 6\n dtype: int64\n\n With `ignore_index` set to True:\n\n >>> s1.append(s2, ignore_index=True)\n 0 1\n 1 2\n 2 3\n 3 4\n 4 5\n 5 6\n dtype: int64\n\n With `verify_integrity` set to True:\n\n >>> s1.append(s2, verify_integrity=True)\n Traceback (most recent call last):\n ...\n ValueError: Indexes have overlapping values: [0, 1, 2]\n \"\"\"\n from pandas.core.reshape.concat import concat\n\n if isinstance(to_append, (list, tuple)):\n to_concat = [self] + to_append\n else:\n to_concat = [self, to_append]\n return concat(to_concat, ignore_index=ignore_index,\n verify_integrity=verify_integrity)"} {"instruction": "Explain what the following Python 3 code does\ndef _binop(self, other, func, level=None, fill_value=None):\n\n if not isinstance(other, Series):\n raise AssertionError('Other operand must be Series')\n\n new_index = self.index\n 
this = self\n\n if not self.index.equals(other.index):\n this, other = self.align(other, level=level, join='outer',\n copy=False)\n new_index = this.index\n\n this_vals, other_vals = ops.fill_binop(this.values, other.values,\n fill_value)\n\n with np.errstate(all='ignore'):\n result = func(this_vals, other_vals)\n\n name = ops.get_op_result_name(self, other)\n if func.__name__ in ['divmod', 'rdivmod']:\n ret = ops._construct_divmod_result(self, result, new_index, name)\n else:\n ret = ops._construct_result(self, result, new_index, name)\n return ret", "response": "Perform generic binary operation on two Series objects."} {"instruction": "Create a Python 3 function to\ncombine the Series with another Series or scalar according to func.", "response": "def combine(self, other, func, fill_value=None):\n \"\"\"\n Combine the Series with a Series or scalar according to `func`.\n\n Combine the Series and `other` using `func` to perform elementwise\n selection for combined Series.\n `fill_value` is assumed when value is missing at some index\n from one of the two objects being combined.\n\n Parameters\n ----------\n other : Series or scalar\n The value(s) to be combined with the `Series`.\n func : function\n Function that takes two scalars as inputs and returns an element.\n fill_value : scalar, optional\n The value to assume when an index is missing from\n one Series or the other. 
The default specifies to use the\n appropriate NaN value for the underlying dtype of the Series.\n\n Returns\n -------\n Series\n The result of combining the Series with the other object.\n\n See Also\n --------\n Series.combine_first : Combine Series values, choosing the calling\n Series' values first.\n\n Examples\n --------\n Consider 2 Datasets ``s1`` and ``s2`` containing\n highest clocked speeds of different birds.\n\n >>> s1 = pd.Series({'falcon': 330.0, 'eagle': 160.0})\n >>> s1\n falcon 330.0\n eagle 160.0\n dtype: float64\n >>> s2 = pd.Series({'falcon': 345.0, 'eagle': 200.0, 'duck': 30.0})\n >>> s2\n falcon 345.0\n eagle 200.0\n duck 30.0\n dtype: float64\n\n Now, to combine the two datasets and view the highest speeds\n of the birds across the two datasets\n\n >>> s1.combine(s2, max)\n duck NaN\n eagle 200.0\n falcon 345.0\n dtype: float64\n\n In the previous example, the resulting value for duck is missing,\n because the maximum of a NaN and a float is a NaN.\n So, in the example, we set ``fill_value=0``,\n so the maximum value returned will be the value from some dataset.\n\n >>> s1.combine(s2, max, fill_value=0)\n duck 30.0\n eagle 200.0\n falcon 345.0\n dtype: float64\n \"\"\"\n if fill_value is None:\n fill_value = na_value_for_dtype(self.dtype, compat=False)\n\n if isinstance(other, Series):\n # If other is a Series, result is based on union of Series,\n # so do this element by element\n new_index = self.index.union(other.index)\n new_name = ops.get_op_result_name(self, other)\n new_values = []\n for idx in new_index:\n lv = self.get(idx, fill_value)\n rv = other.get(idx, fill_value)\n with np.errstate(all='ignore'):\n new_values.append(func(lv, rv))\n else:\n # Assume that other is a scalar, so apply the function for\n # each element in the Series\n new_index = self.index\n with np.errstate(all='ignore'):\n new_values = [func(lv, other) for lv in self._values]\n new_name = self.name\n\n if is_categorical_dtype(self.values):\n pass\n elif 
is_extension_array_dtype(self.values):\n # The function can return something of any type, so check\n # if the type is compatible with the calling EA.\n try:\n new_values = self._values._from_sequence(new_values)\n except Exception:\n # https://github.com/pandas-dev/pandas/issues/22850\n # pandas has no control over what 3rd-party ExtensionArrays\n # do in _values_from_sequence. We still want ops to work\n # though, so we catch any regular Exception.\n pass\n\n return self._constructor(new_values, index=new_index, name=new_name)"} {"instruction": "Create a Python 3 function for\ncombining the values of two Series objects choosing the calling Series s values first.", "response": "def combine_first(self, other):\n \"\"\"\n Combine Series values, choosing the calling Series's values first.\n\n Parameters\n ----------\n other : Series\n The value(s) to be combined with the `Series`.\n\n Returns\n -------\n Series\n The result of combining the Series with the other object.\n\n See Also\n --------\n Series.combine : Perform elementwise operation on two Series\n using a given function.\n\n Notes\n -----\n Result index will be the union of the two indexes.\n\n Examples\n --------\n >>> s1 = pd.Series([1, np.nan])\n >>> s2 = pd.Series([3, 4])\n >>> s1.combine_first(s2)\n 0 1.0\n 1 4.0\n dtype: float64\n \"\"\"\n new_index = self.index.union(other.index)\n this = self.reindex(new_index, copy=False)\n other = other.reindex(new_index, copy=False)\n if is_datetimelike(this) and not is_datetimelike(other):\n other = to_datetime(other)\n\n return this.where(notna(this), other)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef update(self, other):\n other = other.reindex_like(self)\n mask = notna(other)\n\n self._data = self._data.putmask(mask=mask, new=other, inplace=True)\n self._maybe_update_cacher()", "response": "Modify Series in place using non - NA values from passed Series."} {"instruction": "Here you have a function in Python 3, explain 
what it does\ndef sort_values(self, axis=0, ascending=True, inplace=False,\n kind='quicksort', na_position='last'):\n \"\"\"\n Sort by the values.\n\n Sort a Series in ascending or descending order by some\n criterion.\n\n Parameters\n ----------\n axis : {0 or 'index'}, default 0\n Axis to direct sorting. The value 'index' is accepted for\n compatibility with DataFrame.sort_values.\n ascending : bool, default True\n If True, sort values in ascending order, otherwise descending.\n inplace : bool, default False\n If True, perform operation in-place.\n kind : {'quicksort', 'mergesort' or 'heapsort'}, default 'quicksort'\n Choice of sorting algorithm. See also :func:`numpy.sort` for more\n information. 'mergesort' is the only stable algorithm.\n na_position : {'first' or 'last'}, default 'last'\n Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at\n the end.\n\n Returns\n -------\n Series\n Series ordered by values.\n\n See Also\n --------\n Series.sort_index : Sort by the Series indices.\n DataFrame.sort_values : Sort DataFrame by the values along either axis.\n DataFrame.sort_index : Sort DataFrame by indices.\n\n Examples\n --------\n >>> s = pd.Series([np.nan, 1, 3, 10, 5])\n >>> s\n 0 NaN\n 1 1.0\n 2 3.0\n 3 10.0\n 4 5.0\n dtype: float64\n\n Sort values ascending order (default behaviour)\n\n >>> s.sort_values(ascending=True)\n 1 1.0\n 2 3.0\n 4 5.0\n 3 10.0\n 0 NaN\n dtype: float64\n\n Sort values descending order\n\n >>> s.sort_values(ascending=False)\n 3 10.0\n 4 5.0\n 2 3.0\n 1 1.0\n 0 NaN\n dtype: float64\n\n Sort values inplace\n\n >>> s.sort_values(ascending=False, inplace=True)\n >>> s\n 3 10.0\n 4 5.0\n 2 3.0\n 1 1.0\n 0 NaN\n dtype: float64\n\n Sort values putting NAs first\n\n >>> s.sort_values(na_position='first')\n 0 NaN\n 1 1.0\n 2 3.0\n 4 5.0\n 3 10.0\n dtype: float64\n\n Sort a series of strings\n\n >>> s = pd.Series(['z', 'b', 'd', 'a', 'c'])\n >>> s\n 0 z\n 1 b\n 2 d\n 3 a\n 4 c\n dtype: object\n\n >>> s.sort_values()\n 3 a\n 1 b\n 
4 c\n 2 d\n 0 z\n dtype: object\n \"\"\"\n inplace = validate_bool_kwarg(inplace, 'inplace')\n # Validate the axis parameter\n self._get_axis_number(axis)\n\n # GH 5856/5853\n if inplace and self._is_cached:\n raise ValueError(\"This Series is a view of some other array, to \"\n \"sort in-place you must create a copy\")\n\n def _try_kind_sort(arr):\n # easier to ask forgiveness than permission\n try:\n # if kind==mergesort, it can fail for object dtype\n return arr.argsort(kind=kind)\n except TypeError:\n # stable sort not available for object dtype\n # uses the argsort default quicksort\n return arr.argsort(kind='quicksort')\n\n arr = self._values\n sortedIdx = np.empty(len(self), dtype=np.int32)\n\n bad = isna(arr)\n\n good = ~bad\n idx = ibase.default_index(len(self))\n\n argsorted = _try_kind_sort(arr[good])\n\n if is_list_like(ascending):\n if len(ascending) != 1:\n raise ValueError('Length of ascending (%d) must be 1 '\n 'for Series' % (len(ascending)))\n ascending = ascending[0]\n\n if not is_bool(ascending):\n raise ValueError('ascending must be boolean')\n\n if not ascending:\n argsorted = argsorted[::-1]\n\n if na_position == 'last':\n n = good.sum()\n sortedIdx[:n] = idx[good][argsorted]\n sortedIdx[n:] = idx[bad]\n elif na_position == 'first':\n n = bad.sum()\n sortedIdx[n:] = idx[good][argsorted]\n sortedIdx[:n] = idx[bad]\n else:\n raise ValueError('invalid na_position: {!r}'.format(na_position))\n\n result = self._constructor(arr[sortedIdx], index=self.index[sortedIdx])\n\n if inplace:\n self._update_inplace(result)\n else:\n return result.__finalize__(self)", "response": "Sort a Series or Series by values along a specific axis."} {"instruction": "Given the following Python 3 function, write the documentation\ndef sort_index(self, axis=0, level=None, ascending=True, inplace=False,\n kind='quicksort', na_position='last', sort_remaining=True):\n \"\"\"\n Sort Series by index labels.\n\n Returns a new Series sorted by label if `inplace` argument is\n 
``False``, otherwise updates the original series and returns None.\n\n Parameters\n ----------\n axis : int, default 0\n Axis to direct sorting. This can only be 0 for Series.\n level : int, optional\n If not None, sort on values in specified index level(s).\n ascending : bool, default true\n Sort ascending vs. descending.\n inplace : bool, default False\n If True, perform operation in-place.\n kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'\n Choice of sorting algorithm. See also :func:`numpy.sort` for more\n information. 'mergesort' is the only stable algorithm. For\n DataFrames, this option is only applied when sorting on a single\n column or label.\n na_position : {'first', 'last'}, default 'last'\n If 'first' puts NaNs at the beginning, 'last' puts NaNs at the end.\n Not implemented for MultiIndex.\n sort_remaining : bool, default True\n If True and sorting by level and index is multilevel, sort by other\n levels too (in order) after sorting by specified level.\n\n Returns\n -------\n Series\n The original Series sorted by the labels.\n\n See Also\n --------\n DataFrame.sort_index: Sort DataFrame by the index.\n DataFrame.sort_values: Sort DataFrame by the value.\n Series.sort_values : Sort Series by the value.\n\n Examples\n --------\n >>> s = pd.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, 4])\n >>> s.sort_index()\n 1 c\n 2 b\n 3 a\n 4 d\n dtype: object\n\n Sort Descending\n\n >>> s.sort_index(ascending=False)\n 4 d\n 3 a\n 2 b\n 1 c\n dtype: object\n\n Sort Inplace\n\n >>> s.sort_index(inplace=True)\n >>> s\n 1 c\n 2 b\n 3 a\n 4 d\n dtype: object\n\n By default NaNs are put at the end, but use `na_position` to place\n them at the beginning\n\n >>> s = pd.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, np.nan])\n >>> s.sort_index(na_position='first')\n NaN d\n 1.0 c\n 2.0 b\n 3.0 a\n dtype: object\n\n Specify index level to sort\n\n >>> arrays = [np.array(['qux', 'qux', 'foo', 'foo',\n ... 'baz', 'baz', 'bar', 'bar']),\n ... 
np.array(['two', 'one', 'two', 'one',\n ... 'two', 'one', 'two', 'one'])]\n >>> s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8], index=arrays)\n >>> s.sort_index(level=1)\n bar one 8\n baz one 6\n foo one 4\n qux one 2\n bar two 7\n baz two 5\n foo two 3\n qux two 1\n dtype: int64\n\n Does not sort by remaining levels when sorting by levels\n\n >>> s.sort_index(level=1, sort_remaining=False)\n qux one 2\n foo one 4\n baz one 6\n bar one 8\n qux two 1\n foo two 3\n baz two 5\n bar two 7\n dtype: int64\n \"\"\"\n # TODO: this can be combined with DataFrame.sort_index impl as\n # almost identical\n inplace = validate_bool_kwarg(inplace, 'inplace')\n # Validate the axis parameter\n self._get_axis_number(axis)\n index = self.index\n\n if level is not None:\n new_index, indexer = index.sortlevel(level, ascending=ascending,\n sort_remaining=sort_remaining)\n elif isinstance(index, MultiIndex):\n from pandas.core.sorting import lexsort_indexer\n labels = index._sort_levels_monotonic()\n indexer = lexsort_indexer(labels._get_codes_for_sorting(),\n orders=ascending,\n na_position=na_position)\n else:\n from pandas.core.sorting import nargsort\n\n # Check monotonic-ness before sort an index\n # GH11080\n if ((ascending and index.is_monotonic_increasing) or\n (not ascending and index.is_monotonic_decreasing)):\n if inplace:\n return\n else:\n return self.copy()\n\n indexer = nargsort(index, kind=kind, ascending=ascending,\n na_position=na_position)\n\n indexer = ensure_platform_int(indexer)\n new_index = index.take(indexer)\n new_index = new_index._sort_levels_monotonic()\n\n new_values = self._values.take(indexer)\n result = self._constructor(new_values, index=new_index)\n\n if inplace:\n self._update_inplace(result)\n else:\n return result.__finalize__(self)", "response": "Sort the Series by index."} {"instruction": "Write a Python 3 script for\noverriding ndarray.argsort. Argsorts the value, omitting NA/null values, and places the result in the same locations as the non-NA values. 
Parameters ---------- axis : int Has no effect but is accepted for compatibility with numpy. kind : {'mergesort', 'quicksort', 'heapsort'}, default 'quicksort' Choice of sorting algorithm. See np.sort for more information. 'mergesort' is the only stable algorithm order : None Has no effect but is accepted for compatibility with numpy. Returns ------- Series Positions of values within the sort order with -1 indicating nan values. See Also -------- numpy.ndarray.argsort", "response": "def argsort(self, axis=0, kind='quicksort', order=None):\n \"\"\"\n Override ndarray.argsort. Argsorts the value, omitting NA/null values,\n and places the result in the same locations as the non-NA values.\n\n Parameters\n ----------\n axis : int\n Has no effect but is accepted for compatibility with numpy.\n kind : {'mergesort', 'quicksort', 'heapsort'}, default 'quicksort'\n Choice of sorting algorithm. See np.sort for more\n information. 'mergesort' is the only stable algorithm\n order : None\n Has no effect but is accepted for compatibility with numpy.\n\n Returns\n -------\n Series\n Positions of values within the sort order with -1 indicating\n nan values.\n\n See Also\n --------\n numpy.ndarray.argsort\n \"\"\"\n values = self._values\n mask = isna(values)\n\n if mask.any():\n result = Series(-1, index=self.index, name=self.name,\n dtype='int64')\n notmask = ~mask\n result[notmask] = np.argsort(values[notmask], kind=kind)\n return self._constructor(result,\n index=self.index).__finalize__(self)\n else:\n return self._constructor(\n np.argsort(values, kind=kind), index=self.index,\n dtype='int64').__finalize__(self)"} {"instruction": "How would you explain what the following Python 3 function does\ndef nlargest(self, n=5, keep='first'):\n return algorithms.SelectNSeries(self, n=n, keep=keep).nlargest()", "response": "Return the n largest values in a Series."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef nsmallest(self, n=5, keep='first'):\n 
return algorithms.SelectNSeries(self, n=n, keep=keep).nsmallest()", "response": "Return the n smallest values in a Series."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef swaplevel(self, i=-2, j=-1, copy=True):\n new_index = self.index.swaplevel(i, j)\n return self._constructor(self._values, index=new_index,\n copy=copy).__finalize__(self)", "response": "Returns a new Series with the levels swapped in the MultiIndex."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef reorder_levels(self, order):\n if not isinstance(self.index, MultiIndex): # pragma: no cover\n raise Exception('Can only reorder levels on a hierarchical axis.')\n\n result = self.copy()\n result.index = result.index.reorder_levels(order)\n return result", "response": "Rearrange index levels using input order."} {"instruction": "Implement a Python 3 function for\nmapping values of a dictionary or Series to a unique value.", "response": "def map(self, arg, na_action=None):\n \"\"\"\n Map values of Series according to input correspondence.\n\n Used for substituting each value in a Series with another value,\n that may be derived from a function, a ``dict`` or\n a :class:`Series`.\n\n Parameters\n ----------\n arg : function, dict, or Series\n Mapping correspondence.\n na_action : {None, 'ignore'}, default None\n If 'ignore', propagate NaN values, without passing them to the\n mapping correspondence.\n\n Returns\n -------\n Series\n Same index as caller.\n\n See Also\n --------\n Series.apply : For applying more complex functions on a Series.\n DataFrame.apply : Apply a function row-/column-wise.\n DataFrame.applymap : Apply a function elementwise on a whole DataFrame.\n\n Notes\n -----\n When ``arg`` is a dictionary, values in Series that are not in the\n dictionary (as keys) are converted to ``NaN``. 
However, if the\n dictionary is a ``dict`` subclass that defines ``__missing__`` (i.e.\n provides a method for default values), then this default is used\n rather than ``NaN``.\n\n Examples\n --------\n >>> s = pd.Series(['cat', 'dog', np.nan, 'rabbit'])\n >>> s\n 0 cat\n 1 dog\n 2 NaN\n 3 rabbit\n dtype: object\n\n ``map`` accepts a ``dict`` or a ``Series``. Values that are not found\n in the ``dict`` are converted to ``NaN``, unless the dict has a default\n value (e.g. ``defaultdict``):\n\n >>> s.map({'cat': 'kitten', 'dog': 'puppy'})\n 0 kitten\n 1 puppy\n 2 NaN\n 3 NaN\n dtype: object\n\n It also accepts a function:\n\n >>> s.map('I am a {}'.format)\n 0 I am a cat\n 1 I am a dog\n 2 I am a nan\n 3 I am a rabbit\n dtype: object\n\n To avoid applying the function to missing values (and keep them as\n ``NaN``) ``na_action='ignore'`` can be used:\n\n >>> s.map('I am a {}'.format, na_action='ignore')\n 0 I am a cat\n 1 I am a dog\n 2 NaN\n 3 I am a rabbit\n dtype: object\n \"\"\"\n new_values = super()._map_values(\n arg, na_action=na_action)\n return self._constructor(new_values,\n index=self.index).__finalize__(self)"} {"instruction": "Implement a Python 3 function for\napplying a function to the values of the Series and returns a DataFrame.", "response": "def apply(self, func, convert_dtype=True, args=(), **kwds):\n \"\"\"\n Invoke function on values of Series.\n\n Can be ufunc (a NumPy function that applies to the entire Series)\n or a Python function that only works on single values.\n\n Parameters\n ----------\n func : function\n Python function or NumPy ufunc to apply.\n convert_dtype : bool, default True\n Try to find better dtype for elementwise function results. 
If\n False, leave as dtype=object.\n args : tuple\n Positional arguments passed to func after the series value.\n **kwds\n Additional keyword arguments passed to func.\n\n Returns\n -------\n Series or DataFrame\n If func returns a Series object the result will be a DataFrame.\n\n See Also\n --------\n Series.map: For element-wise operations.\n Series.agg: Only perform aggregating type operations.\n Series.transform: Only perform transforming type operations.\n\n Examples\n --------\n Create a series with typical summer temperatures for each city.\n\n >>> s = pd.Series([20, 21, 12],\n ... index=['London', 'New York', 'Helsinki'])\n >>> s\n London 20\n New York 21\n Helsinki 12\n dtype: int64\n\n Square the values by defining a function and passing it as an\n argument to ``apply()``.\n\n >>> def square(x):\n ... return x ** 2\n >>> s.apply(square)\n London 400\n New York 441\n Helsinki 144\n dtype: int64\n\n Square the values by passing an anonymous function as an\n argument to ``apply()``.\n\n >>> s.apply(lambda x: x ** 2)\n London 400\n New York 441\n Helsinki 144\n dtype: int64\n\n Define a custom function that needs additional positional\n arguments and pass these additional arguments using the\n ``args`` keyword.\n\n >>> def subtract_custom_value(x, custom_value):\n ... return x - custom_value\n\n >>> s.apply(subtract_custom_value, args=(5,))\n London 15\n New York 16\n Helsinki 7\n dtype: int64\n\n Define a custom function that takes keyword arguments\n and pass these arguments to ``apply``.\n\n >>> def add_custom_values(x, **kwargs):\n ... for month in kwargs:\n ... x += kwargs[month]\n ... 
return x\n\n >>> s.apply(add_custom_values, june=30, july=20, august=25)\n London 95\n New York 96\n Helsinki 87\n dtype: int64\n\n Use a function from the Numpy library.\n\n >>> s.apply(np.log)\n London 2.995732\n New York 3.044522\n Helsinki 2.484907\n dtype: float64\n \"\"\"\n if len(self) == 0:\n return self._constructor(dtype=self.dtype,\n index=self.index).__finalize__(self)\n\n # dispatch to agg\n if isinstance(func, (list, dict)):\n return self.aggregate(func, *args, **kwds)\n\n # if we are a string, try to dispatch\n if isinstance(func, str):\n return self._try_aggregate_string_function(func, *args, **kwds)\n\n # handle ufuncs and lambdas\n if kwds or args and not isinstance(func, np.ufunc):\n def f(x):\n return func(x, *args, **kwds)\n else:\n f = func\n\n with np.errstate(all='ignore'):\n if isinstance(f, np.ufunc):\n return f(self)\n\n # row-wise access\n if is_extension_type(self.dtype):\n mapped = self._values.map(f)\n else:\n values = self.astype(object).values\n mapped = lib.map_infer(values, f, convert=convert_dtype)\n\n if len(mapped) and isinstance(mapped[0], Series):\n from pandas.core.frame import DataFrame\n return DataFrame(mapped.tolist(), index=self.index)\n else:\n return self._constructor(mapped,\n index=self.index).__finalize__(self)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,\n filter_type=None, **kwds):\n \"\"\"\n Perform a reduction operation.\n\n If we have an ndarray as a value, then simply perform the operation,\n otherwise delegate to the object.\n \"\"\"\n delegate = self._values\n\n if axis is not None:\n self._get_axis_number(axis)\n\n if isinstance(delegate, Categorical):\n # TODO deprecate numeric_only argument for Categorical and use\n # skipna as well, see GH25303\n return delegate._reduce(name, numeric_only=numeric_only, **kwds)\n elif isinstance(delegate, ExtensionArray):\n # dispatch to ExtensionArray 
interface\n return delegate._reduce(name, skipna=skipna, **kwds)\n elif is_datetime64_dtype(delegate):\n # use DatetimeIndex implementation to handle skipna correctly\n delegate = DatetimeIndex(delegate)\n\n # dispatch to numpy arrays\n elif isinstance(delegate, np.ndarray):\n if numeric_only:\n raise NotImplementedError('Series.{0} does not implement '\n 'numeric_only.'.format(name))\n with np.errstate(all='ignore'):\n return op(delegate, skipna=skipna, **kwds)\n\n # TODO(EA) dispatch to Index\n # remove once all internals extension types are\n # moved to ExtensionArrays\n return delegate._reduce(op=op, name=name, axis=axis, skipna=skipna,\n numeric_only=numeric_only,\n filter_type=filter_type, **kwds)", "response": "Perform a reduction operation on the object."} {"instruction": "Can you write a function in Python 3 where it\nrenames the name of the Series or MultiIndex.", "response": "def rename(self, index=None, **kwargs):\n \"\"\"\n Alter Series index labels or name.\n\n Function / dict values must be unique (1-to-1). Labels not contained in\n a dict / Series will be left as-is. Extra labels listed don't throw an\n error.\n\n Alternatively, change ``Series.name`` with a scalar value.\n\n See the :ref:`user guide ` for more.\n\n Parameters\n ----------\n index : scalar, hashable sequence, dict-like or function, optional\n dict-like or functions are transformations to apply to\n the index.\n Scalar or hashable sequence-like will alter the ``Series.name``\n attribute.\n copy : bool, default True\n Whether to copy underlying data.\n inplace : bool, default False\n Whether to return a new Series. 
If True then value of copy is\n ignored.\n level : int or level name, default None\n In case of a MultiIndex, only rename labels in the specified\n level.\n\n Returns\n -------\n Series\n Series with index labels or name altered.\n\n See Also\n --------\n Series.rename_axis : Set the name of the axis.\n\n Examples\n --------\n >>> s = pd.Series([1, 2, 3])\n >>> s\n 0 1\n 1 2\n 2 3\n dtype: int64\n >>> s.rename(\"my_name\") # scalar, changes Series.name\n 0 1\n 1 2\n 2 3\n Name: my_name, dtype: int64\n >>> s.rename(lambda x: x ** 2) # function, changes labels\n 0 1\n 1 2\n 4 3\n dtype: int64\n >>> s.rename({1: 3, 2: 5}) # mapping, changes labels\n 0 1\n 3 2\n 5 3\n dtype: int64\n \"\"\"\n kwargs['inplace'] = validate_bool_kwarg(kwargs.get('inplace', False),\n 'inplace')\n\n non_mapping = is_scalar(index) or (is_list_like(index) and\n not is_dict_like(index))\n if non_mapping:\n return self._set_name(index, inplace=kwargs.get('inplace'))\n return super().rename(index=index, **kwargs)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef reindex_axis(self, labels, axis=0, **kwargs):\n # for compatibility with higher dims\n if axis != 0:\n raise ValueError(\"cannot reindex series on non-zero axis!\")\n msg = (\"'.reindex_axis' is deprecated and will be removed in a future \"\n \"version. 
Use '.reindex' instead.\")\n warnings.warn(msg, FutureWarning, stacklevel=2)\n\n return self.reindex(index=labels, **kwargs)", "response": "Conform Series to new index with optional filling logic."} {"instruction": "Can you implement a function in Python 3 that\nreturns the memory usage of the Series.", "response": "def memory_usage(self, index=True, deep=False):\n \"\"\"\n Return the memory usage of the Series.\n\n The memory usage can optionally include the contribution of\n the index and of elements of `object` dtype.\n\n Parameters\n ----------\n index : bool, default True\n Specifies whether to include the memory usage of the Series index.\n deep : bool, default False\n If True, introspect the data deeply by interrogating\n `object` dtypes for system-level memory consumption, and include\n it in the returned value.\n\n Returns\n -------\n int\n Bytes of memory consumed.\n\n See Also\n --------\n numpy.ndarray.nbytes : Total bytes consumed by the elements of the\n array.\n DataFrame.memory_usage : Bytes consumed by a DataFrame.\n\n Examples\n --------\n >>> s = pd.Series(range(3))\n >>> s.memory_usage()\n 104\n\n Not including the index gives the size of the rest of the data, which\n is necessarily smaller:\n\n >>> s.memory_usage(index=False)\n 24\n\n The memory footprint of `object` values is ignored by default:\n\n >>> s = pd.Series([\"a\", \"b\"])\n >>> s.values\n array(['a', 'b'], dtype=object)\n >>> s.memory_usage()\n 96\n >>> s.memory_usage(deep=True)\n 212\n \"\"\"\n v = super().memory_usage(deep=deep)\n if index:\n v += self.index.memory_usage(deep=deep)\n return v"} {"instruction": "Create a Python 3 function to\nreturn a Series showing whether each element in values is contained in self.", "response": "def isin(self, values):\n \"\"\"\n Check whether `values` are contained in Series.\n\n Return a boolean Series showing whether each element in the Series\n matches an element in the passed sequence of `values` exactly.\n\n Parameters\n ----------\n 
values : set or list-like\n The sequence of values to test. Passing in a single string will\n raise a ``TypeError``. Instead, turn a single string into a\n list of one element.\n\n .. versionadded:: 0.18.1\n\n Support for values as a set.\n\n Returns\n -------\n Series\n Series of booleans indicating if each element is in values.\n\n Raises\n ------\n TypeError\n * If `values` is a string\n\n See Also\n --------\n DataFrame.isin : Equivalent method on DataFrame.\n\n Examples\n --------\n >>> s = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama',\n ... 'hippo'], name='animal')\n >>> s.isin(['cow', 'lama'])\n 0 True\n 1 True\n 2 True\n 3 False\n 4 True\n 5 False\n Name: animal, dtype: bool\n\n Passing a single string as ``s.isin('lama')`` will raise an error. Use\n a list of one element instead:\n\n >>> s.isin(['lama'])\n 0 True\n 1 False\n 2 True\n 3 False\n 4 True\n 5 False\n Name: animal, dtype: bool\n \"\"\"\n result = algorithms.isin(self, values)\n return self._constructor(result, index=self.index).__finalize__(self)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef between(self, left, right, inclusive=True):\n if inclusive:\n lmask = self >= left\n rmask = self <= right\n else:\n lmask = self > left\n rmask = self < right\n\n return lmask & rmask", "response": "This function returns a boolean vector containing True where the series element is between left and right."} {"instruction": "Write a Python 3 script to\nread a new entry from a CSV file.", "response": "def from_csv(cls, path, sep=',', parse_dates=True, header=None,\n index_col=0, encoding=None, infer_datetime_format=False):\n \"\"\"\n Read CSV file.\n\n .. 
deprecated:: 0.21.0\n Use :func:`pandas.read_csv` instead.\n\n It is preferable to use the more powerful :func:`pandas.read_csv`\n for most general purposes, but ``from_csv`` makes for an easy\n roundtrip to and from a file (the exact counterpart of\n ``to_csv``), especially with a time Series.\n\n This method only differs from :func:`pandas.read_csv` in some defaults:\n\n - `index_col` is ``0`` instead of ``None`` (take first column as index\n by default)\n - `header` is ``None`` instead of ``0`` (the first row is not used as\n the column names)\n - `parse_dates` is ``True`` instead of ``False`` (try parsing the index\n as datetime by default)\n\n With :func:`pandas.read_csv`, the option ``squeeze=True`` can be used\n to return a Series like ``from_csv``.\n\n Parameters\n ----------\n path : str, file path, or file handle / StringIO\n sep : str, default ','\n Field delimiter.\n parse_dates : bool, default True\n Parse dates. Different default from read_table.\n header : int, default None\n Row to use as header (skip prior rows).\n index_col : int or sequence, default 0\n Column to use for index. If a sequence is given, a MultiIndex\n is used. Different default from read_table.\n encoding : str, optional\n A string representing the encoding to use if the contents are\n non-ascii, for python versions prior to 3.\n infer_datetime_format : bool, default False\n If True and `parse_dates` is True for a column, try to infer the\n datetime format based on the first datetime string. 
If the format\n can be inferred, there often will be a large parsing speed-up.\n\n Returns\n -------\n Series\n\n See Also\n --------\n read_csv\n \"\"\"\n\n # We're calling `DataFrame.from_csv` in the implementation,\n # which will propagate a warning regarding `from_csv` deprecation.\n from pandas.core.frame import DataFrame\n df = DataFrame.from_csv(path, header=header, index_col=index_col,\n sep=sep, parse_dates=parse_dates,\n encoding=encoding,\n infer_datetime_format=infer_datetime_format)\n result = df.iloc[:, 0]\n if header is None:\n result.index.name = result.name = None\n\n return result"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef dropna(self, axis=0, inplace=False, **kwargs):\n inplace = validate_bool_kwarg(inplace, 'inplace')\n kwargs.pop('how', None)\n if kwargs:\n raise TypeError('dropna() got an unexpected keyword '\n 'argument \"{0}\"'.format(list(kwargs.keys())[0]))\n # Validate the axis parameter\n self._get_axis_number(axis or 0)\n\n if self._can_hold_na:\n result = remove_na_arraylike(self)\n if inplace:\n self._update_inplace(result)\n else:\n return result\n else:\n if inplace:\n # do nothing\n pass\n else:\n return self.copy()", "response": "Returns a new Series with missing values removed from the current Series."} {"instruction": "Implement a Python 3 function for\nreturning Series with null values removed.", "response": "def valid(self, inplace=False, **kwargs):\n \"\"\"\n Return Series without null values.\n\n .. deprecated:: 0.23.0\n Use :meth:`Series.dropna` instead.\n \"\"\"\n warnings.warn(\"Method .valid will be removed in a future version. 
\"\n \"Use .dropna instead.\", FutureWarning, stacklevel=2)\n return self.dropna(inplace=inplace, **kwargs)"} {"instruction": "Can you write a function in Python 3 where it\ncasts to DatetimeIndex of Timestamps at beginning of period.", "response": "def to_timestamp(self, freq=None, how='start', copy=True):\n \"\"\"\n Cast to DatetimeIndex of Timestamps, at *beginning* of period.\n\n Parameters\n ----------\n freq : str, default frequency of PeriodIndex\n Desired frequency.\n how : {'s', 'e', 'start', 'end'}\n Convention for converting period to timestamp; start of period\n vs. end.\n copy : bool, default True\n Whether or not to return a copy.\n\n Returns\n -------\n Series with DatetimeIndex\n \"\"\"\n new_values = self._values\n if copy:\n new_values = new_values.copy()\n\n new_index = self.index.to_timestamp(freq=freq, how=how)\n return self._constructor(new_values,\n index=new_index).__finalize__(self)"} {"instruction": "Explain what the following Python 3 code does\ndef to_period(self, freq=None, copy=True):\n new_values = self._values\n if copy:\n new_values = new_values.copy()\n\n new_index = self.index.to_period(freq=freq)\n return self._constructor(new_values,\n index=new_index).__finalize__(self)", "response": "Convert Series from DatetimeIndex to PeriodIndex with desired\n frequency."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef to_numeric(arg, errors='raise', downcast=None):\n if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'):\n raise ValueError('invalid downcasting method provided')\n\n is_series = False\n is_index = False\n is_scalars = False\n\n if isinstance(arg, ABCSeries):\n is_series = True\n values = arg.values\n elif isinstance(arg, ABCIndexClass):\n is_index = True\n values = arg.asi8\n if values is None:\n values = arg.values\n elif isinstance(arg, (list, tuple)):\n values = np.array(arg, dtype='O')\n elif is_scalar(arg):\n if is_decimal(arg):\n return float(arg)\n if 
is_number(arg):\n return arg\n is_scalars = True\n values = np.array([arg], dtype='O')\n elif getattr(arg, 'ndim', 1) > 1:\n raise TypeError('arg must be a list, tuple, 1-d array, or Series')\n else:\n values = arg\n\n try:\n if is_numeric_dtype(values):\n pass\n elif is_datetime_or_timedelta_dtype(values):\n values = values.astype(np.int64)\n else:\n values = ensure_object(values)\n coerce_numeric = errors not in ('ignore', 'raise')\n values = lib.maybe_convert_numeric(values, set(),\n coerce_numeric=coerce_numeric)\n\n except Exception:\n if errors == 'raise':\n raise\n\n # attempt downcast only if the data has been successfully converted\n # to a numerical dtype and if a downcast method has been specified\n if downcast is not None and is_numeric_dtype(values):\n typecodes = None\n\n if downcast in ('integer', 'signed'):\n typecodes = np.typecodes['Integer']\n elif downcast == 'unsigned' and np.min(values) >= 0:\n typecodes = np.typecodes['UnsignedInteger']\n elif downcast == 'float':\n typecodes = np.typecodes['Float']\n\n # pandas support goes only to np.float32,\n # as float dtypes smaller than that are\n # extremely rare and not well supported\n float_32_char = np.dtype(np.float32).char\n float_32_ind = typecodes.index(float_32_char)\n typecodes = typecodes[float_32_ind:]\n\n if typecodes is not None:\n # from smallest to largest\n for dtype in typecodes:\n if np.dtype(dtype).itemsize <= values.dtype.itemsize:\n values = maybe_downcast_to_dtype(values, dtype)\n\n # successful conversion\n if values.dtype == dtype:\n break\n\n if is_series:\n return pd.Series(values, index=arg.index, name=arg.name)\n elif is_index:\n # because we want to coerce to numeric if possible,\n # do not use _shallow_copy_with_infer\n return pd.Index(values, name=arg.name)\n elif is_scalars:\n return values[0]\n else:\n return values", "response": "Convert an argument to a numeric type."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _get_fill(arr: 
ABCSparseArray) -> np.ndarray:\n try:\n return np.asarray(arr.fill_value, dtype=arr.dtype.subtype)\n except ValueError:\n return np.asarray(arr.fill_value)", "response": "Create a 0 - dim ndarray containing the fill value of the node."} {"instruction": "Write a Python 3 script to\nperform a binary operation between two sparse arrays.", "response": "def _sparse_array_op(\n left: ABCSparseArray,\n right: ABCSparseArray,\n op: Callable,\n name: str\n) -> Any:\n \"\"\"\n Perform a binary operation between two arrays.\n\n Parameters\n ----------\n left : Union[SparseArray, ndarray]\n right : Union[SparseArray, ndarray]\n op : Callable\n The binary operation to perform\n name str\n Name of the callable.\n\n Returns\n -------\n SparseArray\n \"\"\"\n if name.startswith('__'):\n # For lookups in _libs.sparse we need non-dunder op name\n name = name[2:-2]\n\n # dtype used to find corresponding sparse method\n ltype = left.dtype.subtype\n rtype = right.dtype.subtype\n\n if not is_dtype_equal(ltype, rtype):\n subtype = find_common_type([ltype, rtype])\n ltype = SparseDtype(subtype, left.fill_value)\n rtype = SparseDtype(subtype, right.fill_value)\n\n # TODO(GH-23092): pass copy=False. 
Need to fix astype_nansafe\n left = left.astype(ltype)\n right = right.astype(rtype)\n dtype = ltype.subtype\n else:\n dtype = ltype\n\n # dtype the result must have\n result_dtype = None\n\n if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:\n with np.errstate(all='ignore'):\n result = op(left.get_values(), right.get_values())\n fill = op(_get_fill(left), _get_fill(right))\n\n if left.sp_index.ngaps == 0:\n index = left.sp_index\n else:\n index = right.sp_index\n elif left.sp_index.equals(right.sp_index):\n with np.errstate(all='ignore'):\n result = op(left.sp_values, right.sp_values)\n fill = op(_get_fill(left), _get_fill(right))\n index = left.sp_index\n else:\n if name[0] == 'r':\n left, right = right, left\n name = name[1:]\n\n if name in ('and', 'or') and dtype == 'bool':\n opname = 'sparse_{name}_uint8'.format(name=name)\n # to make template simple, cast here\n left_sp_values = left.sp_values.view(np.uint8)\n right_sp_values = right.sp_values.view(np.uint8)\n result_dtype = np.bool\n else:\n opname = 'sparse_{name}_{dtype}'.format(name=name, dtype=dtype)\n left_sp_values = left.sp_values\n right_sp_values = right.sp_values\n\n sparse_op = getattr(splib, opname)\n\n with np.errstate(all='ignore'):\n result, index, fill = sparse_op(\n left_sp_values, left.sp_index, left.fill_value,\n right_sp_values, right.sp_index, right.fill_value)\n\n if result_dtype is None:\n result_dtype = result.dtype\n\n return _wrap_result(name, result, index, fill, dtype=result_dtype)"} {"instruction": "Make a summary of the following Python 3 code\ndef _wrap_result(name, data, sparse_index, fill_value, dtype=None):\n if name.startswith('__'):\n # e.g. 
__eq__ --> eq\n name = name[2:-2]\n\n if name in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'):\n dtype = np.bool\n\n fill_value = lib.item_from_zerodim(fill_value)\n\n if is_bool_dtype(dtype):\n # fill_value may be np.bool_\n fill_value = bool(fill_value)\n return SparseArray(data,\n sparse_index=sparse_index,\n fill_value=fill_value,\n dtype=dtype)", "response": "Wrap op result to have correct dtype"} {"instruction": "Can you create a Python 3 function that\nconverts array to SparseSeries if it is not.", "response": "def _maybe_to_sparse(array):\n \"\"\"\n array must be SparseSeries or SparseArray\n \"\"\"\n if isinstance(array, ABCSparseSeries):\n array = array.values.copy()\n return array"} {"instruction": "Create a Python 3 function for\nsanitizing the values of the n - term array", "response": "def _sanitize_values(arr):\n \"\"\"\n return an ndarray for our input,\n in a platform independent manner\n \"\"\"\n\n if hasattr(arr, 'values'):\n arr = arr.values\n else:\n\n # scalar\n if is_scalar(arr):\n arr = [arr]\n\n # ndarray\n if isinstance(arr, np.ndarray):\n pass\n\n elif is_list_like(arr) and len(arr) > 0:\n arr = maybe_convert_platform(arr)\n\n else:\n arr = np.asarray(arr)\n\n return arr"} {"instruction": "Can you write a function in Python 3 where it\nconverts an array to sparse format.", "response": "def make_sparse(arr, kind='block', fill_value=None, dtype=None, copy=False):\n \"\"\"\n Convert ndarray to sparse format\n\n Parameters\n ----------\n arr : ndarray\n kind : {'block', 'integer'}\n fill_value : NaN or another value\n dtype : np.dtype, optional\n copy : bool, default False\n\n Returns\n -------\n (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar)\n \"\"\"\n\n arr = _sanitize_values(arr)\n\n if arr.ndim > 1:\n raise TypeError(\"expected dimension <= 1 data\")\n\n if fill_value is None:\n fill_value = na_value_for_dtype(arr.dtype)\n\n if isna(fill_value):\n mask = notna(arr)\n else:\n # cast to object comparison to be safe\n if 
is_string_dtype(arr):\n arr = arr.astype(object)\n\n if is_object_dtype(arr.dtype):\n # element-wise equality check method in numpy doesn't treat\n # each element type, eg. 0, 0.0, and False are treated as\n # same. So we have to check the both of its type and value.\n mask = splib.make_mask_object_ndarray(arr, fill_value)\n else:\n mask = arr != fill_value\n\n length = len(arr)\n if length != len(mask):\n # the arr is a SparseArray\n indices = mask.sp_index.indices\n else:\n indices = mask.nonzero()[0].astype(np.int32)\n\n index = _make_index(length, indices, kind)\n sparsified_values = arr[mask]\n if dtype is not None:\n sparsified_values = astype_nansafe(sparsified_values, dtype=dtype)\n # TODO: copy\n return sparsified_values, index, fill_value"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef density(self):\n r = float(self.sp_index.npoints) / float(self.sp_index.length)\n return r", "response": "Returns the density of the non - fill_value points in the current index as decimal."} {"instruction": "Can you write a function in Python 3 where it\nfills missing values with value.", "response": "def fillna(self, value=None, method=None, limit=None):\n \"\"\"\n Fill missing values with `value`.\n\n Parameters\n ----------\n value : scalar, optional\n method : str, optional\n\n .. warning::\n\n Using 'method' will result in high memory use,\n as all `fill_value` methods will be converted to\n an in-memory ndarray\n\n limit : int, optional\n\n Returns\n -------\n SparseArray\n\n Notes\n -----\n When `value` is specified, the result's ``fill_value`` depends on\n ``self.fill_value``. The goal is to maintain low-memory use.\n\n If ``self.fill_value`` is NA, the result dtype will be\n ``SparseDtype(self.dtype, fill_value=value)``. This will preserve\n amount of memory used before and after filling.\n\n When ``self.fill_value`` is not NA, the result dtype will be\n ``self.dtype``. 
Again, this preserves the amount of memory used.\n \"\"\"\n if ((method is None and value is None) or\n (method is not None and value is not None)):\n raise ValueError(\"Must specify one of 'method' or 'value'.\")\n\n elif method is not None:\n msg = \"fillna with 'method' requires high memory usage.\"\n warnings.warn(msg, PerformanceWarning)\n filled = interpolate_2d(np.asarray(self), method=method,\n limit=limit)\n return type(self)(filled, fill_value=self.fill_value)\n\n else:\n new_values = np.where(isna(self.sp_values), value, self.sp_values)\n\n if self._null_fill_value:\n # This is essentially just updating the dtype.\n new_dtype = SparseDtype(self.dtype.subtype, fill_value=value)\n else:\n new_dtype = self.dtype\n\n return self._simple_new(new_values, self._sparse_index, new_dtype)"} {"instruction": "Write a Python 3 function that can\nget the location of the first missing value.", "response": "def _first_fill_value_loc(self):\n \"\"\"\n Get the location of the first missing value.\n\n Returns\n -------\n int\n \"\"\"\n if len(self) == 0 or self.sp_index.npoints == len(self):\n return -1\n\n indices = self.sp_index.to_int_index().indices\n if not len(indices) or indices[0] > 0:\n return 0\n\n diff = indices[1:] - indices[:-1]\n return np.searchsorted(diff, 2) + 1"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef value_counts(self, dropna=True):\n from pandas import Index, Series\n\n keys, counts = algos._value_counts_arraylike(self.sp_values,\n dropna=dropna)\n fcounts = self.sp_index.ngaps\n if fcounts > 0:\n if self._null_fill_value and dropna:\n pass\n else:\n if self._null_fill_value:\n mask = isna(keys)\n else:\n mask = keys == self.fill_value\n\n if mask.any():\n counts[mask] += fcounts\n else:\n keys = np.insert(keys, 0, self.fill_value)\n counts = np.insert(counts, 0, fcounts)\n\n if not isinstance(keys, ABCIndexClass):\n keys = Index(keys)\n result = Series(counts, index=keys)\n return result", 
"response": "Returns a Series containing counts of unique values in the current object."} {"instruction": "Write a Python 3 function for\nconverting to a SparseArray with a certain dtype.", "response": "def astype(self, dtype=None, copy=True):\n \"\"\"\n Change the dtype of a SparseArray.\n\n The output will always be a SparseArray. To convert to a dense\n ndarray with a certain dtype, use :meth:`numpy.asarray`.\n\n Parameters\n ----------\n dtype : np.dtype or ExtensionDtype\n For SparseDtype, this changes the dtype of\n ``self.sp_values`` and the ``self.fill_value``.\n\n For other dtypes, this only changes the dtype of\n ``self.sp_values``.\n\n copy : bool, default True\n Whether to ensure a copy is made, even if not necessary.\n\n Returns\n -------\n SparseArray\n\n Examples\n --------\n >>> arr = SparseArray([0, 0, 1, 2])\n >>> arr\n [0, 0, 1, 2]\n Fill: 0\n IntIndex\n Indices: array([2, 3], dtype=int32)\n\n >>> arr.astype(np.dtype('int32'))\n [0, 0, 1, 2]\n Fill: 0\n IntIndex\n Indices: array([2, 3], dtype=int32)\n\n Using a NumPy dtype with a different kind (e.g. float) will coerce\n just ``self.sp_values``.\n\n >>> arr.astype(np.dtype('float64'))\n ... # doctest: +NORMALIZE_WHITESPACE\n [0, 0, 1.0, 2.0]\n Fill: 0\n IntIndex\n Indices: array([2, 3], dtype=int32)\n\n Use a SparseDtype if you wish to be change the fill value as well.\n\n >>> arr.astype(SparseDtype(\"float64\", fill_value=np.nan))\n ... 
# doctest: +NORMALIZE_WHITESPACE\n [nan, nan, 1.0, 2.0]\n Fill: nan\n IntIndex\n Indices: array([2, 3], dtype=int32)\n \"\"\"\n dtype = self.dtype.update_dtype(dtype)\n subtype = dtype._subtype_with_str\n sp_values = astype_nansafe(self.sp_values,\n subtype,\n copy=copy)\n if sp_values is self.sp_values and copy:\n sp_values = sp_values.copy()\n\n return self._simple_new(sp_values,\n self.sp_index,\n dtype)"} {"instruction": "How would you implement a function in Python 3 that\nmaps the categories using a mapper function.", "response": "def map(self, mapper):\n \"\"\"\n Map categories using input correspondence (dict, Series, or function).\n\n Parameters\n ----------\n mapper : dict, Series, callable\n The correspondence from old values to new.\n\n Returns\n -------\n SparseArray\n The output array will have the same density as the input.\n The output fill value will be the result of applying the\n mapping to ``self.fill_value``\n\n Examples\n --------\n >>> arr = pd.SparseArray([0, 1, 2])\n >>> arr.apply(lambda x: x + 10)\n [10, 11, 12]\n Fill: 10\n IntIndex\n Indices: array([1, 2], dtype=int32)\n\n >>> arr.apply({0: 10, 1: 11, 2: 12})\n [10, 11, 12]\n Fill: 10\n IntIndex\n Indices: array([1, 2], dtype=int32)\n\n >>> arr.apply(pd.Series([10, 11, 12], index=[0, 1, 2]))\n [10, 11, 12]\n Fill: 10\n IntIndex\n Indices: array([1, 2], dtype=int32)\n \"\"\"\n # this is used in apply.\n # We get hit since we're an \"is_extension_type\" but regular extension\n # types are not hit. 
This may be worth adding to the interface.\n if isinstance(mapper, ABCSeries):\n mapper = mapper.to_dict()\n\n if isinstance(mapper, abc.Mapping):\n fill_value = mapper.get(self.fill_value, self.fill_value)\n sp_values = [mapper.get(x, None) for x in self.sp_values]\n else:\n fill_value = mapper(self.fill_value)\n sp_values = [mapper(x) for x in self.sp_values]\n\n return type(self)(sp_values, sparse_index=self.sp_index,\n fill_value=fill_value)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef all(self, axis=None, *args, **kwargs):\n nv.validate_all(args, kwargs)\n\n values = self.sp_values\n\n if len(values) != len(self) and not np.all(self.fill_value):\n return False\n\n return values.all()", "response": "Tests whether all elements evaluate True\n "} {"instruction": "Can you create a Python 3 function that\ntests whether at least one of elements evaluate True", "response": "def any(self, axis=0, *args, **kwargs):\n \"\"\"\n Tests whether at least one of elements evaluate True\n\n Returns\n -------\n any : bool\n\n See Also\n --------\n numpy.any\n \"\"\"\n nv.validate_any(args, kwargs)\n\n values = self.sp_values\n\n if len(values) != len(self) and np.any(self.fill_value):\n return True\n\n return values.any().item()"} {"instruction": "How would you implement a function in Python 3 that\nreturns the sum of the non - NA values in the logarithmic order.", "response": "def sum(self, axis=0, *args, **kwargs):\n \"\"\"\n Sum of non-NA/null values\n\n Returns\n -------\n sum : float\n \"\"\"\n nv.validate_sum(args, kwargs)\n valid_vals = self._valid_sp_values\n sp_sum = valid_vals.sum()\n if self._null_fill_value:\n return sp_sum\n else:\n nsparse = self.sp_index.ngaps\n return sp_sum + self.fill_value * nsparse"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef mean(self, axis=0, *args, **kwargs):\n nv.validate_mean(args, kwargs)\n valid_vals = self._valid_sp_values\n sp_sum = 
valid_vals.sum()\n ct = len(valid_vals)\n\n if self._null_fill_value:\n return sp_sum / ct\n else:\n nsparse = self.sp_index.ngaps\n return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)", "response": "Returns the mean of non - NA values in the logarithmic order."} {"instruction": "Can you write a function in Python 3 where it\nreplaces & with and or with or.", "response": "def _replace_booleans(tok):\n \"\"\"Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise\n precedence is changed to boolean precedence.\n\n Parameters\n ----------\n tok : tuple of int, str\n ints correspond to the all caps constants in the tokenize module\n\n Returns\n -------\n t : tuple of int, str\n Either the input or token or the replacement values\n \"\"\"\n toknum, tokval = tok\n if toknum == tokenize.OP:\n if tokval == '&':\n return tokenize.NAME, 'and'\n elif tokval == '|':\n return tokenize.NAME, 'or'\n return toknum, tokval\n return toknum, tokval"} {"instruction": "Implement a Python 3 function for\nreplacing local variables with a syntactically valid name.", "response": "def _replace_locals(tok):\n \"\"\"Replace local variables with a syntactically valid name.\n\n Parameters\n ----------\n tok : tuple of int, str\n ints correspond to the all caps constants in the tokenize module\n\n Returns\n -------\n t : tuple of int, str\n Either the input or token or the replacement values\n\n Notes\n -----\n This is somewhat of a hack in that we rewrite a string such as ``'@a'`` as\n ``'__pd_eval_local_a'`` by telling the tokenizer that ``__pd_eval_local_``\n is a ``tokenize.OP`` and to replace the ``'@'`` symbol with it.\n \"\"\"\n toknum, tokval = tok\n if toknum == tokenize.OP and tokval == '@':\n return tokenize.OP, _LOCAL_TAG\n return toknum, tokval"} {"instruction": "Can you create a Python 3 function that\ncleans up a column name if surrounded by backticks.", "response": "def _clean_spaces_backtick_quoted_names(tok):\n \"\"\"Clean up a column name if surrounded by 
backticks.\n\n Backtick quoted string are indicated by a certain tokval value. If a string\n is a backtick quoted token it will processed by\n :func:`_remove_spaces_column_name` so that the parser can find this\n string when the query is executed.\n See also :meth:`NDFrame._get_space_character_free_column_resolver`.\n\n Parameters\n ----------\n tok : tuple of int, str\n ints correspond to the all caps constants in the tokenize module\n\n Returns\n -------\n t : tuple of int, str\n Either the input or token or the replacement values\n \"\"\"\n toknum, tokval = tok\n if toknum == _BACKTICK_QUOTED_STRING:\n return tokenize.NAME, _remove_spaces_column_name(tokval)\n return toknum, tokval"} {"instruction": "How would you explain what the following Python 3 function does\ndef _preparse(source, f=_compose(_replace_locals, _replace_booleans,\n _rewrite_assign,\n _clean_spaces_backtick_quoted_names)):\n \"\"\"Compose a collection of tokenization functions\n\n Parameters\n ----------\n source : str\n A Python source code string\n f : callable\n This takes a tuple of (toknum, tokval) as its argument and returns a\n tuple with the same structure but possibly different elements. 
Defaults\n to the composition of ``_rewrite_assign``, ``_replace_booleans``, and\n ``_replace_locals``.\n\n Returns\n -------\n s : str\n Valid Python source code\n\n Notes\n -----\n The `f` parameter can be any callable that takes *and* returns input of the\n form ``(toknum, tokval)``, where ``toknum`` is one of the constants from\n the ``tokenize`` module and ``tokval`` is a string.\n \"\"\"\n assert callable(f), 'f must be callable'\n return tokenize.untokenize(lmap(f, tokenize_string(source)))", "response": "Compose a collection of tokenization functions from a Python source code string."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _filter_nodes(superclass, all_nodes=_all_nodes):\n node_names = (node.__name__ for node in all_nodes\n if issubclass(node, superclass))\n return frozenset(node_names)", "response": "Filter out AST nodes that are subclasses of superclass."} {"instruction": "Given the following Python 3 function, write the documentation\ndef _node_not_implemented(node_name, cls):\n\n def f(self, *args, **kwargs):\n raise NotImplementedError(\"{name!r} nodes are not \"\n \"implemented\".format(name=node_name))\n return f", "response": "Return a function that raises a NotImplementedError with a passed node_name."} {"instruction": "Make a summary of the following Python 3 code\ndef disallow(nodes):\n def disallowed(cls):\n cls.unsupported_nodes = ()\n for node in nodes:\n new_method = _node_not_implemented(node, cls)\n name = 'visit_{node}'.format(node=node)\n cls.unsupported_nodes += (name,)\n setattr(cls, name, new_method)\n return cls\n return disallowed", "response": "Decorator to disallow certain nodes from parsing. 
Raises a\n NotImplementedError instead."} {"instruction": "Create a Python 3 function to\ncreate a function to create an Op class with its symbol already passed.", "response": "def _op_maker(op_class, op_symbol):\n \"\"\"Return a function to create an op class with its symbol already passed.\n\n Returns\n -------\n f : callable\n \"\"\"\n\n def f(self, node, *args, **kwargs):\n \"\"\"Return a partial function with an Op subclass with an operator\n already passed.\n\n Returns\n -------\n f : callable\n \"\"\"\n return partial(op_class, op_symbol, *args, **kwargs)\n return f"} {"instruction": "How would you explain what the following Python 3 function does\ndef add_ops(op_classes):\n def f(cls):\n for op_attr_name, op_class in op_classes.items():\n ops = getattr(cls, '{name}_ops'.format(name=op_attr_name))\n ops_map = getattr(cls, '{name}_op_nodes_map'.format(\n name=op_attr_name))\n for op in ops:\n op_node = ops_map[op]\n if op_node is not None:\n made_op = _op_maker(op_class, op)\n setattr(cls, 'visit_{node}'.format(node=op_node), made_op)\n return cls\n return f", "response": "Decorator to add default implementation of ops."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef names(self):\n if is_term(self.terms):\n return frozenset([self.terms.name])\n return frozenset(term.name for term in com.flatten(self.terms))", "response": "Get the names in an expression"} {"instruction": "Write a Python 3 script for\nreturning a boolean whether I can attempt conversion to a TimedeltaIndex", "response": "def _is_convertible_to_index(other):\n \"\"\"\n return a boolean whether I can attempt conversion to a TimedeltaIndex\n \"\"\"\n if isinstance(other, TimedeltaIndex):\n return True\n elif (len(other) > 0 and\n other.inferred_type not in ('floating', 'mixed-integer', 'integer',\n 'mixed-integer-float', 'mixed')):\n return True\n return False"} {"instruction": "Can you generate the documentation for the following Python 3 
function\ndef timedelta_range(start=None, end=None, periods=None, freq=None,\n name=None, closed=None):\n \"\"\"\n Return a fixed frequency TimedeltaIndex, with day as the default\n frequency\n\n Parameters\n ----------\n start : string or timedelta-like, default None\n Left bound for generating timedeltas\n end : string or timedelta-like, default None\n Right bound for generating timedeltas\n periods : integer, default None\n Number of periods to generate\n freq : string or DateOffset, default 'D'\n Frequency strings can have multiples, e.g. '5H'\n name : string, default None\n Name of the resulting TimedeltaIndex\n closed : string, default None\n Make the interval closed with respect to the given frequency to\n the 'left', 'right', or both sides (None)\n\n Returns\n -------\n rng : TimedeltaIndex\n\n Notes\n -----\n Of the four parameters ``start``, ``end``, ``periods``, and ``freq``,\n exactly three must be specified. If ``freq`` is omitted, the resulting\n ``TimedeltaIndex`` will have ``periods`` linearly spaced elements between\n ``start`` and ``end`` (closed on both sides).\n\n To learn more about the frequency strings, please see `this link\n `__.\n\n Examples\n --------\n\n >>> pd.timedelta_range(start='1 day', periods=4)\n TimedeltaIndex(['1 days', '2 days', '3 days', '4 days'],\n dtype='timedelta64[ns]', freq='D')\n\n The ``closed`` parameter specifies which endpoint is included. 
The default\n behavior is to include both endpoints.\n\n >>> pd.timedelta_range(start='1 day', periods=4, closed='right')\n TimedeltaIndex(['2 days', '3 days', '4 days'],\n dtype='timedelta64[ns]', freq='D')\n\n The ``freq`` parameter specifies the frequency of the TimedeltaIndex.\n Only fixed frequencies can be passed, non-fixed frequencies such as\n 'M' (month end) will raise.\n\n >>> pd.timedelta_range(start='1 day', end='2 days', freq='6H')\n TimedeltaIndex(['1 days 00:00:00', '1 days 06:00:00', '1 days 12:00:00',\n '1 days 18:00:00', '2 days 00:00:00'],\n dtype='timedelta64[ns]', freq='6H')\n\n Specify ``start``, ``end``, and ``periods``; the frequency is generated\n automatically (linearly spaced).\n\n >>> pd.timedelta_range(start='1 day', end='5 days', periods=4)\n TimedeltaIndex(['1 days 00:00:00', '2 days 08:00:00', '3 days 16:00:00',\n '5 days 00:00:00'],\n dtype='timedelta64[ns]', freq=None)\n \"\"\"\n if freq is None and com._any_none(periods, start, end):\n freq = 'D'\n\n freq, freq_infer = dtl.maybe_infer_freq(freq)\n tdarr = TimedeltaArray._generate_range(start, end, periods, freq,\n closed=closed)\n return TimedeltaIndex._simple_new(tdarr._data, freq=tdarr.freq, name=name)", "response": "Returns a fixed frequency TimedeltaIndex with day as the default frequency."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef union(self, other):\n if isinstance(other, tuple):\n other = list(other)\n return type(self)(super().__add__(other))", "response": "Returns a FrozenList with other concatenated to the end of self."} {"instruction": "How would you implement a function in Python 3 that\nreturns a FrozenList with elements from other removed from self.", "response": "def difference(self, other):\n \"\"\"\n Returns a FrozenList with elements from other removed from self.\n\n Parameters\n ----------\n other : array-like\n The array-like whose elements we are removing self.\n\n Returns\n -------\n diff : FrozenList\n The collection 
difference between self and other.\n \"\"\"\n other = set(other)\n temp = [x for x in self if x not in other]\n return type(self)(temp)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef searchsorted(self, value, side=\"left\", sorter=None):\n\n # We are much more performant if the searched\n # indexer is the same type as the array.\n #\n # This doesn't matter for int64, but DOES\n # matter for smaller int dtypes.\n #\n # xref: https://github.com/numpy/numpy/issues/5370\n try:\n value = self.dtype.type(value)\n except ValueError:\n pass\n\n return super().searchsorted(value, side=side, sorter=sorter)", "response": "Search for the entries in the array sorted by value."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef arrays_to_mgr(arrays, arr_names, index, columns, dtype=None):\n # figure out the index, if necessary\n if index is None:\n index = extract_index(arrays)\n else:\n index = ensure_index(index)\n\n # don't force copy because getting jammed in an ndarray anyway\n arrays = _homogenize(arrays, index, dtype)\n\n # from BlockManager perspective\n axes = [ensure_index(columns), index]\n\n return create_block_manager_from_arrays(arrays, arr_names, axes)", "response": "Convert a list of arrays into a BlockManager."} {"instruction": "Write a Python 3 function for\nextracting from a masked rec array and create a manager.", "response": "def masked_rec_array_to_mgr(data, index, columns, dtype, copy):\n \"\"\"\n Extract from a masked rec array and create the manager.\n \"\"\"\n\n # essentially process a record array then fill it\n fill_value = data.fill_value\n fdata = ma.getdata(data)\n if index is None:\n index = get_names_from_index(fdata)\n if index is None:\n index = ibase.default_index(len(data))\n index = ensure_index(index)\n\n if columns is not None:\n columns = ensure_index(columns)\n arrays, arr_columns = to_arrays(fdata, columns)\n\n # fill if needed\n new_arrays = []\n for fv, 
arr, col in zip(fill_value, arrays, arr_columns):\n mask = ma.getmaskarray(data[col])\n if mask.any():\n arr, fv = maybe_upcast(arr, fill_value=fv, copy=True)\n arr[mask] = fv\n new_arrays.append(arr)\n\n # create the manager\n arrays, arr_columns = reorder_arrays(new_arrays, arr_columns, columns)\n if columns is None:\n columns = arr_columns\n\n mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype)\n\n if copy:\n mgr = mgr.copy()\n return mgr"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef init_dict(data, index, columns, dtype=None):\n if columns is not None:\n from pandas.core.series import Series\n arrays = Series(data, index=columns, dtype=object)\n data_names = arrays.index\n\n missing = arrays.isnull()\n if index is None:\n # GH10856\n # raise ValueError if only scalars in dict\n index = extract_index(arrays[~missing])\n else:\n index = ensure_index(index)\n\n # no obvious \"empty\" int column\n if missing.any() and not is_integer_dtype(dtype):\n if dtype is None or np.issubdtype(dtype, np.flexible):\n # GH#1783\n nan_dtype = object\n else:\n nan_dtype = dtype\n val = construct_1d_arraylike_from_scalar(np.nan, len(index),\n nan_dtype)\n arrays.loc[missing] = [val] * missing.sum()\n\n else:\n keys = com.dict_keys_to_ordered_list(data)\n columns = data_names = Index(keys)\n # GH#24096 need copy to be deep for datetime64tz case\n # TODO: See if we can avoid these copies\n arrays = [data[k] if not is_datetime64tz_dtype(data[k]) else\n data[k].copy(deep=True) for k in keys]\n return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)", "response": "Initialize the object with the given data."} {"instruction": "Can you write a function in Python 3 where it\nconverts a list of data into a list of arrays columns.", "response": "def to_arrays(data, columns, coerce_float=False, dtype=None):\n \"\"\"\n Return list of arrays, columns.\n \"\"\"\n if isinstance(data, ABCDataFrame):\n if columns is not None:\n arrays = 
[data._ixs(i, axis=1).values\n for i, col in enumerate(data.columns) if col in columns]\n else:\n columns = data.columns\n arrays = [data._ixs(i, axis=1).values for i in range(len(columns))]\n\n return arrays, columns\n\n if not len(data):\n if isinstance(data, np.ndarray):\n columns = data.dtype.names\n if columns is not None:\n return [[]] * len(columns), columns\n return [], [] # columns if columns is not None else []\n if isinstance(data[0], (list, tuple)):\n return _list_to_arrays(data, columns, coerce_float=coerce_float,\n dtype=dtype)\n elif isinstance(data[0], abc.Mapping):\n return _list_of_dict_to_arrays(data, columns,\n coerce_float=coerce_float, dtype=dtype)\n elif isinstance(data[0], ABCSeries):\n return _list_of_series_to_arrays(data, columns,\n coerce_float=coerce_float,\n dtype=dtype)\n elif isinstance(data[0], Categorical):\n if columns is None:\n columns = ibase.default_index(len(data))\n return data, columns\n elif (isinstance(data, (np.ndarray, ABCSeries, Index)) and\n data.dtype.names is not None):\n\n columns = list(data.dtype.names)\n arrays = [data[k] for k in columns]\n return arrays, columns\n else:\n # last ditch effort\n data = lmap(tuple, data)\n return _list_to_arrays(data, columns, coerce_float=coerce_float,\n dtype=dtype)"} {"instruction": "Can you implement a function in Python 3 that\nsanitizes an index type to return an ndarray of the underlying pass through a non - Index.", "response": "def sanitize_index(data, index, copy=False):\n \"\"\"\n Sanitize an index type to return an ndarray of the underlying, pass\n through a non-Index.\n \"\"\"\n\n if index is None:\n return data\n\n if len(data) != len(index):\n raise ValueError('Length of values does not match length of index')\n\n if isinstance(data, ABCIndexClass) and not copy:\n pass\n elif isinstance(data, (ABCPeriodIndex, ABCDatetimeIndex)):\n data = data._values\n if copy:\n data = data.copy()\n\n elif isinstance(data, np.ndarray):\n\n # coerce datetimelike types\n if 
data.dtype.kind in ['M', 'm']:\n data = sanitize_array(data, index, copy=copy)\n\n return data"} {"instruction": "Implement a Python 3 function for\nsanitizing input data to an ndarray copy if specified coerce to the Taxonomy dtype if specified.", "response": "def sanitize_array(data, index, dtype=None, copy=False,\n raise_cast_failure=False):\n \"\"\"\n Sanitize input data to an ndarray, copy if specified, coerce to the\n dtype if specified.\n \"\"\"\n if dtype is not None:\n dtype = pandas_dtype(dtype)\n\n if isinstance(data, ma.MaskedArray):\n mask = ma.getmaskarray(data)\n if mask.any():\n data, fill_value = maybe_upcast(data, copy=True)\n data.soften_mask() # set hardmask False if it was True\n data[mask] = fill_value\n else:\n data = data.copy()\n\n data = extract_array(data, extract_numpy=True)\n\n # GH#846\n if isinstance(data, np.ndarray):\n\n if dtype is not None:\n subarr = np.array(data, copy=False)\n\n # possibility of nan -> garbage\n if is_float_dtype(data.dtype) and is_integer_dtype(dtype):\n try:\n subarr = _try_cast(data, True, dtype, copy,\n True)\n except ValueError:\n if copy:\n subarr = data.copy()\n else:\n subarr = _try_cast(data, True, dtype, copy, raise_cast_failure)\n elif isinstance(data, Index):\n # don't coerce Index types\n # e.g. indexes can have different conversions (so don't fast path\n # them)\n # GH#6140\n subarr = sanitize_index(data, index, copy=copy)\n else:\n\n # we will try to copy be-definition here\n subarr = _try_cast(data, True, dtype, copy, raise_cast_failure)\n\n elif isinstance(data, ExtensionArray):\n if isinstance(data, ABCPandasArray):\n # We don't want to let people put our PandasArray wrapper\n # (the output of Series/Index.array), into a Series. 
So\n # we explicitly unwrap it here.\n subarr = data.to_numpy()\n else:\n subarr = data\n\n # everything else in this block must also handle ndarray's,\n # becuase we've unwrapped PandasArray into an ndarray.\n\n if dtype is not None:\n subarr = data.astype(dtype)\n\n if copy:\n subarr = data.copy()\n return subarr\n\n elif isinstance(data, (list, tuple)) and len(data) > 0:\n if dtype is not None:\n try:\n subarr = _try_cast(data, False, dtype, copy,\n raise_cast_failure)\n except Exception:\n if raise_cast_failure: # pragma: no cover\n raise\n subarr = np.array(data, dtype=object, copy=copy)\n subarr = lib.maybe_convert_objects(subarr)\n\n else:\n subarr = maybe_convert_platform(data)\n\n subarr = maybe_cast_to_datetime(subarr, dtype)\n\n elif isinstance(data, range):\n # GH#16804\n arr = np.arange(data.start, data.stop, data.step, dtype='int64')\n subarr = _try_cast(arr, False, dtype, copy, raise_cast_failure)\n else:\n subarr = _try_cast(data, False, dtype, copy, raise_cast_failure)\n\n # scalar like, GH\n if getattr(subarr, 'ndim', 0) == 0:\n if isinstance(data, list): # pragma: no cover\n subarr = np.array(data, dtype=object)\n elif index is not None:\n value = data\n\n # figure out the dtype from the value (upcast if necessary)\n if dtype is None:\n dtype, value = infer_dtype_from_scalar(value)\n else:\n # need to possibly convert the value here\n value = maybe_cast_to_datetime(value, dtype)\n\n subarr = construct_1d_arraylike_from_scalar(\n value, len(index), dtype)\n\n else:\n return subarr.item()\n\n # the result that we want\n elif subarr.ndim == 1:\n if index is not None:\n\n # a 1-element ndarray\n if len(subarr) != len(index) and len(subarr) == 1:\n subarr = construct_1d_arraylike_from_scalar(\n subarr[0], len(index), subarr.dtype)\n\n elif subarr.ndim > 1:\n if isinstance(data, np.ndarray):\n raise Exception('Data must be 1-dimensional')\n else:\n subarr = com.asarray_tuplesafe(data, dtype=dtype)\n\n # This is to prevent mixed-type Series getting all 
casted to\n # NumPy string type, e.g. NaN --> '-1#IND'.\n if issubclass(subarr.dtype.type, str):\n # GH#16605\n # If not empty convert the data to dtype\n # GH#19853: If data is a scalar, subarr has already the result\n if not lib.is_scalar(data):\n if not np.all(isna(data)):\n data = np.array(data, dtype=dtype, copy=False)\n subarr = np.array(data, dtype=object, copy=copy)\n\n if is_object_dtype(subarr.dtype) and dtype != 'object':\n inferred = lib.infer_dtype(subarr, skipna=False)\n if inferred == 'period':\n try:\n subarr = period_array(subarr)\n except IncompatibleFrequency:\n pass\n\n return subarr"} {"instruction": "Implement a function in Python 3 to\ncheck that a valid engine is passed.", "response": "def _check_engine(engine):\n \"\"\"Make sure a valid engine is passed.\n\n Parameters\n ----------\n engine : str\n\n Raises\n ------\n KeyError\n * If an invalid engine is passed\n ImportError\n * If numexpr was requested but doesn't exist\n\n Returns\n -------\n string engine\n\n \"\"\"\n from pandas.core.computation.check import _NUMEXPR_INSTALLED\n\n if engine is None:\n if _NUMEXPR_INSTALLED:\n engine = 'numexpr'\n else:\n engine = 'python'\n\n if engine not in _engines:\n valid = list(_engines.keys())\n raise KeyError('Invalid engine {engine!r} passed, valid engines are'\n ' {valid}'.format(engine=engine, valid=valid))\n\n # TODO: validate this in a more general way (thinking of future engines\n # that won't necessarily be import-able)\n # Could potentially be done on engine instantiation\n if engine == 'numexpr':\n if not _NUMEXPR_INSTALLED:\n raise ImportError(\"'numexpr' is not installed or an \"\n \"unsupported version. 
Cannot use \"\n \"engine='numexpr' for query/eval \"\n \"if 'numexpr' is not installed\")\n\n return engine"} {"instruction": "How would you explain what the following Python 3 function does\ndef _check_parser(parser):\n from pandas.core.computation.expr import _parsers\n\n if parser not in _parsers:\n raise KeyError('Invalid parser {parser!r} passed, valid parsers are'\n ' {valid}'.format(parser=parser, valid=_parsers.keys()))", "response": "Check that a valid parser is passed."} {"instruction": "Create a Python 3 function for\nevaluating a Python expression using various backends.", "response": "def eval(expr, parser='pandas', engine=None, truediv=True,\n local_dict=None, global_dict=None, resolvers=(), level=0,\n target=None, inplace=False):\n \"\"\"Evaluate a Python expression as a string using various backends.\n\n The following arithmetic operations are supported: ``+``, ``-``, ``*``,\n ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following\n boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not).\n Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`,\n :keyword:`or`, and :keyword:`not` with the same semantics as the\n corresponding bitwise operators. :class:`~pandas.Series` and\n :class:`~pandas.DataFrame` objects are supported and behave as they would\n with plain ol' Python evaluation.\n\n Parameters\n ----------\n expr : str or unicode\n The expression to evaluate. This string cannot contain any Python\n `statements\n `__,\n only Python `expressions\n `__.\n parser : string, default 'pandas', {'pandas', 'python'}\n The parser to use to construct the syntax tree from the expression. The\n default of ``'pandas'`` parses code slightly different than standard\n Python. Alternatively, you can parse an expression using the\n ``'python'`` parser to retain strict Python semantics. 
See the\n :ref:`enhancing performance ` documentation for\n more details.\n engine : string or None, default 'numexpr', {'python', 'numexpr'}\n\n The engine used to evaluate the expression. Supported engines are\n\n - None : tries to use ``numexpr``, falls back to ``python``\n - ``'numexpr'``: This default engine evaluates pandas objects using\n numexpr for large speed ups in complex expressions\n with large frames.\n - ``'python'``: Performs operations as if you had ``eval``'d in top\n level python. This engine is generally not that useful.\n\n More backends may be available in the future.\n\n truediv : bool, optional\n Whether to use true division, like in Python >= 3\n local_dict : dict or None, optional\n A dictionary of local variables, taken from locals() by default.\n global_dict : dict or None, optional\n A dictionary of global variables, taken from globals() by default.\n resolvers : list of dict-like or None, optional\n A list of objects implementing the ``__getitem__`` special method that\n you can use to inject an additional collection of namespaces to use for\n variable lookup. For example, this is used in the\n :meth:`~DataFrame.query` method to inject the\n ``DataFrame.index`` and ``DataFrame.columns``\n variables that refer to their respective :class:`~pandas.DataFrame`\n instance attributes.\n level : int, optional\n The number of prior stack frames to traverse and add to the current\n scope. Most users will **not** need to change this parameter.\n target : object, optional, default None\n This is the target object for assignment. It is used when there is\n variable assignment in the expression. If so, then `target` must\n support item assignment with string keys, and if a copy is being\n returned, it must also support `.copy()`.\n inplace : bool, default False\n If `target` is provided, and the expression mutates `target`, whether\n to modify `target` inplace. 
Otherwise, return a copy of `target` with\n the mutation.\n\n Returns\n -------\n ndarray, numeric scalar, DataFrame, Series\n\n Raises\n ------\n ValueError\n There are many instances where such an error can be raised:\n\n - `target=None`, but the expression is multiline.\n - The expression is multiline, but not all them have item assignment.\n An example of such an arrangement is this:\n\n a = b + 1\n a + 2\n\n Here, there are expressions on different lines, making it multiline,\n but the last line has no variable assigned to the output of `a + 2`.\n - `inplace=True`, but the expression is missing item assignment.\n - Item assignment is provided, but the `target` does not support\n string item assignment.\n - Item assignment is provided and `inplace=False`, but the `target`\n does not support the `.copy()` method\n\n See Also\n --------\n DataFrame.query\n DataFrame.eval\n\n Notes\n -----\n The ``dtype`` of any objects involved in an arithmetic ``%`` operation are\n recursively cast to ``float64``.\n\n See the :ref:`enhancing performance ` documentation for\n more details.\n \"\"\"\n from pandas.core.computation.expr import Expr\n\n inplace = validate_bool_kwarg(inplace, \"inplace\")\n\n if isinstance(expr, str):\n _check_expression(expr)\n exprs = [e.strip() for e in expr.splitlines() if e.strip() != '']\n else:\n exprs = [expr]\n multi_line = len(exprs) > 1\n\n if multi_line and target is None:\n raise ValueError(\"multi-line expressions are only valid in the \"\n \"context of data, use DataFrame.eval\")\n\n ret = None\n first_expr = True\n target_modified = False\n\n for expr in exprs:\n expr = _convert_expression(expr)\n engine = _check_engine(engine)\n _check_parser(parser)\n _check_resolvers(resolvers)\n _check_for_locals(expr, level, parser)\n\n # get our (possibly passed-in) scope\n env = _ensure_scope(level + 1, global_dict=global_dict,\n local_dict=local_dict, resolvers=resolvers,\n target=target)\n\n parsed_expr = Expr(expr, engine=engine, 
parser=parser, env=env,\n truediv=truediv)\n\n # construct the engine and evaluate the parsed expression\n eng = _engines[engine]\n eng_inst = eng(parsed_expr)\n ret = eng_inst.evaluate()\n\n if parsed_expr.assigner is None:\n if multi_line:\n raise ValueError(\"Multi-line expressions are only valid\"\n \" if all expressions contain an assignment\")\n elif inplace:\n raise ValueError(\"Cannot operate inplace \"\n \"if there is no assignment\")\n\n # assign if needed\n assigner = parsed_expr.assigner\n if env.target is not None and assigner is not None:\n target_modified = True\n\n # if returning a copy, copy only on the first assignment\n if not inplace and first_expr:\n try:\n target = env.target.copy()\n except AttributeError:\n raise ValueError(\"Cannot return a copy of the target\")\n else:\n target = env.target\n\n # TypeError is most commonly raised (e.g. int, list), but you\n # get IndexError if you try to do this assignment on np.ndarray.\n # we will ignore numpy warnings here; e.g. 
if trying\n # to use a non-numeric indexer\n try:\n with warnings.catch_warnings(record=True):\n # TODO: Filter the warnings we actually care about here.\n target[assigner] = ret\n except (TypeError, IndexError):\n raise ValueError(\"Cannot assign expression output to target\")\n\n if not resolvers:\n resolvers = ({assigner: ret},)\n else:\n # existing resolver needs updated to handle\n # case of mutating existing column in copy\n for resolver in resolvers:\n if assigner in resolver:\n resolver[assigner] = ret\n break\n else:\n resolvers += ({assigner: ret},)\n\n ret = None\n first_expr = False\n\n # We want to exclude `inplace=None` as being False.\n if inplace is False:\n return target if target_modified else ret"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _codes_to_ints(self, codes):\n # Shift the representation of each level by the pre-calculated number\n # of bits:\n codes <<= self.offsets\n\n # Now sum and OR are in fact interchangeable. This is a simple\n # composition of the (disjunct) significant bits of each level (i.e.\n # each column in \"codes\") in a single positive integer:\n if codes.ndim == 1:\n # Single key\n return np.bitwise_or.reduce(codes)\n\n # Multiple keys\n return np.bitwise_or.reduce(codes, axis=1)", "response": "Transform a combination of uint64 in one uint64 in one uint64 ( each ) in a strictly - index manner."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef from_arrays(cls, arrays, sortorder=None, names=None):\n error_msg = \"Input must be a list / sequence of array-likes.\"\n if not is_list_like(arrays):\n raise TypeError(error_msg)\n elif is_iterator(arrays):\n arrays = list(arrays)\n\n # Check if elements of array are list-like\n for array in arrays:\n if not is_list_like(array):\n raise TypeError(error_msg)\n\n # Check if lengths of all arrays are equal or not,\n # raise ValueError, if not\n for i in range(1, len(arrays)):\n if len(arrays[i]) != 
len(arrays[i - 1]):\n raise ValueError('all arrays must be same length')\n\n from pandas.core.arrays.categorical import _factorize_from_iterables\n\n codes, levels = _factorize_from_iterables(arrays)\n if names is None:\n names = [getattr(arr, \"name\", None) for arr in arrays]\n\n return MultiIndex(levels=levels, codes=codes, sortorder=sortorder,\n names=names, verify_integrity=False)", "response": "Convert a list of arrays to a MultiIndex."} {"instruction": "Given the following Python 3 function, write the documentation\ndef from_tuples(cls, tuples, sortorder=None, names=None):\n if not is_list_like(tuples):\n raise TypeError('Input must be a list / sequence of tuple-likes.')\n elif is_iterator(tuples):\n tuples = list(tuples)\n\n if len(tuples) == 0:\n if names is None:\n msg = 'Cannot infer number of levels from empty list'\n raise TypeError(msg)\n arrays = [[]] * len(names)\n elif isinstance(tuples, (np.ndarray, Index)):\n if isinstance(tuples, Index):\n tuples = tuples._values\n\n arrays = list(lib.tuples_to_object_array(tuples).T)\n elif isinstance(tuples, list):\n arrays = list(lib.to_object_array_tuples(tuples).T)\n else:\n arrays = lzip(*tuples)\n\n return MultiIndex.from_arrays(arrays, sortorder=sortorder, names=names)", "response": "Convert a list of tuples to MultiIndex."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef from_product(cls, iterables, sortorder=None, names=None):\n from pandas.core.arrays.categorical import _factorize_from_iterables\n from pandas.core.reshape.util import cartesian_product\n\n if not is_list_like(iterables):\n raise TypeError(\"Input must be a list / sequence of iterables.\")\n elif is_iterator(iterables):\n iterables = list(iterables)\n\n codes, levels = _factorize_from_iterables(iterables)\n codes = cartesian_product(codes)\n return MultiIndex(levels, codes, sortorder=sortorder, names=names)", "response": "Make a MultiIndex from the cartesian product of multiple iterables."} 
{"instruction": "Can you tell what is the following Python 3 function doing\ndef from_frame(cls, df, sortorder=None, names=None):\n if not isinstance(df, ABCDataFrame):\n raise TypeError(\"Input must be a DataFrame\")\n\n column_names, columns = lzip(*df.iteritems())\n names = column_names if names is None else names\n return cls.from_arrays(columns, sortorder=sortorder, names=names)", "response": "Make a MultiIndex from a DataFrame."} {"instruction": "Make a summary of the following Python 3 code\ndef set_levels(self, levels, level=None, inplace=False,\n verify_integrity=True):\n \"\"\"\n Set new levels on MultiIndex. Defaults to returning\n new index.\n\n Parameters\n ----------\n levels : sequence or list of sequence\n new level(s) to apply\n level : int, level name, or sequence of int/level names (default None)\n level(s) to set (None for all levels)\n inplace : bool\n if True, mutates in place\n verify_integrity : bool (default True)\n if True, checks that levels and codes are compatible\n\n Returns\n -------\n new index (of same type and class...etc)\n\n Examples\n --------\n >>> idx = pd.MultiIndex.from_tuples([(1, 'one'), (1, 'two'),\n (2, 'one'), (2, 'two')],\n names=['foo', 'bar'])\n >>> idx.set_levels([['a','b'], [1,2]])\n MultiIndex(levels=[['a', 'b'], [1, 2]],\n codes=[[0, 0, 1, 1], [0, 1, 0, 1]],\n names=['foo', 'bar'])\n >>> idx.set_levels(['a','b'], level=0)\n MultiIndex(levels=[['a', 'b'], ['one', 'two']],\n codes=[[0, 0, 1, 1], [0, 1, 0, 1]],\n names=['foo', 'bar'])\n >>> idx.set_levels(['a','b'], level='bar')\n MultiIndex(levels=[[1, 2], ['a', 'b']],\n codes=[[0, 0, 1, 1], [0, 1, 0, 1]],\n names=['foo', 'bar'])\n >>> idx.set_levels([['a','b'], [1,2]], level=[0,1])\n MultiIndex(levels=[['a', 'b'], [1, 2]],\n codes=[[0, 0, 1, 1], [0, 1, 0, 1]],\n names=['foo', 'bar'])\n \"\"\"\n if is_list_like(levels) and not isinstance(levels, Index):\n levels = list(levels)\n\n if level is not None and not is_list_like(level):\n if not is_list_like(levels):\n 
raise TypeError(\"Levels must be list-like\")\n if is_list_like(levels[0]):\n raise TypeError(\"Levels must be list-like\")\n level = [level]\n levels = [levels]\n elif level is None or is_list_like(level):\n if not is_list_like(levels) or not is_list_like(levels[0]):\n raise TypeError(\"Levels must be list of lists-like\")\n\n if inplace:\n idx = self\n else:\n idx = self._shallow_copy()\n idx._reset_identity()\n idx._set_levels(levels, level=level, validate=True,\n verify_integrity=verify_integrity)\n if not inplace:\n return idx", "response": "Set new levels on MultiIndex."} {"instruction": "Can you implement a function in Python 3 that\nsets new codes on MultiIndex.", "response": "def set_codes(self, codes, level=None, inplace=False,\n verify_integrity=True):\n \"\"\"\n Set new codes on MultiIndex. Defaults to returning\n new index.\n\n .. versionadded:: 0.24.0\n\n New name for deprecated method `set_labels`.\n\n Parameters\n ----------\n codes : sequence or list of sequence\n new codes to apply\n level : int, level name, or sequence of int/level names (default None)\n level(s) to set (None for all levels)\n inplace : bool\n if True, mutates in place\n verify_integrity : bool (default True)\n if True, checks that levels and codes are compatible\n\n Returns\n -------\n new index (of same type and class...etc)\n\n Examples\n --------\n >>> idx = pd.MultiIndex.from_tuples([(1, 'one'), (1, 'two'),\n (2, 'one'), (2, 'two')],\n names=['foo', 'bar'])\n >>> idx.set_codes([[1,0,1,0], [0,0,1,1]])\n MultiIndex(levels=[[1, 2], ['one', 'two']],\n codes=[[1, 0, 1, 0], [0, 0, 1, 1]],\n names=['foo', 'bar'])\n >>> idx.set_codes([1,0,1,0], level=0)\n MultiIndex(levels=[[1, 2], ['one', 'two']],\n codes=[[1, 0, 1, 0], [0, 1, 0, 1]],\n names=['foo', 'bar'])\n >>> idx.set_codes([0,0,1,1], level='bar')\n MultiIndex(levels=[[1, 2], ['one', 'two']],\n codes=[[0, 0, 1, 1], [0, 0, 1, 1]],\n names=['foo', 'bar'])\n >>> idx.set_codes([[1,0,1,0], [0,0,1,1]], level=[0,1])\n 
MultiIndex(levels=[[1, 2], ['one', 'two']],\n codes=[[1, 0, 1, 0], [0, 0, 1, 1]],\n names=['foo', 'bar'])\n \"\"\"\n if level is not None and not is_list_like(level):\n if not is_list_like(codes):\n raise TypeError(\"Codes must be list-like\")\n if is_list_like(codes[0]):\n raise TypeError(\"Codes must be list-like\")\n level = [level]\n codes = [codes]\n elif level is None or is_list_like(level):\n if not is_list_like(codes) or not is_list_like(codes[0]):\n raise TypeError(\"Codes must be list of lists-like\")\n\n if inplace:\n idx = self\n else:\n idx = self._shallow_copy()\n idx._reset_identity()\n idx._set_codes(codes, level=level, verify_integrity=verify_integrity)\n if not inplace:\n return idx"} {"instruction": "How would you explain what the following Python 3 function does\ndef copy(self, names=None, dtype=None, levels=None, codes=None,\n deep=False, _set_identity=False, **kwargs):\n \"\"\"\n Make a copy of this object. Names, dtype, levels and codes can be\n passed and will be set on new copy.\n\n Parameters\n ----------\n names : sequence, optional\n dtype : numpy dtype or pandas type, optional\n levels : sequence, optional\n codes : sequence, optional\n\n Returns\n -------\n copy : MultiIndex\n\n Notes\n -----\n In most cases, there should be no functional difference from using\n ``deep``, but if ``deep`` is passed it will attempt to deepcopy.\n This could be potentially expensive on large MultiIndex objects.\n \"\"\"\n name = kwargs.get('name')\n names = self._validate_names(name=name, names=names, deep=deep)\n\n if deep:\n from copy import deepcopy\n if levels is None:\n levels = deepcopy(self.levels)\n if codes is None:\n codes = deepcopy(self.codes)\n else:\n if levels is None:\n levels = self.levels\n if codes is None:\n codes = self.codes\n return MultiIndex(levels=levels, codes=codes, names=names,\n sortorder=self.sortorder, verify_integrity=False,\n _set_identity=_set_identity)", "response": "Make a copy of this MultiIndex object."} 
{"instruction": "Make a summary of the following Python 3 code\ndef _is_memory_usage_qualified(self):\n def f(l):\n return 'mixed' in l or 'string' in l or 'unicode' in l\n return any(f(l) for l in self._inferred_type_levels)", "response": "return a boolean if we need a qualified. info display"} {"instruction": "Make a summary of the following Python 3 code\ndef _nbytes(self, deep=False):\n\n # for implementations with no useful getsizeof (PyPy)\n objsize = 24\n\n level_nbytes = sum(i.memory_usage(deep=deep) for i in self.levels)\n label_nbytes = sum(i.nbytes for i in self.codes)\n names_nbytes = sum(getsizeof(i, objsize) for i in self.names)\n result = level_nbytes + label_nbytes + names_nbytes\n\n # include our engine hashtable\n result += self._engine.sizeof(deep=deep)\n return result", "response": "return the number of bytes in the underlying data structures if deep = True\n include the engine hashtable\n "} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _format_attrs(self):\n attrs = [\n ('levels', ibase.default_pprint(self._levels,\n max_seq_items=False)),\n ('codes', ibase.default_pprint(self._codes,\n max_seq_items=False))]\n if com._any_not_none(*self.names):\n attrs.append(('names', ibase.default_pprint(self.names)))\n if self.sortorder is not None:\n attrs.append(('sortorder', ibase.default_pprint(self.sortorder)))\n return attrs", "response": "Return a list of tuples of the attr formatted_value"} {"instruction": "Create a Python 3 function to\nset new names on the index.", "response": "def _set_names(self, names, level=None, validate=True):\n \"\"\"\n Set new names on index. Each name has to be a hashable type.\n\n Parameters\n ----------\n values : str or sequence\n name(s) to set\n level : int, level name, or sequence of int/level names (default None)\n If the index is a MultiIndex (hierarchical), level(s) to set (None\n for all levels). 
Otherwise level must be None\n validate : boolean, default True\n validate that the names match level lengths\n\n Raises\n ------\n TypeError if each name is not hashable.\n\n Notes\n -----\n sets names on levels. WARNING: mutates!\n\n Note that you generally want to set this *after* changing levels, so\n that it only acts on copies\n \"\"\"\n # GH 15110\n # Don't allow a single string for names in a MultiIndex\n if names is not None and not is_list_like(names):\n raise ValueError('Names should be list-like for a MultiIndex')\n names = list(names)\n\n if validate and level is not None and len(names) != len(level):\n raise ValueError('Length of names must match length of level.')\n if validate and level is None and len(names) != self.nlevels:\n raise ValueError('Length of names must match number of levels in '\n 'MultiIndex.')\n\n if level is None:\n level = range(self.nlevels)\n else:\n level = [self._get_level_number(l) for l in level]\n\n # set the name\n for l, name in zip(level, names):\n if name is not None:\n # GH 20527\n # All items in 'names' need to be hashable:\n if not is_hashable(name):\n raise TypeError('{}.name must be a hashable type'\n .format(self.__class__.__name__))\n self.levels[l].rename(name, inplace=True)"} {"instruction": "Explain what the following Python 3 code does\ndef is_monotonic_increasing(self):\n\n # reversed() because lexsort() wants the most significant key last.\n values = [self._get_level_values(i).values\n for i in reversed(range(len(self.levels)))]\n try:\n sort_order = np.lexsort(values)\n return Index(sort_order).is_monotonic\n except TypeError:\n\n # we have mixed types and np.lexsort is not happy\n return Index(self.values).is_monotonic", "response": "return if the index is monotonic increasing"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _hashed_indexing_key(self, key):\n from pandas.core.util.hashing import hash_tuples, hash_tuple\n\n if not isinstance(key, tuple):\n return 
hash_tuples(key)\n\n if not len(key) == self.nlevels:\n raise KeyError\n\n def f(k, stringify):\n if stringify and not isinstance(k, str):\n k = str(k)\n return k\n key = tuple(f(k, stringify)\n for k, stringify in zip(key, self._have_mixed_levels))\n return hash_tuple(key)", "response": "validate and return the hash for the provided key"} {"instruction": "Make a summary of the following Python 3 code\ndef _get_level_values(self, level, unique=False):\n\n values = self.levels[level]\n level_codes = self.codes[level]\n if unique:\n level_codes = algos.unique(level_codes)\n filled = algos.take_1d(values._values, level_codes,\n fill_value=values._na_value)\n values = values._shallow_copy(filled)\n return values", "response": "Returns vector of label values for requested level"} {"instruction": "Write a Python 3 script to\nget vector of label values for requested level.", "response": "def get_level_values(self, level):\n \"\"\"\n Return vector of label values for requested level,\n equal to the length of the index.\n\n Parameters\n ----------\n level : int or str\n ``level`` is either the integer position of the level in the\n MultiIndex, or the name of the level.\n\n Returns\n -------\n values : Index\n Values is a level of this MultiIndex converted to\n a single :class:`Index` (or subclass thereof).\n\n Examples\n ---------\n\n Create a MultiIndex:\n\n >>> mi = pd.MultiIndex.from_arrays((list('abc'), list('def')))\n >>> mi.names = ['level_1', 'level_2']\n\n Get level values by supplying level as either integer or name:\n\n >>> mi.get_level_values(0)\n Index(['a', 'b', 'c'], dtype='object', name='level_1')\n >>> mi.get_level_values('level_2')\n Index(['d', 'e', 'f'], dtype='object', name='level_2')\n \"\"\"\n level = self._get_level_number(level)\n values = self._get_level_values(level)\n return values"} {"instruction": "Implement a function in Python 3 to\ncreate a DataFrame containing the levels of the MultiIndex as columns.", "response": "def to_frame(self, 
index=True, name=None):\n \"\"\"\n Create a DataFrame with the levels of the MultiIndex as columns.\n\n Column ordering is determined by the DataFrame constructor with data as\n a dict.\n\n .. versionadded:: 0.24.0\n\n Parameters\n ----------\n index : boolean, default True\n Set the index of the returned DataFrame as the original MultiIndex.\n\n name : list / sequence of strings, optional\n The passed names should substitute index level names.\n\n Returns\n -------\n DataFrame : a DataFrame containing the original MultiIndex data.\n\n See Also\n --------\n DataFrame\n \"\"\"\n\n from pandas import DataFrame\n if name is not None:\n if not is_list_like(name):\n raise TypeError(\"'name' must be a list / sequence \"\n \"of column names.\")\n\n if len(name) != len(self.levels):\n raise ValueError(\"'name' should have same length as \"\n \"number of levels on index.\")\n idx_names = name\n else:\n idx_names = self.names\n\n # Guarantee resulting column order\n result = DataFrame(\n OrderedDict([\n ((level if lvlname is None else lvlname),\n self._get_level_values(level))\n for lvlname, level in zip(idx_names, range(len(self.levels)))\n ]),\n copy=False\n )\n\n if index:\n result.index = self\n return result"} {"instruction": "Given the following Python 3 function, write the documentation\ndef to_hierarchical(self, n_repeat, n_shuffle=1):\n levels = self.levels\n codes = [np.repeat(level_codes, n_repeat) for\n level_codes in self.codes]\n # Assumes that each level_codes is divisible by n_shuffle\n codes = [x.reshape(n_shuffle, -1).ravel(order='F') for x in codes]\n names = self.names\n warnings.warn(\"Method .to_hierarchical is deprecated and will \"\n \"be removed in a future version\",\n FutureWarning, stacklevel=2)\n return MultiIndex(levels=levels, codes=codes, names=names)", "response": "Return a MultiIndex reshaped to conform to the shapes given by n_repeat and n_shuffle."} {"instruction": "Given the following Python 3 function, write the documentation\ndef 
_sort_levels_monotonic(self):\n\n if self.is_lexsorted() and self.is_monotonic:\n return self\n\n new_levels = []\n new_codes = []\n\n for lev, level_codes in zip(self.levels, self.codes):\n\n if not lev.is_monotonic:\n try:\n # indexer to reorder the levels\n indexer = lev.argsort()\n except TypeError:\n pass\n else:\n lev = lev.take(indexer)\n\n # indexer to reorder the level codes\n indexer = ensure_int64(indexer)\n ri = lib.get_reverse_indexer(indexer, len(indexer))\n level_codes = algos.take_1d(ri, level_codes)\n\n new_levels.append(lev)\n new_codes.append(level_codes)\n\n return MultiIndex(new_levels, new_codes,\n names=self.names, sortorder=self.sortorder,\n verify_integrity=False)", "response": "This function sorts the levels in the current MultiIndex to monotonically sorted by level codes."} {"instruction": "Implement a function in Python 3 to\ncreate a MultiIndex from the current that removes unused levels.", "response": "def remove_unused_levels(self):\n \"\"\"\n Create a new MultiIndex from the current that removes\n unused levels, meaning that they are not expressed in the labels.\n\n The resulting MultiIndex will have the same outward\n appearance, meaning the same .values and ordering. It will also\n be .equals() to the original.\n\n .. 
versionadded:: 0.20.0\n\n Returns\n -------\n MultiIndex\n\n Examples\n --------\n >>> i = pd.MultiIndex.from_product([range(2), list('ab')])\n MultiIndex(levels=[[0, 1], ['a', 'b']],\n codes=[[0, 0, 1, 1], [0, 1, 0, 1]])\n\n >>> i[2:]\n MultiIndex(levels=[[0, 1], ['a', 'b']],\n codes=[[1, 1], [0, 1]])\n\n The 0 from the first level is not represented\n and can be removed\n\n >>> i[2:].remove_unused_levels()\n MultiIndex(levels=[[1], ['a', 'b']],\n codes=[[0, 0], [0, 1]])\n \"\"\"\n\n new_levels = []\n new_codes = []\n\n changed = False\n for lev, level_codes in zip(self.levels, self.codes):\n\n # Since few levels are typically unused, bincount() is more\n # efficient than unique() - however it only accepts positive values\n # (and drops order):\n uniques = np.where(np.bincount(level_codes + 1) > 0)[0] - 1\n has_na = int(len(uniques) and (uniques[0] == -1))\n\n if len(uniques) != len(lev) + has_na:\n # We have unused levels\n changed = True\n\n # Recalculate uniques, now preserving order.\n # Can easily be cythonized by exploiting the already existing\n # \"uniques\" and stop parsing \"level_codes\" when all items\n # are found:\n uniques = algos.unique(level_codes)\n if has_na:\n na_idx = np.where(uniques == -1)[0]\n # Just ensure that -1 is in first position:\n uniques[[0, na_idx[0]]] = uniques[[na_idx[0], 0]]\n\n # codes get mapped from uniques to 0:len(uniques)\n # -1 (if present) is mapped to last position\n code_mapping = np.zeros(len(lev) + has_na)\n # ... 
and reassigned value -1:\n code_mapping[uniques] = np.arange(len(uniques)) - has_na\n\n level_codes = code_mapping[level_codes]\n\n # new levels are simple\n lev = lev.take(uniques[has_na:])\n\n new_levels.append(lev)\n new_codes.append(level_codes)\n\n result = self._shallow_copy()\n\n if changed:\n result._reset_identity()\n result._set_levels(new_levels, validate=False)\n result._set_codes(new_codes, validate=False)\n\n return result"} {"instruction": "Make a summary of the following Python 3 code\ndef _assert_take_fillable(self, values, indices, allow_fill=True,\n fill_value=None, na_value=None):\n \"\"\" Internal method to handle NA filling of take \"\"\"\n # only fill if we are passing a non-None fill_value\n if allow_fill and fill_value is not None:\n if (indices < -1).any():\n msg = ('When allow_fill=True and fill_value is not None, '\n 'all indices must be >= -1')\n raise ValueError(msg)\n taken = [lab.take(indices) for lab in self.codes]\n mask = indices == -1\n if mask.any():\n masked = []\n for new_label in taken:\n label_values = new_label.values()\n label_values[mask] = na_value\n masked.append(np.asarray(label_values))\n taken = masked\n else:\n taken = [lab.take(indices) for lab in self.codes]\n return taken", "response": "Internal method to handle NA filling of take"} {"instruction": "Create a Python 3 function for\nappending a collection of Index options together with other.", "response": "def append(self, other):\n \"\"\"\n Append a collection of Index options together\n\n Parameters\n ----------\n other : Index or list/tuple of indices\n\n Returns\n -------\n appended : Index\n \"\"\"\n if not isinstance(other, (list, tuple)):\n other = [other]\n\n if all((isinstance(o, MultiIndex) and o.nlevels >= self.nlevels)\n for o in other):\n arrays = []\n for i in range(self.nlevels):\n label = self._get_level_values(i)\n appended = [o._get_level_values(i) for o in other]\n arrays.append(label.append(appended))\n return MultiIndex.from_arrays(arrays, 
names=self.names)\n\n to_concat = (self.values, ) + tuple(k._values for k in other)\n new_tuples = np.concatenate(to_concat)\n\n # if all(isinstance(x, MultiIndex) for x in other):\n try:\n return MultiIndex.from_tuples(new_tuples, names=self.names)\n except (TypeError, IndexError):\n return Index(new_tuples)"} {"instruction": "How would you explain what the following Python 3 function does\ndef drop(self, codes, level=None, errors='raise'):\n if level is not None:\n return self._drop_from_level(codes, level)\n\n try:\n if not isinstance(codes, (np.ndarray, Index)):\n codes = com.index_labels_to_array(codes)\n indexer = self.get_indexer(codes)\n mask = indexer == -1\n if mask.any():\n if errors != 'ignore':\n raise ValueError('codes %s not contained in axis' %\n codes[mask])\n except Exception:\n pass\n\n inds = []\n for level_codes in codes:\n try:\n loc = self.get_loc(level_codes)\n # get_loc returns either an integer, a slice, or a boolean\n # mask\n if isinstance(loc, int):\n inds.append(loc)\n elif isinstance(loc, slice):\n inds.extend(lrange(loc.start, loc.stop))\n elif com.is_bool_indexer(loc):\n if self.lexsort_depth == 0:\n warnings.warn('dropping on a non-lexsorted multi-index'\n ' without a level parameter may impact '\n 'performance.',\n PerformanceWarning,\n stacklevel=3)\n loc = loc.nonzero()[0]\n inds.extend(loc)\n else:\n msg = 'unsupported indexer of type {}'.format(type(loc))\n raise AssertionError(msg)\n except KeyError:\n if errors != 'ignore':\n raise\n\n return self.delete(inds)", "response": "Drop a set of codes from the MultiIndex."} {"instruction": "Make a summary of the following Python 3 code\ndef swaplevel(self, i=-2, j=-1):\n new_levels = list(self.levels)\n new_codes = list(self.codes)\n new_names = list(self.names)\n\n i = self._get_level_number(i)\n j = self._get_level_number(j)\n\n new_levels[i], new_levels[j] = new_levels[j], new_levels[i]\n new_codes[i], new_codes[j] = new_codes[j], new_codes[i]\n new_names[i], new_names[j] = 
new_names[j], new_names[i]\n\n return MultiIndex(levels=new_levels, codes=new_codes,\n names=new_names, verify_integrity=False)", "response": "This method is used to swap level i with level j."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef reorder_levels(self, order):\n order = [self._get_level_number(i) for i in order]\n if len(order) != self.nlevels:\n raise AssertionError('Length of order must be same as '\n 'number of levels (%d), got %d' %\n (self.nlevels, len(order)))\n new_levels = [self.levels[i] for i in order]\n new_codes = [self.codes[i] for i in order]\n new_names = [self.names[i] for i in order]\n\n return MultiIndex(levels=new_levels, codes=new_codes,\n names=new_names, verify_integrity=False)", "response": "Rearrange levels using input order. May not drop duplicate levels\n "} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _get_codes_for_sorting(self):\n from pandas.core.arrays import Categorical\n\n def cats(level_codes):\n return np.arange(np.array(level_codes).max() + 1 if\n len(level_codes) else 0,\n dtype=level_codes.dtype)\n\n return [Categorical.from_codes(level_codes, cats(level_codes),\n ordered=True)\n for level_codes in self.codes]", "response": "Get the codes for sorting the items in the cache."} {"instruction": "Write a Python 3 function that can\nsort the associated factor at a specific level.", "response": "def sortlevel(self, level=0, ascending=True, sort_remaining=True):\n \"\"\"\n Sort MultiIndex at the requested level. 
The result will respect the\n original ordering of the associated factor at that level.\n\n Parameters\n ----------\n level : list-like, int or str, default 0\n If a string is given, must be a name of the level\n If list-like must be names or ints of levels.\n ascending : boolean, default True\n False to sort in descending order\n Can also be a list to specify a directed ordering\n sort_remaining : sort by the remaining levels after level\n\n Returns\n -------\n sorted_index : pd.MultiIndex\n Resulting index.\n indexer : np.ndarray\n Indices of output values in original index.\n \"\"\"\n from pandas.core.sorting import indexer_from_factorized\n\n if isinstance(level, (str, int)):\n level = [level]\n level = [self._get_level_number(lev) for lev in level]\n sortorder = None\n\n # we have a directed ordering via ascending\n if isinstance(ascending, list):\n if not len(level) == len(ascending):\n raise ValueError(\"level must have same length as ascending\")\n\n from pandas.core.sorting import lexsort_indexer\n indexer = lexsort_indexer([self.codes[lev] for lev in level],\n orders=ascending)\n\n # level ordering\n else:\n\n codes = list(self.codes)\n shape = list(self.levshape)\n\n # partition codes and shape\n primary = tuple(codes[lev] for lev in level)\n primshp = tuple(shape[lev] for lev in level)\n\n # Reverse sorted to retain the order of\n # smaller indices that needs to be removed\n for lev in sorted(level, reverse=True):\n codes.pop(lev)\n shape.pop(lev)\n\n if sort_remaining:\n primary += primary + tuple(codes)\n primshp += primshp + tuple(shape)\n else:\n sortorder = level[0]\n\n indexer = indexer_from_factorized(primary, primshp,\n compress=False)\n\n if not ascending:\n indexer = indexer[::-1]\n\n indexer = ensure_platform_int(indexer)\n new_codes = [level_codes.take(indexer) for level_codes in self.codes]\n\n new_index = MultiIndex(codes=new_codes, levels=self.levels,\n names=self.names, sortorder=sortorder,\n verify_integrity=False)\n\n return new_index, 
indexer"} {"instruction": "Implement a Python 3 function for\nconverting a list - like object to a tuple - safe indexer.", "response": "def _convert_listlike_indexer(self, keyarr, kind=None):\n \"\"\"\n Parameters\n ----------\n keyarr : list-like\n Indexer to convert.\n\n Returns\n -------\n tuple (indexer, keyarr)\n indexer is an ndarray or None if cannot convert\n keyarr are tuple-safe keys\n \"\"\"\n indexer, keyarr = super()._convert_listlike_indexer(keyarr, kind=kind)\n\n # are we indexing a specific level\n if indexer is None and len(keyarr) and not isinstance(keyarr[0],\n tuple):\n level = 0\n _, indexer = self.reindex(keyarr, level=level)\n\n # take all\n if indexer is None:\n indexer = np.arange(len(self))\n\n check = self.levels[0].get_indexer(keyarr)\n mask = check == -1\n if mask.any():\n raise KeyError('%s not in index' % keyarr[mask])\n\n return indexer, keyarr"} {"instruction": "Implement a function in Python 3 to\ncreate index with target's values (move/add/delete values as necessary) Returns ------- new_index : pd.MultiIndex Resulting index indexer : np.ndarray or None Indices of output values in original index.", "response": "def reindex(self, target, method=None, level=None, limit=None,\n tolerance=None):\n \"\"\"\n Create index with target's values (move/add/delete values as necessary)\n\n Returns\n -------\n new_index : pd.MultiIndex\n Resulting index\n indexer : np.ndarray or None\n Indices of output values in original index.\n\n \"\"\"\n # GH6552: preserve names when reindexing to non-named target\n # (i.e. 
neither Index nor Series).\n preserve_names = not hasattr(target, 'names')\n\n if level is not None:\n if method is not None:\n raise TypeError('Fill method not supported if level passed')\n\n # GH7774: preserve dtype/tz if target is empty and not an Index.\n # target may be an iterator\n target = ibase._ensure_has_len(target)\n if len(target) == 0 and not isinstance(target, Index):\n idx = self.levels[level]\n attrs = idx._get_attributes_dict()\n attrs.pop('freq', None) # don't preserve freq\n target = type(idx)._simple_new(np.empty(0, dtype=idx.dtype),\n **attrs)\n else:\n target = ensure_index(target)\n target, indexer, _ = self._join_level(target, level, how='right',\n return_indexers=True,\n keep_order=False)\n else:\n target = ensure_index(target)\n if self.equals(target):\n indexer = None\n else:\n if self.is_unique:\n indexer = self.get_indexer(target, method=method,\n limit=limit,\n tolerance=tolerance)\n else:\n raise ValueError(\"cannot handle a non-unique multi-index!\")\n\n if not isinstance(target, MultiIndex):\n if indexer is None:\n target = self\n elif (indexer >= 0).all():\n target = self.take(indexer)\n else:\n # hopefully?\n target = MultiIndex.from_tuples(target)\n\n if (preserve_names and target.nlevels == self.nlevels and\n target.names != self.names):\n target = target.copy(deep=False)\n target.names = self.names\n\n return target, indexer"} {"instruction": "Make a summary of the following Python 3 code\ndef slice_locs(self, start=None, end=None, step=None, kind=None):\n # This function adds nothing to its parent implementation (the magic\n # happens in get_slice_bound method), but it adds meaningful doc.\n return super().slice_locs(start, end, step, kind=kind)", "response": "This method returns the locations of the slice locations for the input labels."} {"instruction": "How would you code a function in Python 3 to\nget the location of a key in the index.", "response": "def get_loc(self, key, method=None):\n \"\"\"\n Get location for a 
label or a tuple of labels as an integer, slice or\n boolean mask.\n\n Parameters\n ----------\n key : label or tuple of labels (one for each level)\n method : None\n\n Returns\n -------\n loc : int, slice object or boolean mask\n If the key is past the lexsort depth, the return may be a\n boolean mask array, otherwise it is always a slice or int.\n\n Examples\n ---------\n >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')])\n\n >>> mi.get_loc('b')\n slice(1, 3, None)\n\n >>> mi.get_loc(('b', 'e'))\n 1\n\n Notes\n ------\n The key cannot be a slice, list of same-level labels, a boolean mask,\n or a sequence of such. If you want to use those, use\n :meth:`MultiIndex.get_locs` instead.\n\n See Also\n --------\n Index.get_loc : The get_loc method for (single-level) index.\n MultiIndex.slice_locs : Get slice location given start label(s) and\n end label(s).\n MultiIndex.get_locs : Get location for a label/slice/list/mask or a\n sequence of such.\n \"\"\"\n if method is not None:\n raise NotImplementedError('only the default get_loc method is '\n 'currently supported for MultiIndex')\n\n def _maybe_to_slice(loc):\n \"\"\"convert integer indexer to boolean mask or slice if possible\"\"\"\n if not isinstance(loc, np.ndarray) or loc.dtype != 'int64':\n return loc\n\n loc = lib.maybe_indices_to_slice(loc, len(self))\n if isinstance(loc, slice):\n return loc\n\n mask = np.empty(len(self), dtype='bool')\n mask.fill(False)\n mask[loc] = True\n return mask\n\n if not isinstance(key, tuple):\n loc = self._get_level_indexer(key, level=0)\n return _maybe_to_slice(loc)\n\n keylen = len(key)\n if self.nlevels < keylen:\n raise KeyError('Key length ({0}) exceeds index depth ({1})'\n ''.format(keylen, self.nlevels))\n\n if keylen == self.nlevels and self.is_unique:\n return self._engine.get_loc(key)\n\n # -- partial selection or non-unique index\n # break the key into 2 parts based on the lexsort_depth of the index;\n # the first part returns a continuous slice of the 
index; the 2nd part\n # needs linear search within the slice\n i = self.lexsort_depth\n lead_key, follow_key = key[:i], key[i:]\n start, stop = (self.slice_locs(lead_key, lead_key)\n if lead_key else (0, len(self)))\n\n if start == stop:\n raise KeyError(key)\n\n if not follow_key:\n return slice(start, stop)\n\n warnings.warn('indexing past lexsort depth may impact performance.',\n PerformanceWarning, stacklevel=10)\n\n loc = np.arange(start, stop, dtype='int64')\n\n for i, k in enumerate(follow_key, len(lead_key)):\n mask = self.codes[i][loc] == self.levels[i].get_loc(k)\n if not mask.all():\n loc = loc[mask]\n if not len(loc):\n raise KeyError(key)\n\n return (_maybe_to_slice(loc) if len(loc) != stop - start else\n slice(start, stop))"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef get_loc_level(self, key, level=0, drop_level=True):\n\n def maybe_droplevels(indexer, levels, drop_level):\n if not drop_level:\n return self[indexer]\n # kludgearound\n orig_index = new_index = self[indexer]\n levels = [self._get_level_number(i) for i in levels]\n for i in sorted(levels, reverse=True):\n try:\n new_index = new_index.droplevel(i)\n except ValueError:\n\n # no dropping here\n return orig_index\n return new_index\n\n if isinstance(level, (tuple, list)):\n if len(key) != len(level):\n raise AssertionError('Key for location must have same '\n 'length as number of levels')\n result = None\n for lev, k in zip(level, key):\n loc, new_index = self.get_loc_level(k, level=lev)\n if isinstance(loc, slice):\n mask = np.zeros(len(self), dtype=bool)\n mask[loc] = True\n loc = mask\n\n result = loc if result is None else result & loc\n\n return result, maybe_droplevels(result, level, drop_level)\n\n level = self._get_level_number(level)\n\n # kludge for #1796\n if isinstance(key, list):\n key = tuple(key)\n\n if isinstance(key, tuple) and level == 0:\n\n try:\n if key in self.levels[0]:\n indexer = self._get_level_indexer(key, level=level)\n 
new_index = maybe_droplevels(indexer, [0], drop_level)\n return indexer, new_index\n except TypeError:\n pass\n\n if not any(isinstance(k, slice) for k in key):\n\n # partial selection\n # optionally get indexer to avoid re-calculation\n def partial_selection(key, indexer=None):\n if indexer is None:\n indexer = self.get_loc(key)\n ilevels = [i for i in range(len(key))\n if key[i] != slice(None, None)]\n return indexer, maybe_droplevels(indexer, ilevels,\n drop_level)\n\n if len(key) == self.nlevels and self.is_unique:\n # Complete key in unique index -> standard get_loc\n return (self._engine.get_loc(key), None)\n else:\n return partial_selection(key)\n else:\n indexer = None\n for i, k in enumerate(key):\n if not isinstance(k, slice):\n k = self._get_level_indexer(k, level=i)\n if isinstance(k, slice):\n # everything\n if k.start == 0 and k.stop == len(self):\n k = slice(None, None)\n else:\n k_index = k\n\n if isinstance(k, slice):\n if k == slice(None, None):\n continue\n else:\n raise TypeError(key)\n\n if indexer is None:\n indexer = k_index\n else: # pragma: no cover\n indexer &= k_index\n if indexer is None:\n indexer = slice(None, None)\n ilevels = [i for i in range(len(key))\n if key[i] != slice(None, None)]\n return indexer, maybe_droplevels(indexer, ilevels, drop_level)\n else:\n indexer = self._get_level_indexer(key, level=level)\n return indexer, maybe_droplevels(indexer, [level], drop_level)", "response": "Get both the location for the requested label or sequence of labels and level."} {"instruction": "Write a Python 3 function for\ngetting locations of the related items in the index.", "response": "def get_locs(self, seq):\n \"\"\"\n Get location for a given label/slice/list/mask or a sequence of such as\n an array of integers.\n\n Parameters\n ----------\n seq : label/slice/list/mask or a sequence of such\n You should use one of the above for each level.\n If a level should not be used, set it to ``slice(None)``.\n\n Returns\n -------\n locs : 
array of integers suitable for passing to iloc\n\n Examples\n ---------\n >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')])\n\n >>> mi.get_locs('b')\n array([1, 2], dtype=int64)\n\n >>> mi.get_locs([slice(None), ['e', 'f']])\n array([1, 2], dtype=int64)\n\n >>> mi.get_locs([[True, False, True], slice('e', 'f')])\n array([2], dtype=int64)\n\n See Also\n --------\n MultiIndex.get_loc : Get location for a label or a tuple of labels.\n MultiIndex.slice_locs : Get slice location given start label(s) and\n end label(s).\n \"\"\"\n from .numeric import Int64Index\n\n # must be lexsorted to at least as many levels\n true_slices = [i for (i, s) in enumerate(com.is_true_slices(seq)) if s]\n if true_slices and true_slices[-1] >= self.lexsort_depth:\n raise UnsortedIndexError('MultiIndex slicing requires the index '\n 'to be lexsorted: slicing on levels {0}, '\n 'lexsort depth {1}'\n .format(true_slices, self.lexsort_depth))\n # indexer\n # this is the list of all values that we want to select\n n = len(self)\n indexer = None\n\n def _convert_to_indexer(r):\n # return an indexer\n if isinstance(r, slice):\n m = np.zeros(n, dtype=bool)\n m[r] = True\n r = m.nonzero()[0]\n elif com.is_bool_indexer(r):\n if len(r) != n:\n raise ValueError(\"cannot index with a boolean indexer \"\n \"that is not the same length as the \"\n \"index\")\n r = r.nonzero()[0]\n return Int64Index(r)\n\n def _update_indexer(idxr, indexer=indexer):\n if indexer is None:\n indexer = Index(np.arange(n))\n if idxr is None:\n return indexer\n return indexer & idxr\n\n for i, k in enumerate(seq):\n\n if com.is_bool_indexer(k):\n # a boolean indexer, must be the same length!\n k = np.asarray(k)\n indexer = _update_indexer(_convert_to_indexer(k),\n indexer=indexer)\n\n elif is_list_like(k):\n # a collection of labels to include from this level (these\n # are or'd)\n indexers = None\n for x in k:\n try:\n idxrs = _convert_to_indexer(\n self._get_level_indexer(x, level=i,\n indexer=indexer))\n 
indexers = (idxrs if indexers is None\n else indexers | idxrs)\n except KeyError:\n\n # ignore not founds\n continue\n\n if indexers is not None:\n indexer = _update_indexer(indexers, indexer=indexer)\n else:\n # no matches we are done\n return Int64Index([])._ndarray_values\n\n elif com.is_null_slice(k):\n # empty slice\n indexer = _update_indexer(None, indexer=indexer)\n\n elif isinstance(k, slice):\n\n # a slice, include BOTH of the labels\n indexer = _update_indexer(_convert_to_indexer(\n self._get_level_indexer(k, level=i, indexer=indexer)),\n indexer=indexer)\n else:\n # a single label\n indexer = _update_indexer(_convert_to_indexer(\n self.get_loc_level(k, level=i, drop_level=False)[0]),\n indexer=indexer)\n\n # empty indexer\n if indexer is None:\n return Int64Index([])._ndarray_values\n return indexer._ndarray_values"} {"instruction": "How would you code a function in Python 3 to\ntruncate the MultiIndex to a new index between two labels or tuples.", "response": "def truncate(self, before=None, after=None):\n \"\"\"\n Slice index between two labels / tuples, return new MultiIndex\n\n Parameters\n ----------\n before : label or tuple, can be partial. Default None\n None defaults to start\n after : label or tuple, can be partial. 
Default None\n None defaults to end\n\n Returns\n -------\n truncated : MultiIndex\n \"\"\"\n if after and before and after < before:\n raise ValueError('after < before')\n\n i, j = self.levels[0].slice_locs(before, after)\n left, right = self.slice_locs(before, after)\n\n new_levels = list(self.levels)\n new_levels[0] = new_levels[0][i:j]\n\n new_codes = [level_codes[left:right] for level_codes in self.codes]\n new_codes[0] = new_codes[0] - i\n\n return MultiIndex(levels=new_levels, codes=new_codes,\n verify_integrity=False)"} {"instruction": "Write a Python 3 function that can\ndetermine if two MultiIndex objects have the same labeling information.", "response": "def equals(self, other):\n \"\"\"\n Determines if two MultiIndex objects have the same labeling information\n (the levels themselves do not necessarily have to be the same)\n\n See Also\n --------\n equal_levels\n \"\"\"\n if self.is_(other):\n return True\n\n if not isinstance(other, Index):\n return False\n\n if not isinstance(other, MultiIndex):\n other_vals = com.values_from_object(ensure_index(other))\n return array_equivalent(self._ndarray_values, other_vals)\n\n if self.nlevels != other.nlevels:\n return False\n\n if len(self) != len(other):\n return False\n\n for i in range(self.nlevels):\n self_codes = self.codes[i]\n self_codes = self_codes[self_codes != -1]\n self_values = algos.take_nd(np.asarray(self.levels[i]._values),\n self_codes, allow_fill=False)\n\n other_codes = other.codes[i]\n other_codes = other_codes[other_codes != -1]\n other_values = algos.take_nd(\n np.asarray(other.levels[i]._values),\n other_codes, allow_fill=False)\n\n # since we use NaT both datetime64 and timedelta64\n # we can have a situation where a level is typed say\n # timedelta64 in self (IOW it has other values than NaT)\n # but types datetime64 in other (where its all NaT)\n # but these are equivalent\n if len(self_values) == 0 and len(other_values) == 0:\n continue\n\n if not array_equivalent(self_values, 
other_values):\n return False\n\n return True"} {"instruction": "Write a Python 3 function for\nreturning True if the levels of both MultiIndex objects are the same.", "response": "def equal_levels(self, other):\n \"\"\"\n Return True if the levels of both MultiIndex objects are the same\n\n \"\"\"\n if self.nlevels != other.nlevels:\n return False\n\n for i in range(self.nlevels):\n if not self.levels[i].equals(other.levels[i]):\n return False\n return True"} {"instruction": "Create a Python 3 function for\nreturning the union of two MultiIndex objects.", "response": "def union(self, other, sort=None):\n \"\"\"\n Form the union of two MultiIndex objects\n\n Parameters\n ----------\n other : MultiIndex or array / Index of tuples\n sort : False or None, default None\n Whether to sort the resulting Index.\n\n * None : Sort the result, except when\n\n 1. `self` and `other` are equal.\n 2. `self` has length 0.\n 3. Some values in `self` or `other` cannot be compared.\n A RuntimeWarning is issued in this case.\n\n * False : do not sort the result.\n\n .. versionadded:: 0.24.0\n\n .. 
versionchanged:: 0.24.1\n\n Changed the default value from ``True`` to ``None``\n (without change in behaviour).\n\n Returns\n -------\n Index\n\n >>> index.union(index2)\n \"\"\"\n self._validate_sort_keyword(sort)\n self._assert_can_do_setop(other)\n other, result_names = self._convert_can_do_setop(other)\n\n if len(other) == 0 or self.equals(other):\n return self\n\n # TODO: Index.union returns other when `len(self)` is 0.\n\n uniq_tuples = lib.fast_unique_multiple([self._ndarray_values,\n other._ndarray_values],\n sort=sort)\n\n return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0,\n names=result_names)"} {"instruction": "How would you implement a function in Python 3 that\nreturns the intersection of two MultiIndex objects.", "response": "def intersection(self, other, sort=False):\n \"\"\"\n Form the intersection of two MultiIndex objects.\n\n Parameters\n ----------\n other : MultiIndex or array / Index of tuples\n sort : False or None, default False\n Sort the resulting MultiIndex if possible\n\n .. versionadded:: 0.24.0\n\n .. 
versionchanged:: 0.24.1\n\n Changed the default from ``True`` to ``False``, to match\n behaviour from before 0.24.0\n\n Returns\n -------\n Index\n \"\"\"\n self._validate_sort_keyword(sort)\n self._assert_can_do_setop(other)\n other, result_names = self._convert_can_do_setop(other)\n\n if self.equals(other):\n return self\n\n self_tuples = self._ndarray_values\n other_tuples = other._ndarray_values\n uniq_tuples = set(self_tuples) & set(other_tuples)\n\n if sort is None:\n uniq_tuples = sorted(uniq_tuples)\n\n if len(uniq_tuples) == 0:\n return MultiIndex(levels=self.levels,\n codes=[[]] * self.nlevels,\n names=result_names, verify_integrity=False)\n else:\n return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0,\n names=result_names)"} {"instruction": "Can you create a Python 3 function that\ncomputes set difference of two MultiIndex objects.", "response": "def difference(self, other, sort=None):\n \"\"\"\n Compute set difference of two MultiIndex objects\n\n Parameters\n ----------\n other : MultiIndex\n sort : False or None, default None\n Sort the resulting MultiIndex if possible\n\n .. versionadded:: 0.24.0\n\n .. 
versionchanged:: 0.24.1\n\n Changed the default value from ``True`` to ``None``\n (without change in behaviour).\n\n Returns\n -------\n diff : MultiIndex\n \"\"\"\n self._validate_sort_keyword(sort)\n self._assert_can_do_setop(other)\n other, result_names = self._convert_can_do_setop(other)\n\n if len(other) == 0:\n return self\n\n if self.equals(other):\n return MultiIndex(levels=self.levels,\n codes=[[]] * self.nlevels,\n names=result_names, verify_integrity=False)\n\n this = self._get_unique_index()\n\n indexer = this.get_indexer(other)\n indexer = indexer.take((indexer != -1).nonzero()[0])\n\n label_diff = np.setdiff1d(np.arange(this.size), indexer,\n assume_unique=True)\n difference = this.values.take(label_diff)\n if sort is None:\n difference = sorted(difference)\n\n if len(difference) == 0:\n return MultiIndex(levels=[[]] * self.nlevels,\n codes=[[]] * self.nlevels,\n names=result_names, verify_integrity=False)\n else:\n return MultiIndex.from_tuples(difference, sortorder=0,\n names=result_names)"} {"instruction": "Write a Python 3 script for\nmaking new MultiIndex inserting new item at location loc.", "response": "def insert(self, loc, item):\n \"\"\"\n Make new MultiIndex inserting new item at location\n\n Parameters\n ----------\n loc : int\n item : tuple\n Must be same length as number of levels in the MultiIndex\n\n Returns\n -------\n new_index : Index\n \"\"\"\n # Pad the key with empty strings if lower levels of the key\n # aren't specified:\n if not isinstance(item, tuple):\n item = (item, ) + ('', ) * (self.nlevels - 1)\n elif len(item) != self.nlevels:\n raise ValueError('Item must have length equal to number of '\n 'levels.')\n\n new_levels = []\n new_codes = []\n for k, level, level_codes in zip(item, self.levels, self.codes):\n if k not in level:\n # have to insert into level\n # must insert at end otherwise you have to recompute all the\n # other codes\n lev_loc = len(level)\n level = level.insert(lev_loc, k)\n else:\n lev_loc = 
level.get_loc(k)\n\n new_levels.append(level)\n new_codes.append(np.insert(\n ensure_int64(level_codes), loc, lev_loc))\n\n return MultiIndex(levels=new_levels, codes=new_codes,\n names=self.names, verify_integrity=False)"} {"instruction": "How would you implement a function in Python 3 that\nmakes new index with passed location deleted", "response": "def delete(self, loc):\n \"\"\"\n Make new index with passed location deleted\n\n Returns\n -------\n new_index : MultiIndex\n \"\"\"\n new_codes = [np.delete(level_codes, loc) for level_codes in self.codes]\n return MultiIndex(levels=self.levels, codes=new_codes,\n names=self.names, verify_integrity=False)"} {"instruction": "Explain what the following Python 3 code does\ndef _ensure_data(values, dtype=None):\n\n # we check some simple dtypes first\n try:\n if is_object_dtype(dtype):\n return ensure_object(np.asarray(values)), 'object', 'object'\n if is_bool_dtype(values) or is_bool_dtype(dtype):\n # we are actually coercing to uint64\n # until our algos support uint8 directly (see TODO)\n return np.asarray(values).astype('uint64'), 'bool', 'uint64'\n elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype):\n return ensure_int64(values), 'int64', 'int64'\n elif (is_unsigned_integer_dtype(values) or\n is_unsigned_integer_dtype(dtype)):\n return ensure_uint64(values), 'uint64', 'uint64'\n elif is_float_dtype(values) or is_float_dtype(dtype):\n return ensure_float64(values), 'float64', 'float64'\n elif is_object_dtype(values) and dtype is None:\n return ensure_object(np.asarray(values)), 'object', 'object'\n elif is_complex_dtype(values) or is_complex_dtype(dtype):\n\n # ignore the fact that we are casting to float\n # which discards complex parts\n with catch_warnings():\n simplefilter(\"ignore\", np.ComplexWarning)\n values = ensure_float64(values)\n return values, 'float64', 'float64'\n\n except (TypeError, ValueError, OverflowError):\n # if we are trying to coerce to a dtype\n # and it is incompat this 
will fall thru to here\n return ensure_object(values), 'object', 'object'\n\n # datetimelike\n if (needs_i8_conversion(values) or\n is_period_dtype(dtype) or\n is_datetime64_any_dtype(dtype) or\n is_timedelta64_dtype(dtype)):\n if is_period_dtype(values) or is_period_dtype(dtype):\n from pandas import PeriodIndex\n values = PeriodIndex(values)\n dtype = values.dtype\n elif is_timedelta64_dtype(values) or is_timedelta64_dtype(dtype):\n from pandas import TimedeltaIndex\n values = TimedeltaIndex(values)\n dtype = values.dtype\n else:\n # Datetime\n from pandas import DatetimeIndex\n values = DatetimeIndex(values)\n dtype = values.dtype\n\n return values.asi8, dtype, 'int64'\n\n elif (is_categorical_dtype(values) and\n (is_categorical_dtype(dtype) or dtype is None)):\n values = getattr(values, 'values', values)\n values = values.codes\n dtype = 'category'\n\n # we are actually coercing to int64\n # until our algos support int* directly (not all do)\n values = ensure_int64(values)\n\n return values, dtype, 'int64'\n\n # we have failed, return object\n values = np.asarray(values, dtype=np.object)\n return ensure_object(values), 'object', 'object'", "response": "This function will ensure that our data is of the correct dtype for the base - level functions."} {"instruction": "Implement a function in Python 3 to\nreconstruct the data array for the .", "response": "def _reconstruct_data(values, dtype, original):\n \"\"\"\n reverse of _ensure_data\n\n Parameters\n ----------\n values : ndarray\n dtype : pandas_dtype\n original : ndarray-like\n\n Returns\n -------\n Index for extension types, otherwise ndarray casted to dtype\n \"\"\"\n from pandas import Index\n if is_extension_array_dtype(dtype):\n values = dtype.construct_array_type()._from_sequence(values)\n elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype):\n values = Index(original)._shallow_copy(values, name=None)\n elif is_bool_dtype(dtype):\n values = values.astype(dtype)\n\n # we only support object dtypes 
bool Index\n if isinstance(original, Index):\n values = values.astype(object)\n elif dtype is not None:\n values = values.astype(dtype)\n\n return values"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _ensure_arraylike(values):\n if not is_array_like(values):\n inferred = lib.infer_dtype(values, skipna=False)\n if inferred in ['mixed', 'string', 'unicode']:\n if isinstance(values, tuple):\n values = list(values)\n values = construct_1d_object_array_from_listlike(values)\n else:\n values = np.asarray(values)\n return values", "response": "ensure that we are arraylike if not already\nAGEREAD"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _get_hashtable_algo(values):\n values, dtype, ndtype = _ensure_data(values)\n\n if ndtype == 'object':\n\n # it's cheaper to use a String Hash Table than Object; we infer\n # including nulls because that is the only difference between\n # StringHashTable and ObjectHashtable\n if lib.infer_dtype(values, skipna=False) in ['string']:\n ndtype = 'string'\n else:\n ndtype = 'object'\n\n htable, table = _hashtables[ndtype]\n return (htable, table, values, dtype, ndtype)", "response": "Returns the hashtable class vector class dtype ndtype for the base hashtable class."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef match(to_match, values, na_sentinel=-1):\n values = com.asarray_tuplesafe(values)\n htable, _, values, dtype, ndtype = _get_hashtable_algo(values)\n to_match, _, _ = _ensure_data(to_match, dtype)\n table = htable(min(len(to_match), 1000000))\n table.map_locations(values)\n result = table.lookup(to_match)\n\n if na_sentinel != -1:\n\n # replace but return a numpy array\n # use a Series because it handles dtype conversions properly\n from pandas import Series\n result = Series(result.ravel()).replace(-1, na_sentinel)\n result = result.values.reshape(result.shape)\n\n return result", "response": "Compute 
locations of to_match into values\n "} {"instruction": "Can you write a function in Python 3 where it\nreturns a 1d array - like containing the unique values of the given array - like objects.", "response": "def unique(values):\n \"\"\"\n Hash table-based unique. Uniques are returned in order\n of appearance. This does NOT sort.\n\n Significantly faster than numpy.unique. Includes NA values.\n\n Parameters\n ----------\n values : 1d array-like\n\n Returns\n -------\n numpy.ndarray or ExtensionArray\n\n The return can be:\n\n * Index : when the input is an Index\n * Categorical : when the input is a Categorical dtype\n * ndarray : when the input is a Series/ndarray\n\n Return numpy.ndarray or ExtensionArray.\n\n See Also\n --------\n Index.unique\n Series.unique\n\n Examples\n --------\n >>> pd.unique(pd.Series([2, 1, 3, 3]))\n array([2, 1, 3])\n\n >>> pd.unique(pd.Series([2] + [1] * 5))\n array([2, 1])\n\n >>> pd.unique(pd.Series([pd.Timestamp('20160101'),\n ... pd.Timestamp('20160101')]))\n array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')\n\n >>> pd.unique(pd.Series([pd.Timestamp('20160101', tz='US/Eastern'),\n ... pd.Timestamp('20160101', tz='US/Eastern')]))\n array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')],\n dtype=object)\n\n >>> pd.unique(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'),\n ... pd.Timestamp('20160101', tz='US/Eastern')]))\n DatetimeIndex(['2016-01-01 00:00:00-05:00'],\n ... dtype='datetime64[ns, US/Eastern]', freq=None)\n\n >>> pd.unique(list('baabc'))\n array(['b', 'a', 'c'], dtype=object)\n\n An unordered Categorical will return categories in the\n order of appearance.\n\n >>> pd.unique(pd.Series(pd.Categorical(list('baabc'))))\n [b, a, c]\n Categories (3, object): [b, a, c]\n\n >>> pd.unique(pd.Series(pd.Categorical(list('baabc'),\n ... 
categories=list('abc'))))\n [b, a, c]\n Categories (3, object): [b, a, c]\n\n An ordered Categorical preserves the category ordering.\n\n >>> pd.unique(pd.Series(pd.Categorical(list('baabc'),\n ... categories=list('abc'),\n ... ordered=True)))\n [b, a, c]\n Categories (3, object): [a < b < c]\n\n An array of tuples\n\n >>> pd.unique([('a', 'b'), ('b', 'a'), ('a', 'c'), ('b', 'a')])\n array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)\n \"\"\"\n\n values = _ensure_arraylike(values)\n\n if is_extension_array_dtype(values):\n # Dispatch to extension dtype's unique.\n return values.unique()\n\n original = values\n htable, _, values, dtype, ndtype = _get_hashtable_algo(values)\n\n table = htable(len(values))\n uniques = table.unique(values)\n uniques = _reconstruct_data(uniques, dtype, original)\n return uniques"} {"instruction": "Create a Python 3 function to\ncompute the isin boolean array of a set of objects.", "response": "def isin(comps, values):\n \"\"\"\n Compute the isin boolean array\n\n Parameters\n ----------\n comps : array-like\n values : array-like\n\n Returns\n -------\n boolean array same length as comps\n \"\"\"\n\n if not is_list_like(comps):\n raise TypeError(\"only list-like objects are allowed to be passed\"\n \" to isin(), you passed a [{comps_type}]\"\n .format(comps_type=type(comps).__name__))\n if not is_list_like(values):\n raise TypeError(\"only list-like objects are allowed to be passed\"\n \" to isin(), you passed a [{values_type}]\"\n .format(values_type=type(values).__name__))\n\n if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)):\n values = construct_1d_object_array_from_listlike(list(values))\n\n if is_categorical_dtype(comps):\n # TODO(extension)\n # handle categoricals\n return comps._values.isin(values)\n\n comps = com.values_from_object(comps)\n\n comps, dtype, _ = _ensure_data(comps)\n values, _, _ = _ensure_data(values, dtype=dtype)\n\n # faster for larger cases to use np.in1d\n f = lambda x, y: 
htable.ismember_object(x, values)\n\n # GH16012\n # Ensure np.in1d doesn't get object types or it *may* throw an exception\n if len(comps) > 1000000 and not is_object_dtype(comps):\n f = lambda x, y: np.in1d(x, y)\n elif is_integer_dtype(comps):\n try:\n values = values.astype('int64', copy=False)\n comps = comps.astype('int64', copy=False)\n f = lambda x, y: htable.ismember_int64(x, y)\n except (TypeError, ValueError, OverflowError):\n values = values.astype(object)\n comps = comps.astype(object)\n\n elif is_float_dtype(comps):\n try:\n values = values.astype('float64', copy=False)\n comps = comps.astype('float64', copy=False)\n f = lambda x, y: htable.ismember_float64(x, y)\n except (TypeError, ValueError):\n values = values.astype(object)\n comps = comps.astype(object)\n\n return f(comps, values)"} {"instruction": "Create a Python 3 function to\nfactorize an array - like to labels and uniques.", "response": "def _factorize_array(values, na_sentinel=-1, size_hint=None,\n na_value=None):\n \"\"\"Factorize an array-like to labels and uniques.\n\n This doesn't do any coercion of types or unboxing before factorization.\n\n Parameters\n ----------\n values : ndarray\n na_sentinel : int, default -1\n size_hint : int, optional\n Passsed through to the hashtable's 'get_labels' method\n na_value : object, optional\n A value in `values` to consider missing. 
Note: only use this\n parameter when you know that you don't have any values pandas would\n consider missing in the array (NaN for float data, iNaT for\n datetimes, etc.).\n\n Returns\n -------\n labels, uniques : ndarray\n \"\"\"\n (hash_klass, _), values = _get_data_algo(values, _hashtables)\n\n table = hash_klass(size_hint or len(values))\n uniques, labels = table.factorize(values, na_sentinel=na_sentinel,\n na_value=na_value)\n\n labels = ensure_platform_int(labels)\n return labels, uniques"} {"instruction": "Make a summary of the following Python 3 code\ndef value_counts(values, sort=True, ascending=False, normalize=False,\n bins=None, dropna=True):\n \"\"\"\n Compute a histogram of the counts of non-null values.\n\n Parameters\n ----------\n values : ndarray (1-d)\n sort : boolean, default True\n Sort by values\n ascending : boolean, default False\n Sort in ascending order\n normalize: boolean, default False\n If True then compute a relative histogram\n bins : integer, optional\n Rather than count values, group them into half-open bins,\n convenience for pd.cut, only works with numeric data\n dropna : boolean, default True\n Don't include counts of NaN\n\n Returns\n -------\n value_counts : Series\n\n \"\"\"\n from pandas.core.series import Series, Index\n name = getattr(values, 'name', None)\n\n if bins is not None:\n try:\n from pandas.core.reshape.tile import cut\n values = Series(values)\n ii = cut(values, bins, include_lowest=True)\n except TypeError:\n raise TypeError(\"bins argument only works with numeric data.\")\n\n # count, remove nulls (from the index), and but the bins\n result = ii.value_counts(dropna=dropna)\n result = result[result.index.notna()]\n result.index = result.index.astype('interval')\n result = result.sort_index()\n\n # if we are dropna and we have NO values\n if dropna and (result.values == 0).all():\n result = result.iloc[0:0]\n\n # normalizing is by len of all (regardless of dropna)\n counts = np.array([len(ii)])\n\n else:\n\n if 
is_extension_array_dtype(values) or is_sparse(values):\n\n # handle Categorical and sparse,\n result = Series(values)._values.value_counts(dropna=dropna)\n result.name = name\n counts = result.values\n\n else:\n keys, counts = _value_counts_arraylike(values, dropna)\n\n if not isinstance(keys, Index):\n keys = Index(keys)\n result = Series(counts, index=keys, name=name)\n\n if sort:\n result = result.sort_values(ascending=ascending)\n\n if normalize:\n result = result / float(counts.sum())\n\n return result", "response": "Compute a histogram of the counts of non - null values."} {"instruction": "Given the following Python 3 function, write the documentation\ndef duplicated(values, keep='first'):\n\n values, dtype, ndtype = _ensure_data(values)\n f = getattr(htable, \"duplicated_{dtype}\".format(dtype=ndtype))\n return f(values, keep=keep)", "response": "Returns boolean ndarray denoting duplicate values."} {"instruction": "Write a Python 3 function for\nreturning the mode of an array.", "response": "def mode(values, dropna=True):\n \"\"\"\n Returns the mode(s) of an array.\n\n Parameters\n ----------\n values : array-like\n Array over which to check for duplicate values.\n dropna : boolean, default True\n Don't consider counts of NaN/NaT.\n\n .. 
versionadded:: 0.24.0\n\n Returns\n -------\n mode : Series\n \"\"\"\n from pandas import Series\n\n values = _ensure_arraylike(values)\n original = values\n\n # categorical is a fast-path\n if is_categorical_dtype(values):\n if isinstance(values, Series):\n return Series(values.values.mode(dropna=dropna), name=values.name)\n return values.mode(dropna=dropna)\n\n if dropna and is_datetimelike(values):\n mask = values.isnull()\n values = values[~mask]\n\n values, dtype, ndtype = _ensure_data(values)\n\n f = getattr(htable, \"mode_{dtype}\".format(dtype=ndtype))\n result = f(values, dropna=dropna)\n try:\n result = np.sort(result)\n except TypeError as e:\n warn(\"Unable to sort modes: {error}\".format(error=e))\n\n result = _reconstruct_data(result, original.dtype, original)\n return Series(result)"} {"instruction": "Can you create a Python 3 function that\nranks the values along a given axis.", "response": "def rank(values, axis=0, method='average', na_option='keep',\n ascending=True, pct=False):\n \"\"\"\n Rank the values along a given axis.\n\n Parameters\n ----------\n values : array-like\n Array whose values will be ranked. The number of dimensions in this\n array must not exceed 2.\n axis : int, default 0\n Axis over which to perform rankings.\n method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'\n The method by which tiebreaks are broken during the ranking.\n na_option : {'keep', 'top'}, default 'keep'\n The method by which NaNs are placed in the ranking.\n - ``keep``: rank each NaN value with a NaN ranking\n - ``top``: replace each NaN with either +/- inf so that they\n there are ranked at the top\n ascending : boolean, default True\n Whether or not the elements should be ranked in ascending order.\n pct : boolean, default False\n Whether or not to the display the returned rankings in integer form\n (e.g. 1, 2, 3) or in percentile form (e.g. 
0.333..., 0.666..., 1).\n \"\"\"\n if values.ndim == 1:\n f, values = _get_data_algo(values, _rank1d_functions)\n ranks = f(values, ties_method=method, ascending=ascending,\n na_option=na_option, pct=pct)\n elif values.ndim == 2:\n f, values = _get_data_algo(values, _rank2d_functions)\n ranks = f(values, axis=axis, ties_method=method,\n ascending=ascending, na_option=na_option, pct=pct)\n else:\n raise TypeError(\"Array with ndim > 2 are not supported.\")\n\n return ranks"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef checked_add_with_arr(arr, b, arr_mask=None, b_mask=None):\n # For performance reasons, we broadcast 'b' to the new array 'b2'\n # so that it has the same size as 'arr'.\n b2 = np.broadcast_to(b, arr.shape)\n if b_mask is not None:\n # We do the same broadcasting for b_mask as well.\n b2_mask = np.broadcast_to(b_mask, arr.shape)\n else:\n b2_mask = None\n\n # For elements that are NaN, regardless of their value, we should\n # ignore whether they overflow or not when doing the checked add.\n if arr_mask is not None and b2_mask is not None:\n not_nan = np.logical_not(arr_mask | b2_mask)\n elif arr_mask is not None:\n not_nan = np.logical_not(arr_mask)\n elif b_mask is not None:\n not_nan = np.logical_not(b2_mask)\n else:\n not_nan = np.empty(arr.shape, dtype=bool)\n not_nan.fill(True)\n\n # gh-14324: For each element in 'arr' and its corresponding element\n # in 'b2', we check the sign of the element in 'b2'. If it is positive,\n # we then check whether its sum with the element in 'arr' exceeds\n # np.iinfo(np.int64).max. If so, we have an overflow error. If it\n # it is negative, we then check whether its sum with the element in\n # 'arr' exceeds np.iinfo(np.int64).min. 
If so, we have an overflow\n # error as well.\n mask1 = b2 > 0\n mask2 = b2 < 0\n\n if not mask1.any():\n to_raise = ((np.iinfo(np.int64).min - b2 > arr) & not_nan).any()\n elif not mask2.any():\n to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any()\n else:\n to_raise = (((np.iinfo(np.int64).max -\n b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or\n ((np.iinfo(np.int64).min -\n b2[mask2] > arr[mask2]) & not_nan[mask2]).any())\n\n if to_raise:\n raise OverflowError(\"Overflow in int64 addition\")\n return arr + b", "response": "This function adds an array or scalar to a node tree tree"} {"instruction": "Make a summary of the following Python 3 code\ndef quantile(x, q, interpolation_method='fraction'):\n x = np.asarray(x)\n mask = isna(x)\n\n x = x[~mask]\n\n values = np.sort(x)\n\n def _interpolate(a, b, fraction):\n \"\"\"Returns the point at the given fraction between a and b, where\n 'fraction' must be between 0 and 1.\n \"\"\"\n return a + (b - a) * fraction\n\n def _get_score(at):\n if len(values) == 0:\n return np.nan\n\n idx = at * (len(values) - 1)\n if idx % 1 == 0:\n score = values[int(idx)]\n else:\n if interpolation_method == 'fraction':\n score = _interpolate(values[int(idx)], values[int(idx) + 1],\n idx % 1)\n elif interpolation_method == 'lower':\n score = values[np.floor(idx)]\n elif interpolation_method == 'higher':\n score = values[np.ceil(idx)]\n else:\n raise ValueError(\"interpolation_method can only be 'fraction' \"\n \", 'lower' or 'higher'\")\n\n return score\n\n if is_scalar(q):\n return _get_score(q)\n else:\n q = np.asarray(q, np.float64)\n return algos.arrmap_float64(q, _get_score)", "response": "Compute sample quantile or quantiles of the input array. For example q = 0. 5 and q = 49. 
5."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef take(arr, indices, axis=0, allow_fill=False, fill_value=None):\n from pandas.core.indexing import validate_indices\n\n if not is_array_like(arr):\n arr = np.asarray(arr)\n\n indices = np.asarray(indices, dtype=np.intp)\n\n if allow_fill:\n # Pandas style, -1 means NA\n validate_indices(indices, len(arr))\n result = take_1d(arr, indices, axis=axis, allow_fill=True,\n fill_value=fill_value)\n else:\n # NumPy style\n result = arr.take(indices, axis=axis)\n return result", "response": "Take elements from an array."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,\n allow_fill=True):\n \"\"\"\n Specialized Cython take which sets NaN values in one pass\n\n This dispatches to ``take`` defined on ExtensionArrays. It does not\n currently dispatch to ``SparseArray.take`` for sparse ``arr``.\n\n Parameters\n ----------\n arr : array-like\n Input array.\n indexer : ndarray\n 1-D array of indices to take, subarrays corresponding to -1 value\n indices are filed with fill_value\n axis : int, default 0\n Axis to take from\n out : ndarray or None, default None\n Optional output array, must be appropriate type to hold input and\n fill_value together, if indexer has any -1 value entries; call\n _maybe_promote to determine this type for any fill_value\n fill_value : any, default np.nan\n Fill value to replace -1 values with\n mask_info : tuple of (ndarray, boolean)\n If provided, value should correspond to:\n (indexer != -1, (indexer != -1).any())\n If not provided, it will be computed internally if necessary\n allow_fill : boolean, default True\n If False, indexer is assumed to contain no -1 values so no filling\n will be done. This short-circuits computation of a mask. 
Result is\n undefined if allow_fill == False and -1 is present in indexer.\n\n Returns\n -------\n subarray : array-like\n May be the same type as the input, or cast to an ndarray.\n \"\"\"\n\n # TODO(EA): Remove these if / elifs as datetimeTZ, interval, become EAs\n # dispatch to internal type takes\n if is_extension_array_dtype(arr):\n return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)\n elif is_datetime64tz_dtype(arr):\n return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)\n elif is_interval_dtype(arr):\n return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)\n\n if is_sparse(arr):\n arr = arr.get_values()\n elif isinstance(arr, (ABCIndexClass, ABCSeries)):\n arr = arr.values\n\n arr = np.asarray(arr)\n\n if indexer is None:\n indexer = np.arange(arr.shape[axis], dtype=np.int64)\n dtype, fill_value = arr.dtype, arr.dtype.type()\n else:\n indexer = ensure_int64(indexer, copy=False)\n if not allow_fill:\n dtype, fill_value = arr.dtype, arr.dtype.type()\n mask_info = None, False\n else:\n # check for promotion based on types only (do this first because\n # it's faster than computing a mask)\n dtype, fill_value = maybe_promote(arr.dtype, fill_value)\n if dtype != arr.dtype and (out is None or out.dtype != dtype):\n # check if promotion is actually required based on indexer\n if mask_info is not None:\n mask, needs_masking = mask_info\n else:\n mask = indexer == -1\n needs_masking = mask.any()\n mask_info = mask, needs_masking\n if needs_masking:\n if out is not None and out.dtype != dtype:\n raise TypeError('Incompatible type for fill_value')\n else:\n # if not, then depromote, set fill_value to dummy\n # (it won't be used but we don't want the cython code\n # to crash when trying to cast it to dtype)\n dtype, fill_value = arr.dtype, arr.dtype.type()\n\n flip_order = False\n if arr.ndim == 2:\n if arr.flags.f_contiguous:\n flip_order = True\n\n if flip_order:\n arr = arr.T\n axis = arr.ndim - axis - 1\n if out is 
not None:\n out = out.T\n\n # at this point, it's guaranteed that dtype can hold both the arr values\n # and the fill_value\n if out is None:\n out_shape = list(arr.shape)\n out_shape[axis] = len(indexer)\n out_shape = tuple(out_shape)\n if arr.flags.f_contiguous and axis == arr.ndim - 1:\n # minor tweak that can make an order-of-magnitude difference\n # for dataframes initialized directly from 2-d ndarrays\n # (s.t. df.values is c-contiguous and df._data.blocks[0] is its\n # f-contiguous transpose)\n out = np.empty(out_shape, dtype=dtype, order='F')\n else:\n out = np.empty(out_shape, dtype=dtype)\n\n func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis,\n mask_info=mask_info)\n func(arr, indexer, out, fill_value)\n\n if flip_order:\n out = out.T\n return out", "response": "This function handles the take operation for the base - level numpy array and returns the result as a new array."} {"instruction": "Explain what the following Python 3 code does\ndef take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None,\n allow_fill=True):\n \"\"\"\n Specialized Cython take which sets NaN values in one pass\n \"\"\"\n if indexer is None or (indexer[0] is None and indexer[1] is None):\n row_idx = np.arange(arr.shape[0], dtype=np.int64)\n col_idx = np.arange(arr.shape[1], dtype=np.int64)\n indexer = row_idx, col_idx\n dtype, fill_value = arr.dtype, arr.dtype.type()\n else:\n row_idx, col_idx = indexer\n if row_idx is None:\n row_idx = np.arange(arr.shape[0], dtype=np.int64)\n else:\n row_idx = ensure_int64(row_idx)\n if col_idx is None:\n col_idx = np.arange(arr.shape[1], dtype=np.int64)\n else:\n col_idx = ensure_int64(col_idx)\n indexer = row_idx, col_idx\n if not allow_fill:\n dtype, fill_value = arr.dtype, arr.dtype.type()\n mask_info = None, False\n else:\n # check for promotion based on types only (do this first because\n # it's faster than computing a mask)\n dtype, fill_value = maybe_promote(arr.dtype, fill_value)\n if dtype != 
arr.dtype and (out is None or out.dtype != dtype):\n # check if promotion is actually required based on indexer\n if mask_info is not None:\n (row_mask, col_mask), (row_needs, col_needs) = mask_info\n else:\n row_mask = row_idx == -1\n col_mask = col_idx == -1\n row_needs = row_mask.any()\n col_needs = col_mask.any()\n mask_info = (row_mask, col_mask), (row_needs, col_needs)\n if row_needs or col_needs:\n if out is not None and out.dtype != dtype:\n raise TypeError('Incompatible type for fill_value')\n else:\n # if not, then depromote, set fill_value to dummy\n # (it won't be used but we don't want the cython code\n # to crash when trying to cast it to dtype)\n dtype, fill_value = arr.dtype, arr.dtype.type()\n\n # at this point, it's guaranteed that dtype can hold both the arr values\n # and the fill_value\n if out is None:\n out_shape = len(row_idx), len(col_idx)\n out = np.empty(out_shape, dtype=dtype)\n\n func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None)\n if func is None and arr.dtype != out.dtype:\n func = _take_2d_multi_dict.get((out.dtype.name, out.dtype.name), None)\n if func is not None:\n func = _convert_wrapper(func, out.dtype)\n if func is None:\n\n def func(arr, indexer, out, fill_value=np.nan):\n _take_2d_multi_object(arr, indexer, out, fill_value=fill_value,\n mask_info=mask_info)\n\n func(arr, indexer, out=out, fill_value=fill_value)\n return out", "response": "Take a 2D array of items from a 2D array."} {"instruction": "Given the following Python 3 function, write the documentation\ndef searchsorted(arr, value, side=\"left\", sorter=None):\n if sorter is not None:\n sorter = ensure_platform_int(sorter)\n\n if isinstance(arr, np.ndarray) and is_integer_dtype(arr) and (\n is_integer(value) or is_integer_dtype(value)):\n from .arrays.array_ import array\n # if `arr` and `value` have different dtypes, `arr` would be\n # recast by numpy, causing a slow search.\n # Before searching below, we therefore try to give `value` the\n # same 
dtype as `arr`, while guarding against integer overflows.\n iinfo = np.iinfo(arr.dtype.type)\n value_arr = np.array([value]) if is_scalar(value) else np.array(value)\n if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all():\n # value within bounds, so no overflow, so can convert value dtype\n # to dtype of arr\n dtype = arr.dtype\n else:\n dtype = value_arr.dtype\n\n if is_scalar(value):\n value = dtype.type(value)\n else:\n value = array(value, dtype=dtype)\n elif not (is_object_dtype(arr) or is_numeric_dtype(arr) or\n is_categorical_dtype(arr)):\n from pandas.core.series import Series\n # E.g. if `arr` is an array with dtype='datetime64[ns]'\n # and `value` is a pd.Timestamp, we may need to convert value\n value_ser = Series(value)._values\n value = value_ser[0] if is_scalar(value) else value_ser\n\n result = arr.searchsorted(value, side=side, sorter=sorter)\n return result", "response": "Searchsorted implementation of numpy. searchsorted."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef diff(arr, n, axis=0):\n\n n = int(n)\n na = np.nan\n dtype = arr.dtype\n\n is_timedelta = False\n if needs_i8_conversion(arr):\n dtype = np.float64\n arr = arr.view('i8')\n na = iNaT\n is_timedelta = True\n\n elif is_bool_dtype(dtype):\n dtype = np.object_\n\n elif is_integer_dtype(dtype):\n dtype = np.float64\n\n dtype = np.dtype(dtype)\n out_arr = np.empty(arr.shape, dtype=dtype)\n\n na_indexer = [slice(None)] * arr.ndim\n na_indexer[axis] = slice(None, n) if n >= 0 else slice(n, None)\n out_arr[tuple(na_indexer)] = na\n\n if arr.ndim == 2 and arr.dtype.name in _diff_special:\n f = _diff_special[arr.dtype.name]\n f(arr, out_arr, n, axis)\n else:\n res_indexer = [slice(None)] * arr.ndim\n res_indexer[axis] = slice(n, None) if n >= 0 else slice(None, n)\n res_indexer = tuple(res_indexer)\n\n lag_indexer = [slice(None)] * arr.ndim\n lag_indexer[axis] = slice(None, -n) if n > 0 else slice(-n, None)\n lag_indexer = 
tuple(lag_indexer)\n\n # need to make sure that we account for na for datelike/timedelta\n # we don't actually want to subtract these i8 numbers\n if is_timedelta:\n res = arr[res_indexer]\n lag = arr[lag_indexer]\n\n mask = (arr[res_indexer] == na) | (arr[lag_indexer] == na)\n if mask.any():\n res = res.copy()\n res[mask] = 0\n lag = lag.copy()\n lag[mask] = 0\n\n result = res - lag\n result[mask] = na\n out_arr[res_indexer] = result\n else:\n out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer]\n\n if is_timedelta:\n from pandas import TimedeltaIndex\n out_arr = TimedeltaIndex(out_arr.ravel().astype('int64')).asi8.reshape(\n out_arr.shape).astype('timedelta64[ns]')\n\n return out_arr", "response": "Diff of n between self and s - s. shift"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _to_ijv(ss, row_levels=(0, ), column_levels=(1, ), sort_labels=False):\n # index and column levels must be a partition of the index\n _check_is_partition([row_levels, column_levels], range(ss.index.nlevels))\n\n # from the SparseSeries: get the labels and data for non-null entries\n values = ss._data.internal_values()._valid_sp_values\n\n nonnull_labels = ss.dropna()\n\n def get_indexers(levels):\n \"\"\" Return sparse coords and dense labels for subset levels \"\"\"\n\n # TODO: how to do this better? 
cleanly slice nonnull_labels given the\n # coord\n values_ilabels = [tuple(x[i] for i in levels)\n for x in nonnull_labels.index]\n if len(levels) == 1:\n values_ilabels = [x[0] for x in values_ilabels]\n\n # # performance issues with groupby ###################################\n # TODO: these two lines can rejplace the code below but\n # groupby is too slow (in some cases at least)\n # labels_to_i = ss.groupby(level=levels, sort=sort_labels).first()\n # labels_to_i[:] = np.arange(labels_to_i.shape[0])\n\n def _get_label_to_i_dict(labels, sort_labels=False):\n \"\"\" Return OrderedDict of unique labels to number.\n Optionally sort by label.\n \"\"\"\n labels = Index(lmap(tuple, labels)).unique().tolist() # squish\n if sort_labels:\n labels = sorted(list(labels))\n d = OrderedDict((k, i) for i, k in enumerate(labels))\n return (d)\n\n def _get_index_subset_to_coord_dict(index, subset, sort_labels=False):\n ilabels = list(zip(*[index._get_level_values(i) for i in subset]))\n labels_to_i = _get_label_to_i_dict(ilabels,\n sort_labels=sort_labels)\n labels_to_i = Series(labels_to_i)\n if len(subset) > 1:\n labels_to_i.index = MultiIndex.from_tuples(labels_to_i.index)\n labels_to_i.index.names = [index.names[i] for i in subset]\n else:\n labels_to_i.index = Index(x[0] for x in labels_to_i.index)\n labels_to_i.index.name = index.names[subset[0]]\n\n labels_to_i.name = 'value'\n return (labels_to_i)\n\n labels_to_i = _get_index_subset_to_coord_dict(ss.index, levels,\n sort_labels=sort_labels)\n # #####################################################################\n # #####################################################################\n\n i_coord = labels_to_i[values_ilabels].tolist()\n i_labels = labels_to_i.index.tolist()\n\n return i_coord, i_labels\n\n i_coord, i_labels = get_indexers(row_levels)\n j_coord, j_labels = get_indexers(column_levels)\n\n return values, i_coord, j_coord, i_labels, j_labels", "response": "Convert a SparseSeries to an internal IJV."} 
{"instruction": "Can you implement a function in Python 3 that\nconverts a SparseSeries to a scipy. sparse. coo_matrix using index levels row_levels column_levels as the row and column labels respectively. Returns the sparse_matrix row and column labels respectively.", "response": "def _sparse_series_to_coo(ss, row_levels=(0, ), column_levels=(1, ),\n sort_labels=False):\n \"\"\"\n Convert a SparseSeries to a scipy.sparse.coo_matrix using index\n levels row_levels, column_levels as the row and column\n labels respectively. Returns the sparse_matrix, row and column labels.\n \"\"\"\n\n import scipy.sparse\n\n if ss.index.nlevels < 2:\n raise ValueError('to_coo requires MultiIndex with nlevels > 2')\n if not ss.index.is_unique:\n raise ValueError('Duplicate index entries are not allowed in to_coo '\n 'transformation.')\n\n # to keep things simple, only rely on integer indexing (not labels)\n row_levels = [ss.index._get_level_number(x) for x in row_levels]\n column_levels = [ss.index._get_level_number(x) for x in column_levels]\n\n v, i, j, rows, columns = _to_ijv(ss, row_levels=row_levels,\n column_levels=column_levels,\n sort_labels=sort_labels)\n sparse_matrix = scipy.sparse.coo_matrix(\n (v, (i, j)), shape=(len(rows), len(columns)))\n return sparse_matrix, rows, columns"} {"instruction": "Implement a function in Python 3 to\nconvert a scipy. sparse. 
coo_matrix to a SparseSeries.", "response": "def _coo_to_sparse_series(A, dense_index=False):\n \"\"\"\n Convert a scipy.sparse.coo_matrix to a SparseSeries.\n Use the defaults given in the SparseSeries constructor.\n \"\"\"\n s = Series(A.data, MultiIndex.from_arrays((A.row, A.col)))\n s = s.sort_index()\n s = s.to_sparse() # TODO: specify kind?\n if dense_index:\n # is there a better constructor method to use here?\n i = range(A.shape[0])\n j = range(A.shape[1])\n ind = MultiIndex.from_product([i, j])\n s = s.reindex(ind)\n return s"} {"instruction": "Implement a function in Python 3 to\nconvert a datetime - like object to a numpy array of M8 values.", "response": "def _to_M8(key, tz=None):\n \"\"\"\n Timestamp-like => dt64\n \"\"\"\n if not isinstance(key, Timestamp):\n # this also converts strings\n key = Timestamp(key)\n if key.tzinfo is not None and tz is not None:\n # Don't tz_localize(None) if key is already tz-aware\n key = key.tz_convert(tz)\n else:\n key = key.tz_localize(tz)\n\n return np.int64(conversion.pydt_to_i8(key)).view(_NS_DTYPE)"} {"instruction": "Can you write a function in Python 3 where it\nwraps comparison operations to convert datetime - like to datetime64 - like", "response": "def _dt_array_cmp(cls, op):\n \"\"\"\n Wrap comparison operations to convert datetime-like to datetime64\n \"\"\"\n opname = '__{name}__'.format(name=op.__name__)\n nat_result = opname == '__ne__'\n\n def wrapper(self, other):\n if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):\n return NotImplemented\n\n other = lib.item_from_zerodim(other)\n\n if isinstance(other, (datetime, np.datetime64, str)):\n if isinstance(other, (datetime, np.datetime64)):\n # GH#18435 strings get a pass from tzawareness compat\n self._assert_tzawareness_compat(other)\n\n try:\n other = _to_M8(other, tz=self.tz)\n except ValueError:\n # string that cannot be parsed to Timestamp\n return ops.invalid_comparison(self, other, op)\n\n result = op(self.asi8, other.view('i8'))\n if 
isna(other):\n result.fill(nat_result)\n elif lib.is_scalar(other) or np.ndim(other) == 0:\n return ops.invalid_comparison(self, other, op)\n elif len(other) != len(self):\n raise ValueError(\"Lengths must match\")\n else:\n if isinstance(other, list):\n try:\n other = type(self)._from_sequence(other)\n except ValueError:\n other = np.array(other, dtype=np.object_)\n elif not isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries,\n DatetimeArray)):\n # Following Timestamp convention, __eq__ is all-False\n # and __ne__ is all True, others raise TypeError.\n return ops.invalid_comparison(self, other, op)\n\n if is_object_dtype(other):\n # We have to use _comp_method_OBJECT_ARRAY instead of numpy\n # comparison otherwise it would fail to raise when\n # comparing tz-aware and tz-naive\n with np.errstate(all='ignore'):\n result = ops._comp_method_OBJECT_ARRAY(op,\n self.astype(object),\n other)\n o_mask = isna(other)\n elif not (is_datetime64_dtype(other) or\n is_datetime64tz_dtype(other)):\n # e.g. is_timedelta64_dtype(other)\n return ops.invalid_comparison(self, other, op)\n else:\n self._assert_tzawareness_compat(other)\n if isinstance(other, (ABCIndexClass, ABCSeries)):\n other = other.array\n\n if (is_datetime64_dtype(other) and\n not is_datetime64_ns_dtype(other) or\n not hasattr(other, 'asi8')):\n # e.g. 
other.dtype == 'datetime64[s]'\n # or an object-dtype ndarray\n other = type(self)._from_sequence(other)\n\n result = op(self.view('i8'), other.view('i8'))\n o_mask = other._isnan\n\n result = com.values_from_object(result)\n\n if o_mask.any():\n result[o_mask] = nat_result\n\n if self._hasnans:\n result[self._isnan] = nat_result\n\n return result\n\n return compat.set_function_name(wrapper, opname, cls)"} {"instruction": "How would you implement a function in Python 3 that\nconverts a list - like object to a numpy array with dtype datetime64 [ ns ].", "response": "def sequence_to_dt64ns(data, dtype=None, copy=False,\n tz=None,\n dayfirst=False, yearfirst=False, ambiguous='raise',\n int_as_wall_time=False):\n \"\"\"\n Parameters\n ----------\n data : list-like\n dtype : dtype, str, or None, default None\n copy : bool, default False\n tz : tzinfo, str, or None, default None\n dayfirst : bool, default False\n yearfirst : bool, default False\n ambiguous : str, bool, or arraylike, default 'raise'\n See pandas._libs.tslibs.conversion.tz_localize_to_utc\n int_as_wall_time : bool, default False\n Whether to treat ints as wall time in specified timezone, or as\n nanosecond-precision UNIX epoch (wall time in UTC).\n This is used in DatetimeIndex.__init__ to deprecate the wall-time\n behaviour.\n\n ..versionadded:: 0.24.0\n\n Returns\n -------\n result : numpy.ndarray\n The sequence converted to a numpy array with dtype ``datetime64[ns]``.\n tz : tzinfo or None\n Either the user-provided tzinfo or one inferred from the data.\n inferred_freq : Tick or None\n The inferred frequency of the sequence.\n\n Raises\n ------\n TypeError : PeriodDType data is passed\n \"\"\"\n\n inferred_freq = None\n\n dtype = _validate_dt64_dtype(dtype)\n\n if not hasattr(data, \"dtype\"):\n # e.g. list, tuple\n if np.ndim(data) == 0:\n # i.e. 
generator\n data = list(data)\n data = np.asarray(data)\n copy = False\n elif isinstance(data, ABCSeries):\n data = data._values\n if isinstance(data, ABCPandasArray):\n data = data.to_numpy()\n\n if hasattr(data, \"freq\"):\n # i.e. DatetimeArray/Index\n inferred_freq = data.freq\n\n # if dtype has an embedded tz, capture it\n tz = validate_tz_from_dtype(dtype, tz)\n\n if isinstance(data, ABCIndexClass):\n data = data._data\n\n # By this point we are assured to have either a numpy array or Index\n data, copy = maybe_convert_dtype(data, copy)\n\n if is_object_dtype(data) or is_string_dtype(data):\n # TODO: We do not have tests specific to string-dtypes,\n # also complex or categorical or other extension\n copy = False\n if lib.infer_dtype(data, skipna=False) == 'integer':\n data = data.astype(np.int64)\n else:\n # data comes back here as either i8 to denote UTC timestamps\n # or M8[ns] to denote wall times\n data, inferred_tz = objects_to_datetime64ns(\n data, dayfirst=dayfirst, yearfirst=yearfirst)\n tz = maybe_infer_tz(tz, inferred_tz)\n # When a sequence of timestamp objects is passed, we always\n # want to treat the (now i8-valued) data as UTC timestamps,\n # not wall times.\n int_as_wall_time = False\n\n # `data` may have originally been a Categorical[datetime64[ns, tz]],\n # so we need to handle these types.\n if is_datetime64tz_dtype(data):\n # DatetimeArray -> ndarray\n tz = maybe_infer_tz(tz, data.tz)\n result = data._data\n\n elif is_datetime64_dtype(data):\n # tz-naive DatetimeArray or ndarray[datetime64]\n data = getattr(data, \"_data\", data)\n if data.dtype != _NS_DTYPE:\n data = conversion.ensure_datetime64ns(data)\n\n if tz is not None:\n # Convert tz-naive to UTC\n tz = timezones.maybe_get_tz(tz)\n data = conversion.tz_localize_to_utc(data.view('i8'), tz,\n ambiguous=ambiguous)\n data = data.view(_NS_DTYPE)\n\n assert data.dtype == _NS_DTYPE, data.dtype\n result = data\n\n else:\n # must be integer dtype otherwise\n # assume this data are epoch 
timestamps\n if tz:\n tz = timezones.maybe_get_tz(tz)\n\n if data.dtype != _INT64_DTYPE:\n data = data.astype(np.int64, copy=False)\n if int_as_wall_time and tz is not None and not timezones.is_utc(tz):\n warnings.warn(_i8_message, FutureWarning, stacklevel=4)\n data = conversion.tz_localize_to_utc(data.view('i8'), tz,\n ambiguous=ambiguous)\n data = data.view(_NS_DTYPE)\n result = data.view(_NS_DTYPE)\n\n if copy:\n # TODO: should this be deepcopy?\n result = result.copy()\n\n assert isinstance(result, np.ndarray), type(result)\n assert result.dtype == 'M8[ns]', result.dtype\n\n # We have to call this again after possibly inferring a tz above\n validate_tz_from_dtype(dtype, tz)\n\n return result, tz, inferred_freq"} {"instruction": "Given the following Python 3 function, write the documentation\ndef objects_to_datetime64ns(data, dayfirst, yearfirst,\n utc=False, errors=\"raise\",\n require_iso8601=False, allow_object=False):\n \"\"\"\n Convert data to array of timestamps.\n\n Parameters\n ----------\n data : np.ndarray[object]\n dayfirst : bool\n yearfirst : bool\n utc : bool, default False\n Whether to convert timezone-aware timestamps to UTC\n errors : {'raise', 'ignore', 'coerce'}\n allow_object : bool\n Whether to return an object-dtype ndarray instead of raising if the\n data contains more than one timezone.\n\n Returns\n -------\n result : ndarray\n np.int64 dtype if returned values represent UTC timestamps\n np.datetime64[ns] if returned values represent wall times\n object if mixed timezones\n inferred_tz : tzinfo or None\n\n Raises\n ------\n ValueError : if data cannot be converted to datetimes\n \"\"\"\n assert errors in [\"raise\", \"ignore\", \"coerce\"]\n\n # if str-dtype, convert\n data = np.array(data, copy=False, dtype=np.object_)\n\n try:\n result, tz_parsed = tslib.array_to_datetime(\n data,\n errors=errors,\n utc=utc,\n dayfirst=dayfirst,\n yearfirst=yearfirst,\n require_iso8601=require_iso8601\n )\n except ValueError as e:\n try:\n values, 
tz_parsed = conversion.datetime_to_datetime64(data)\n # If tzaware, these values represent unix timestamps, so we\n # return them as i8 to distinguish from wall times\n return values.view('i8'), tz_parsed\n except (ValueError, TypeError):\n raise e\n\n if tz_parsed is not None:\n # We can take a shortcut since the datetime64 numpy array\n # is in UTC\n # Return i8 values to denote unix timestamps\n return result.view('i8'), tz_parsed\n elif is_datetime64_dtype(result):\n # returning M8[ns] denotes wall-times; since tz is None\n # the distinction is a thin one\n return result, tz_parsed\n elif is_object_dtype(result):\n # GH#23675 when called via `pd.to_datetime`, returning an object-dtype\n # array is allowed. When called via `pd.DatetimeIndex`, we can\n # only accept datetime64 dtype, so raise TypeError if object-dtype\n # is returned, as that indicates the values can be recognized as\n # datetimes but they have conflicting timezones/awareness\n if allow_object:\n return result, tz_parsed\n raise TypeError(result)\n else: # pragma: no cover\n # GH#23675 this TypeError should never be hit, whereas the TypeError\n # in the object-dtype branch above is reachable.\n raise TypeError(result)", "response": "Convert a numpy array of timestamps to a datetime64 array of timestamps."} {"instruction": "Write a Python 3 script for\nconverting data based on dtype conventions.", "response": "def maybe_convert_dtype(data, copy):\n \"\"\"\n Convert data based on dtype conventions, issuing deprecation warnings\n or errors where appropriate.\n\n Parameters\n ----------\n data : np.ndarray or pd.Index\n copy : bool\n\n Returns\n -------\n data : np.ndarray or pd.Index\n copy : bool\n\n Raises\n ------\n TypeError : PeriodDType data is passed\n \"\"\"\n if is_float_dtype(data):\n # Note: we must cast to datetime64[ns] here in order to treat these\n # as wall-times instead of UTC timestamps.\n data = data.astype(_NS_DTYPE)\n copy = False\n # TODO: deprecate this behavior to instead 
treat symmetrically\n # with integer dtypes. See discussion in GH#23675\n\n elif is_timedelta64_dtype(data):\n warnings.warn(\"Passing timedelta64-dtype data is deprecated, will \"\n \"raise a TypeError in a future version\",\n FutureWarning, stacklevel=5)\n data = data.view(_NS_DTYPE)\n\n elif is_period_dtype(data):\n # Note: without explicitly raising here, PeriodIndex\n # test_setops.test_join_does_not_recur fails\n raise TypeError(\"Passing PeriodDtype data is invalid. \"\n \"Use `data.to_timestamp()` instead\")\n\n elif is_categorical_dtype(data):\n # GH#18664 preserve tz in going DTI->Categorical->DTI\n # TODO: cases where we need to do another pass through this func,\n # e.g. the categories are timedelta64s\n data = data.categories.take(data.codes, fill_value=NaT)._values\n copy = False\n\n elif is_extension_type(data) and not is_datetime64tz_dtype(data):\n # Includes categorical\n # TODO: We have no tests for these\n data = np.array(data, dtype=np.object_)\n copy = False\n\n return data, copy"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _validate_dt64_dtype(dtype):\n if dtype is not None:\n dtype = pandas_dtype(dtype)\n if is_dtype_equal(dtype, np.dtype(\"M8\")):\n # no precision, warn\n dtype = _NS_DTYPE\n msg = textwrap.dedent(\"\"\"\\\n Passing in 'datetime64' dtype with no precision is deprecated\n and will raise in a future version. Please pass in\n 'datetime64[ns]' instead.\"\"\")\n warnings.warn(msg, FutureWarning, stacklevel=5)\n\n if ((isinstance(dtype, np.dtype) and dtype != _NS_DTYPE)\n or not isinstance(dtype, (np.dtype, DatetimeTZDtype))):\n raise ValueError(\"Unexpected value for 'dtype': '{dtype}'. 
\"\n \"Must be 'datetime64[ns]' or DatetimeTZDtype'.\"\n .format(dtype=dtype))\n return dtype", "response": "Validate that a dtype is either a numpy datetime64 [ ns ] or DatetimeTZDtype."} {"instruction": "How would you explain what the following Python 3 function does\ndef validate_tz_from_dtype(dtype, tz):\n if dtype is not None:\n if isinstance(dtype, str):\n try:\n dtype = DatetimeTZDtype.construct_from_string(dtype)\n except TypeError:\n # Things like `datetime64[ns]`, which is OK for the\n # constructors, but also nonsense, which should be validated\n # but not by us. We *do* allow non-existent tz errors to\n # go through\n pass\n dtz = getattr(dtype, 'tz', None)\n if dtz is not None:\n if tz is not None and not timezones.tz_compare(tz, dtz):\n raise ValueError(\"cannot supply both a tz and a dtype\"\n \" with a tz\")\n tz = dtz\n\n if tz is not None and is_datetime64_dtype(dtype):\n # We also need to check for the case where the user passed a\n # tz-naive dtype (i.e. datetime64[ns])\n if tz is not None and not timezones.tz_compare(tz, dtz):\n raise ValueError(\"cannot supply both a tz and a \"\n \"timezone-naive dtype (i.e. datetime64[ns])\")\n\n return tz", "response": "Validate that the given dtype is a DatetimeTZDtype and that it does not conflict with the given tz."} {"instruction": "Write a Python 3 script for\ninferring a timezone from start and end time points.", "response": "def _infer_tz_from_endpoints(start, end, tz):\n \"\"\"\n If a timezone is not explicitly given via `tz`, see if one can\n be inferred from the `start` and `end` endpoints. 
If more than one\n of these inputs provides a timezone, require that they all agree.\n\n Parameters\n ----------\n start : Timestamp\n end : Timestamp\n tz : tzinfo or None\n\n Returns\n -------\n tz : tzinfo or None\n\n Raises\n ------\n TypeError : if start and end timezones do not agree\n \"\"\"\n try:\n inferred_tz = timezones.infer_tzinfo(start, end)\n except Exception:\n raise TypeError('Start and end cannot both be tz-aware with '\n 'different timezones')\n\n inferred_tz = timezones.maybe_get_tz(inferred_tz)\n tz = timezones.maybe_get_tz(tz)\n\n if tz is not None and inferred_tz is not None:\n if not timezones.tz_compare(inferred_tz, tz):\n raise AssertionError(\"Inferred time zone not equal to passed \"\n \"time zone\")\n\n elif inferred_tz is not None:\n tz = inferred_tz\n\n return tz"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _maybe_localize_point(ts, is_none, is_not_none, freq, tz):\n # Make sure start and end are timezone localized if:\n # 1) freq = a Timedelta-like frequency (Tick)\n # 2) freq = None i.e. 
generating a linspaced range\n if isinstance(freq, Tick) or freq is None:\n localize_args = {'tz': tz, 'ambiguous': False}\n else:\n localize_args = {'tz': None}\n if is_none is None and is_not_none is not None:\n ts = ts.tz_localize(**localize_args)\n return ts", "response": "Localize a start or end Timestamp to the timezone of the corresponding\n ."} {"instruction": "How would you implement a function in Python 3 that\nsubtracts DatetimeArray or ndarray [ datetime64 ]", "response": "def _sub_datetime_arraylike(self, other):\n \"\"\"subtract DatetimeArray/Index or ndarray[datetime64]\"\"\"\n if len(self) != len(other):\n raise ValueError(\"cannot add indices of unequal length\")\n\n if isinstance(other, np.ndarray):\n assert is_datetime64_dtype(other)\n other = type(self)(other)\n\n if not self._has_same_tz(other):\n # require tz compat\n raise TypeError(\"{cls} subtraction must have the same \"\n \"timezones or no timezones\"\n .format(cls=type(self).__name__))\n\n self_i8 = self.asi8\n other_i8 = other.asi8\n arr_mask = self._isnan | other._isnan\n new_values = checked_add_with_arr(self_i8, -other_i8,\n arr_mask=arr_mask)\n if self._hasnans or other._hasnans:\n new_values[arr_mask] = iNaT\n return new_values.view('timedelta64[ns]')"} {"instruction": "Create a Python 3 function to\nadd a timedelta - like Tick or TimedeltaIndex - like object to self yielding a new DatetimeArray", "response": "def _add_delta(self, delta):\n \"\"\"\n Add a timedelta-like, Tick, or TimedeltaIndex-like object\n to self, yielding a new DatetimeArray\n\n Parameters\n ----------\n other : {timedelta, np.timedelta64, Tick,\n TimedeltaIndex, ndarray[timedelta64]}\n\n Returns\n -------\n result : DatetimeArray\n \"\"\"\n new_values = super()._add_delta(delta)\n return type(self)._from_sequence(new_values, tz=self.tz, freq='infer')"} {"instruction": "Make a summary of the following Python 3 code\ndef tz_convert(self, tz):\n tz = timezones.maybe_get_tz(tz)\n\n if self.tz is None:\n # tz 
naive, use tz_localize\n raise TypeError('Cannot convert tz-naive timestamps, use '\n 'tz_localize to localize')\n\n # No conversion since timestamps are all UTC to begin with\n dtype = tz_to_dtype(tz)\n return self._simple_new(self.asi8, dtype=dtype, freq=self.freq)", "response": "Convert tz - aware Datetime Array or Index from one time zone to another time zone."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef tz_localize(self, tz, ambiguous='raise', nonexistent='raise',\n errors=None):\n \"\"\"\n Localize tz-naive Datetime Array/Index to tz-aware\n Datetime Array/Index.\n\n This method takes a time zone (tz) naive Datetime Array/Index object\n and makes this time zone aware. It does not move the time to another\n time zone.\n Time zone localization helps to switch from time zone aware to time\n zone unaware objects.\n\n Parameters\n ----------\n tz : str, pytz.timezone, dateutil.tz.tzfile or None\n Time zone to convert timestamps to. Passing ``None`` will\n remove the time zone information preserving local time.\n ambiguous : 'infer', 'NaT', bool array, default 'raise'\n When clocks moved backward due to DST, ambiguous times may arise.\n For example in Central European Time (UTC+01), when going from\n 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at\n 00:30:00 UTC and at 01:30:00 UTC. 
In such a situation, the\n `ambiguous` parameter dictates how ambiguous times should be\n handled.\n\n - 'infer' will attempt to infer fall dst-transition hours based on\n order\n - bool-ndarray where True signifies a DST time, False signifies a\n non-DST time (note that this flag is only applicable for\n ambiguous times)\n - 'NaT' will return NaT where there are ambiguous times\n - 'raise' will raise an AmbiguousTimeError if there are ambiguous\n times\n\n nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \\\ndefault 'raise'\n A nonexistent time does not exist in a particular timezone\n where clocks moved forward due to DST.\n\n - 'shift_forward' will shift the nonexistent time forward to the\n closest existing time\n - 'shift_backward' will shift the nonexistent time backward to the\n closest existing time\n - 'NaT' will return NaT where there are nonexistent times\n - timedelta objects will shift nonexistent times by the timedelta\n - 'raise' will raise an NonExistentTimeError if there are\n nonexistent times\n\n .. versionadded:: 0.24.0\n\n errors : {'raise', 'coerce'}, default None\n\n - 'raise' will raise a NonExistentTimeError if a timestamp is not\n valid in the specified time zone (e.g. due to a transition from\n or to DST time). Use ``nonexistent='raise'`` instead.\n - 'coerce' will return NaT if the timestamp can not be converted\n to the specified time zone. Use ``nonexistent='NaT'`` instead.\n\n .. 
deprecated:: 0.24.0\n\n Returns\n -------\n Same type as self\n Array/Index converted to the specified time zone.\n\n Raises\n ------\n TypeError\n If the Datetime Array/Index is tz-aware and tz is not None.\n\n See Also\n --------\n DatetimeIndex.tz_convert : Convert tz-aware DatetimeIndex from\n one time zone to another.\n\n Examples\n --------\n >>> tz_naive = pd.date_range('2018-03-01 09:00', periods=3)\n >>> tz_naive\n DatetimeIndex(['2018-03-01 09:00:00', '2018-03-02 09:00:00',\n '2018-03-03 09:00:00'],\n dtype='datetime64[ns]', freq='D')\n\n Localize DatetimeIndex in US/Eastern time zone:\n\n >>> tz_aware = tz_naive.tz_localize(tz='US/Eastern')\n >>> tz_aware\n DatetimeIndex(['2018-03-01 09:00:00-05:00',\n '2018-03-02 09:00:00-05:00',\n '2018-03-03 09:00:00-05:00'],\n dtype='datetime64[ns, US/Eastern]', freq='D')\n\n With the ``tz=None``, we can remove the time zone information\n while keeping the local time (not converted to UTC):\n\n >>> tz_aware.tz_localize(None)\n DatetimeIndex(['2018-03-01 09:00:00', '2018-03-02 09:00:00',\n '2018-03-03 09:00:00'],\n dtype='datetime64[ns]', freq='D')\n\n Be careful with DST changes. When there is sequential data, pandas can\n infer the DST time:\n >>> s = pd.to_datetime(pd.Series(['2018-10-28 01:30:00',\n ... '2018-10-28 02:00:00',\n ... '2018-10-28 02:30:00',\n ... '2018-10-28 02:00:00',\n ... '2018-10-28 02:30:00',\n ... '2018-10-28 03:00:00',\n ... '2018-10-28 03:30:00']))\n >>> s.dt.tz_localize('CET', ambiguous='infer')\n 0 2018-10-28 01:30:00+02:00\n 1 2018-10-28 02:00:00+02:00\n 2 2018-10-28 02:30:00+02:00\n 3 2018-10-28 02:00:00+01:00\n 4 2018-10-28 02:30:00+01:00\n 5 2018-10-28 03:00:00+01:00\n 6 2018-10-28 03:30:00+01:00\n dtype: datetime64[ns, CET]\n\n In some cases, inferring the DST is impossible. In such cases, you can\n pass an ndarray to the ambiguous parameter to set the DST explicitly\n\n >>> s = pd.to_datetime(pd.Series(['2018-10-28 01:20:00',\n ... '2018-10-28 02:36:00',\n ... 
'2018-10-28 03:46:00']))\n >>> s.dt.tz_localize('CET', ambiguous=np.array([True, True, False]))\n 0 2015-03-29 03:00:00+02:00\n 1 2015-03-29 03:30:00+02:00\n dtype: datetime64[ns, Europe/Warsaw]\n\n If the DST transition causes nonexistent times, you can shift these\n dates forward or backwards with a timedelta object or `'shift_forward'`\n or `'shift_backwards'`.\n >>> s = pd.to_datetime(pd.Series(['2015-03-29 02:30:00',\n ... '2015-03-29 03:30:00']))\n >>> s.dt.tz_localize('Europe/Warsaw', nonexistent='shift_forward')\n 0 2015-03-29 03:00:00+02:00\n 1 2015-03-29 03:30:00+02:00\n dtype: datetime64[ns, 'Europe/Warsaw']\n >>> s.dt.tz_localize('Europe/Warsaw', nonexistent='shift_backward')\n 0 2015-03-29 01:59:59.999999999+01:00\n 1 2015-03-29 03:30:00+02:00\n dtype: datetime64[ns, 'Europe/Warsaw']\n >>> s.dt.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1H'))\n 0 2015-03-29 03:30:00+02:00\n 1 2015-03-29 03:30:00+02:00\n dtype: datetime64[ns, 'Europe/Warsaw']\n \"\"\"\n if errors is not None:\n warnings.warn(\"The errors argument is deprecated and will be \"\n \"removed in a future release. 
Use \"\n \"nonexistent='NaT' or nonexistent='raise' \"\n \"instead.\", FutureWarning)\n if errors == 'coerce':\n nonexistent = 'NaT'\n elif errors == 'raise':\n nonexistent = 'raise'\n else:\n raise ValueError(\"The errors argument must be either 'coerce' \"\n \"or 'raise'.\")\n\n nonexistent_options = ('raise', 'NaT', 'shift_forward',\n 'shift_backward')\n if nonexistent not in nonexistent_options and not isinstance(\n nonexistent, timedelta):\n raise ValueError(\"The nonexistent argument must be one of 'raise',\"\n \" 'NaT', 'shift_forward', 'shift_backward' or\"\n \" a timedelta object\")\n\n if self.tz is not None:\n if tz is None:\n new_dates = tzconversion.tz_convert(self.asi8, timezones.UTC,\n self.tz)\n else:\n raise TypeError(\"Already tz-aware, use tz_convert to convert.\")\n else:\n tz = timezones.maybe_get_tz(tz)\n # Convert to UTC\n\n new_dates = conversion.tz_localize_to_utc(\n self.asi8, tz, ambiguous=ambiguous, nonexistent=nonexistent,\n )\n new_dates = new_dates.view(_NS_DTYPE)\n dtype = tz_to_dtype(tz)\n return self._simple_new(new_dates, dtype=dtype, freq=self.freq)", "response": "This method will translate the time zone of the object to the specified time zone."} {"instruction": "Implement a function in Python 3 to\nnormalize the date - time to midnight.", "response": "def normalize(self):\n \"\"\"\n Convert times to midnight.\n\n The time component of the date-time is converted to midnight i.e.\n 00:00:00. This is useful in cases, when the time does not matter.\n Length is unaltered. The timezones are unaffected.\n\n This method is available on Series with datetime values under\n the ``.dt`` accessor, and directly on Datetime Array/Index.\n\n Returns\n -------\n DatetimeArray, DatetimeIndex or Series\n The same type as the original data. Series will have the same\n name and index. 
DatetimeIndex will have the same name.\n\n See Also\n --------\n floor : Floor the datetimes to the specified freq.\n ceil : Ceil the datetimes to the specified freq.\n round : Round the datetimes to the specified freq.\n\n Examples\n --------\n >>> idx = pd.date_range(start='2014-08-01 10:00', freq='H',\n ... periods=3, tz='Asia/Calcutta')\n >>> idx\n DatetimeIndex(['2014-08-01 10:00:00+05:30',\n '2014-08-01 11:00:00+05:30',\n '2014-08-01 12:00:00+05:30'],\n dtype='datetime64[ns, Asia/Calcutta]', freq='H')\n >>> idx.normalize()\n DatetimeIndex(['2014-08-01 00:00:00+05:30',\n '2014-08-01 00:00:00+05:30',\n '2014-08-01 00:00:00+05:30'],\n dtype='datetime64[ns, Asia/Calcutta]', freq=None)\n \"\"\"\n if self.tz is None or timezones.is_utc(self.tz):\n not_null = ~self.isna()\n DAY_NS = ccalendar.DAY_SECONDS * 1000000000\n new_values = self.asi8.copy()\n adjustment = (new_values[not_null] % DAY_NS)\n new_values[not_null] = new_values[not_null] - adjustment\n else:\n new_values = conversion.normalize_i8_timestamps(self.asi8, self.tz)\n return type(self)._from_sequence(new_values,\n freq='infer').tz_localize(self.tz)"} {"instruction": "How would you explain what the following Python 3 function does\ndef to_period(self, freq=None):\n from pandas.core.arrays import PeriodArray\n\n if self.tz is not None:\n warnings.warn(\"Converting to PeriodArray/Index representation \"\n \"will drop timezone information.\", UserWarning)\n\n if freq is None:\n freq = self.freqstr or self.inferred_freq\n\n if freq is None:\n raise ValueError(\"You must pass a freq argument as \"\n \"current index has none.\")\n\n freq = get_period_alias(freq)\n\n return PeriodArray._from_datetime64(self._data, freq, tz=self.tz)", "response": "Converts the internal datetime - like object to PeriodArray or Index."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef to_perioddelta(self, freq):\n # TODO: consider privatizing (discussion in GH#23113)\n from 
pandas.core.arrays.timedeltas import TimedeltaArray\n i8delta = self.asi8 - self.to_period(freq).to_timestamp().asi8\n m8delta = i8delta.view('m8[ns]')\n return TimedeltaArray(m8delta)", "response": "Calculates the TimedeltaArray of difference between index\n values and index converted to PeriodArray at specified frequency"} {"instruction": "Explain what the following Python 3 code does\ndef month_name(self, locale=None):\n if self.tz is not None and not timezones.is_utc(self.tz):\n values = self._local_timestamps()\n else:\n values = self.asi8\n\n result = fields.get_date_name_field(values, 'month_name',\n locale=locale)\n result = self._maybe_mask_results(result, fill_value=None)\n return result", "response": "Return the month names of the DateTimeIndex with specified locale."} {"instruction": "Explain what the following Python 3 code does\ndef time(self):\n # If the Timestamps have a timezone that is not UTC,\n # convert them into their i8 representation while\n # keeping their timezone and not using UTC\n if self.tz is not None and not timezones.is_utc(self.tz):\n timestamps = self._local_timestamps()\n else:\n timestamps = self.asi8\n\n return tslib.ints_to_pydatetime(timestamps, box=\"time\")", "response": "Returns numpy array of datetime. time. 
The time part of the Timestamps.\n "} {"instruction": "How would you explain what the following Python 3 function does\ndef to_julian_date(self):\n\n # http://mysite.verizon.net/aesir_research/date/jdalg2.htm\n year = np.asarray(self.year)\n month = np.asarray(self.month)\n day = np.asarray(self.day)\n testarr = month < 3\n year[testarr] -= 1\n month[testarr] += 12\n return (day +\n np.fix((153 * month - 457) / 5) +\n 365 * year +\n np.floor(year / 4) -\n np.floor(year / 100) +\n np.floor(year / 400) +\n 1721118.5 +\n (self.hour +\n self.minute / 60.0 +\n self.second / 3600.0 +\n self.microsecond / 3600.0 / 1e+6 +\n self.nanosecond / 3600.0 / 1e+9\n ) / 24.0)", "response": "Convert Datetime Array to float64 ndarray of Julian Dates."} {"instruction": "Create a Python 3 function to\nparse the API documentation file and yield information about all public API items.", "response": "def get_api_items(api_doc_fd):\n \"\"\"\n Yield information about all public API items.\n\n Parse api.rst file from the documentation, and extract all the functions,\n methods, classes, attributes... This should include all pandas public API.\n\n Parameters\n ----------\n api_doc_fd : file descriptor\n A file descriptor of the API documentation page, containing the table\n of contents with all the public API.\n\n Yields\n ------\n name : str\n The name of the object (e.g. 'pandas.Series.str.upper).\n func : function\n The object itself. 
In most cases this will be a function or method,\n but it can also be classes, properties, cython objects...\n section : str\n The name of the section in the API page where the object item is\n located.\n subsection : str\n The name of the subsection in the API page where the object item is\n located.\n \"\"\"\n current_module = 'pandas'\n previous_line = current_section = current_subsection = ''\n position = None\n for line in api_doc_fd:\n line = line.strip()\n if len(line) == len(previous_line):\n if set(line) == set('-'):\n current_section = previous_line\n continue\n if set(line) == set('~'):\n current_subsection = previous_line\n continue\n\n if line.startswith('.. currentmodule::'):\n current_module = line.replace('.. currentmodule::', '').strip()\n continue\n\n if line == '.. autosummary::':\n position = 'autosummary'\n continue\n\n if position == 'autosummary':\n if line == '':\n position = 'items'\n continue\n\n if position == 'items':\n if line == '':\n position = None\n continue\n item = line.strip()\n func = importlib.import_module(current_module)\n for part in item.split('.'):\n func = getattr(func, part)\n\n yield ('.'.join([current_module, item]), func,\n current_section, current_subsection)\n\n previous_line = line"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef get_validation_data(doc):\n\n errs = []\n wrns = []\n if not doc.raw_doc:\n errs.append(error('GL08'))\n return errs, wrns, ''\n\n if doc.start_blank_lines != 1:\n errs.append(error('GL01'))\n if doc.end_blank_lines != 1:\n errs.append(error('GL02'))\n if doc.double_blank_lines:\n errs.append(error('GL03'))\n mentioned_errs = doc.mentioned_private_classes\n if mentioned_errs:\n errs.append(error('GL04',\n mentioned_private_classes=', '.join(mentioned_errs)))\n for line in doc.raw_doc.splitlines():\n if re.match(\"^ *\\t\", line):\n errs.append(error('GL05', line_with_tabs=line.lstrip()))\n\n unexpected_sections = [section for section in 
doc.section_titles\n if section not in ALLOWED_SECTIONS]\n for section in unexpected_sections:\n errs.append(error('GL06',\n section=section,\n allowed_sections=', '.join(ALLOWED_SECTIONS)))\n\n correct_order = [section for section in ALLOWED_SECTIONS\n if section in doc.section_titles]\n if correct_order != doc.section_titles:\n errs.append(error('GL07',\n correct_sections=', '.join(correct_order)))\n\n if (doc.deprecated_with_directive\n and not doc.extended_summary.startswith('.. deprecated:: ')):\n errs.append(error('GL09'))\n\n if not doc.summary:\n errs.append(error('SS01'))\n else:\n if not doc.summary[0].isupper():\n errs.append(error('SS02'))\n if doc.summary[-1] != '.':\n errs.append(error('SS03'))\n if doc.summary != doc.summary.lstrip():\n errs.append(error('SS04'))\n elif (doc.is_function_or_method\n and doc.summary.split(' ')[0][-1] == 's'):\n errs.append(error('SS05'))\n if doc.num_summary_lines > 1:\n errs.append(error('SS06'))\n\n if not doc.extended_summary:\n wrns.append(('ES01', 'No extended summary found'))\n\n # PR01: Parameters not documented\n # PR02: Unknown parameters\n # PR03: Wrong parameters order\n errs += doc.parameter_mismatches\n\n for param in doc.doc_parameters:\n if not param.startswith(\"*\"): # Check can ignore var / kwargs\n if not doc.parameter_type(param):\n if ':' in param:\n errs.append(error('PR10',\n param_name=param.split(':')[0]))\n else:\n errs.append(error('PR04', param_name=param))\n else:\n if doc.parameter_type(param)[-1] == '.':\n errs.append(error('PR05', param_name=param))\n common_type_errors = [('integer', 'int'),\n ('boolean', 'bool'),\n ('string', 'str')]\n for wrong_type, right_type in common_type_errors:\n if wrong_type in doc.parameter_type(param):\n errs.append(error('PR06',\n param_name=param,\n right_type=right_type,\n wrong_type=wrong_type))\n if not doc.parameter_desc(param):\n errs.append(error('PR07', param_name=param))\n else:\n if not doc.parameter_desc(param)[0].isupper():\n 
errs.append(error('PR08', param_name=param))\n if doc.parameter_desc(param)[-1] != '.':\n errs.append(error('PR09', param_name=param))\n\n if doc.is_function_or_method:\n if not doc.returns:\n if doc.method_returns_something:\n errs.append(error('RT01'))\n else:\n if len(doc.returns) == 1 and doc.returns[0].name:\n errs.append(error('RT02'))\n for name_or_type, type_, desc in doc.returns:\n if not desc:\n errs.append(error('RT03'))\n else:\n desc = ' '.join(desc)\n if not desc[0].isupper():\n errs.append(error('RT04'))\n if not desc.endswith('.'):\n errs.append(error('RT05'))\n\n if not doc.yields and 'yield' in doc.method_source:\n errs.append(error('YD01'))\n\n if not doc.see_also:\n wrns.append(error('SA01'))\n else:\n for rel_name, rel_desc in doc.see_also.items():\n if rel_desc:\n if not rel_desc.endswith('.'):\n errs.append(error('SA02', reference_name=rel_name))\n if not rel_desc[0].isupper():\n errs.append(error('SA03', reference_name=rel_name))\n else:\n errs.append(error('SA04', reference_name=rel_name))\n if rel_name.startswith('pandas.'):\n errs.append(error('SA05',\n reference_name=rel_name,\n right_reference=rel_name[len('pandas.'):]))\n\n examples_errs = ''\n if not doc.examples:\n wrns.append(error('EX01'))\n else:\n examples_errs = doc.examples_errors\n if examples_errs:\n errs.append(error('EX02', doctest_log=examples_errs))\n for err in doc.validate_pep8():\n errs.append(error('EX03',\n error_code=err.error_code,\n error_message=err.message,\n times_happening=' ({} times)'.format(err.count)\n if err.count > 1 else ''))\n examples_source_code = ''.join(doc.examples_source_code)\n for wrong_import in ('numpy', 'pandas'):\n if 'import {}'.format(wrong_import) in examples_source_code:\n errs.append(error('EX04', imported_library=wrong_import))\n return errs, wrns, examples_errs", "response": "Validate the docstring and return the validation data."} {"instruction": "Can you create a Python 3 function that\nvalidates the docstring for the given 
function_name and return a dictionary containing all the information obtained from validating .", "response": "def validate_one(func_name):\n \"\"\"\n Validate the docstring for the given func_name\n\n Parameters\n ----------\n func_name : function\n Function whose docstring will be evaluated (e.g. pandas.read_csv).\n\n Returns\n -------\n dict\n A dictionary containing all the information obtained from validating\n the docstring.\n \"\"\"\n doc = Docstring(func_name)\n errs, wrns, examples_errs = get_validation_data(doc)\n return {'type': doc.type,\n 'docstring': doc.clean_doc,\n 'deprecated': doc.deprecated,\n 'file': doc.source_file_name,\n 'file_line': doc.source_file_def_line,\n 'github_link': doc.github_url,\n 'errors': errs,\n 'warnings': wrns,\n 'examples_errors': examples_errs}"} {"instruction": "Can you write a function in Python 3 where it\nexecutes the validation of all docstrings and return a dictionary with the results.", "response": "def validate_all(prefix, ignore_deprecated=False):\n \"\"\"\n Execute the validation of all docstrings, and return a dict with the\n results.\n\n Parameters\n ----------\n prefix : str or None\n If provided, only the docstrings that start with this pattern will be\n validated. If None, all docstrings will be validated.\n ignore_deprecated: bool, default False\n If True, deprecated objects are ignored when validating docstrings.\n\n Returns\n -------\n dict\n A dictionary with an item for every function/method... 
containing\n all the validation information.\n \"\"\"\n result = {}\n seen = {}\n\n # functions from the API docs\n api_doc_fnames = os.path.join(\n BASE_PATH, 'doc', 'source', 'reference', '*.rst')\n api_items = []\n for api_doc_fname in glob.glob(api_doc_fnames):\n with open(api_doc_fname) as f:\n api_items += list(get_api_items(f))\n for func_name, func_obj, section, subsection in api_items:\n if prefix and not func_name.startswith(prefix):\n continue\n doc_info = validate_one(func_name)\n if ignore_deprecated and doc_info['deprecated']:\n continue\n result[func_name] = doc_info\n\n shared_code_key = doc_info['file'], doc_info['file_line']\n shared_code = seen.get(shared_code_key, '')\n result[func_name].update({'in_api': True,\n 'section': section,\n 'subsection': subsection,\n 'shared_code_with': shared_code})\n\n seen[shared_code_key] = func_name\n\n # functions from introspecting Series, DataFrame and Panel\n api_item_names = set(list(zip(*api_items))[0])\n for class_ in (pandas.Series, pandas.DataFrame, pandas.Panel):\n for member in inspect.getmembers(class_):\n func_name = 'pandas.{}.{}'.format(class_.__name__, member[0])\n if (not member[0].startswith('_')\n and func_name not in api_item_names):\n if prefix and not func_name.startswith(prefix):\n continue\n doc_info = validate_one(func_name)\n if ignore_deprecated and doc_info['deprecated']:\n continue\n result[func_name] = doc_info\n result[func_name]['in_api'] = False\n\n return result"} {"instruction": "How would you implement a function in Python 3 that\nimports a Python object from its name as string.", "response": "def _load_obj(name):\n \"\"\"\n Import Python object from its name as string.\n\n Parameters\n ----------\n name : str\n Object name to import (e.g. 
pandas.Series.str.upper)\n\n Returns\n -------\n object\n Python object that can be a class, method, function...\n\n Examples\n --------\n >>> Docstring._load_obj('pandas.Series')\n \n \"\"\"\n for maxsplit in range(1, name.count('.') + 1):\n # TODO when py3 only replace by: module, *func_parts = ...\n func_name_split = name.rsplit('.', maxsplit)\n module = func_name_split[0]\n func_parts = func_name_split[1:]\n try:\n obj = importlib.import_module(module)\n except ImportError:\n pass\n else:\n continue\n\n if 'obj' not in locals():\n raise ImportError('No module can be imported '\n 'from \"{}\"'.format(name))\n\n for part in func_parts:\n obj = getattr(obj, part)\n return obj"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _to_original_callable(obj):\n while True:\n if inspect.isfunction(obj) or inspect.isclass(obj):\n f = inspect.getfile(obj)\n if f.startswith('<') and f.endswith('>'):\n return None\n return obj\n if inspect.ismethod(obj):\n obj = obj.__func__\n elif isinstance(obj, functools.partial):\n obj = obj.func\n elif isinstance(obj, property):\n obj = obj.fget\n else:\n return None", "response": "Find the original callable that is the object that contains the source code of the object."} {"instruction": "How would you explain what the following Python 3 function does\ndef source_file_name(self):\n try:\n fname = inspect.getsourcefile(self.code_obj)\n except TypeError:\n # In some cases the object is something complex like a cython\n # object that can't be easily introspected. 
An it's better to\n # return the source code file of the object as None, than crash\n pass\n else:\n if fname:\n fname = os.path.relpath(fname, BASE_PATH)\n return fname", "response": "Returns the source file name of the object."} {"instruction": "Given the following Python 3 function, write the documentation\ndef method_returns_something(self):\n '''\n Check if the docstrings method can return something.\n\n Bare returns, returns valued None and returns from nested functions are\n disconsidered.\n\n Returns\n -------\n bool\n Whether the docstrings method can return something.\n '''\n\n def get_returns_not_on_nested_functions(node):\n returns = [node] if isinstance(node, ast.Return) else []\n for child in ast.iter_child_nodes(node):\n # Ignore nested functions and its subtrees.\n if not isinstance(child, ast.FunctionDef):\n child_returns = get_returns_not_on_nested_functions(child)\n returns.extend(child_returns)\n return returns\n\n tree = ast.parse(self.method_source).body\n if tree:\n returns = get_returns_not_on_nested_functions(tree[0])\n return_values = [r.value for r in returns]\n # Replace NameConstant nodes valued None for None.\n for i, v in enumerate(return_values):\n if isinstance(v, ast.NameConstant) and v.value is None:\n return_values[i] = None\n return any(return_values)\n else:\n return False", "response": "Check if the docstrings method can return something."} {"instruction": "How would you code a function in Python 3 to\nconvert numpy types to Python types and return the converted value and the optional format string.", "response": "def _value_with_fmt(self, val):\n \"\"\"Convert numpy types to Python types for the Excel writers.\n\n Parameters\n ----------\n val : object\n Value to be written into cells\n\n Returns\n -------\n Tuple with the first element being the converted value and the second\n being an optional format\n \"\"\"\n fmt = None\n\n if is_integer(val):\n val = int(val)\n elif is_float(val):\n val = float(val)\n elif 
is_bool(val):\n val = bool(val)\n elif isinstance(val, datetime):\n fmt = self.datetime_format\n elif isinstance(val, date):\n fmt = self.date_format\n elif isinstance(val, timedelta):\n val = val.total_seconds() / float(86400)\n fmt = '0'\n else:\n val = compat.to_str(val)\n\n return val, fmt"} {"instruction": "Create a Python 3 function for\nchecking that the path s extension against the Writer s supported Taxonomy extensions. If it isn t supported raises UnsupportedFiletypeError.", "response": "def check_extension(cls, ext):\n \"\"\"checks that path's extension against the Writer's supported\n extensions. If it isn't supported, raises UnsupportedFiletypeError.\"\"\"\n if ext.startswith('.'):\n ext = ext[1:]\n if not any(ext in extension for extension in cls.supported_extensions):\n msg = (\"Invalid extension for engine '{engine}': '{ext}'\"\n .format(engine=pprint_thing(cls.engine),\n ext=pprint_thing(ext)))\n raise ValueError(msg)\n else:\n return True"} {"instruction": "Given the following Python 3 function, write the documentation\ndef parse(self,\n sheet_name=0,\n header=0,\n names=None,\n index_col=None,\n usecols=None,\n squeeze=False,\n converters=None,\n true_values=None,\n false_values=None,\n skiprows=None,\n nrows=None,\n na_values=None,\n parse_dates=False,\n date_parser=None,\n thousands=None,\n comment=None,\n skipfooter=0,\n convert_float=True,\n mangle_dupe_cols=True,\n **kwds):\n \"\"\"\n Parse specified sheet(s) into a DataFrame\n\n Equivalent to read_excel(ExcelFile, ...) 
See the read_excel\n docstring for more info on accepted parameters\n \"\"\"\n\n # Can't use _deprecate_kwarg since sheetname=None has a special meaning\n if is_integer(sheet_name) and sheet_name == 0 and 'sheetname' in kwds:\n warnings.warn(\"The `sheetname` keyword is deprecated, use \"\n \"`sheet_name` instead\", FutureWarning, stacklevel=2)\n sheet_name = kwds.pop(\"sheetname\")\n elif 'sheetname' in kwds:\n raise TypeError(\"Cannot specify both `sheet_name` \"\n \"and `sheetname`. Use just `sheet_name`\")\n\n if 'chunksize' in kwds:\n raise NotImplementedError(\"chunksize keyword of read_excel \"\n \"is not implemented\")\n\n return self._reader.parse(sheet_name=sheet_name,\n header=header,\n names=names,\n index_col=index_col,\n usecols=usecols,\n squeeze=squeeze,\n converters=converters,\n true_values=true_values,\n false_values=false_values,\n skiprows=skiprows,\n nrows=nrows,\n na_values=na_values,\n parse_dates=parse_dates,\n date_parser=date_parser,\n thousands=thousands,\n comment=comment,\n skipfooter=skipfooter,\n convert_float=convert_float,\n mangle_dupe_cols=mangle_dupe_cols,\n **kwds)", "response": "Parse an Excel file into a DataFrame."} {"instruction": "Write a Python 3 function that can\nvalidate that the where clause is of the right type.", "response": "def _validate_where(w):\n \"\"\"\n Validate that the where statement is of the right type.\n\n The type may either be String, Expr, or list-like of Exprs.\n\n Parameters\n ----------\n w : String term expression, Expr, or list-like of Exprs.\n\n Returns\n -------\n where : The original where clause if the check was successful.\n\n Raises\n ------\n TypeError : An invalid data type was passed in for w (e.g. 
dict).\n \"\"\"\n\n if not (isinstance(w, (Expr, str)) or is_list_like(w)):\n raise TypeError(\"where must be passed as a string, Expr, \"\n \"or list-like of Exprs\")\n\n return w"} {"instruction": "Make a summary of the following Python 3 code\ndef maybe_expression(s):\n if not isinstance(s, str):\n return False\n ops = ExprVisitor.binary_ops + ExprVisitor.unary_ops + ('=',)\n\n # make sure we have an op at least\n return any(op in s for op in ops)", "response": "loose checking if s is a pytables - acceptable expression"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef generate(self, v):\n val = v.tostring(self.encoding)\n return \"({lhs} {op} {val})\".format(lhs=self.lhs, op=self.op, val=val)", "response": "create and return the op string for this TermValue"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef convert_value(self, v):\n\n def stringify(value):\n if self.encoding is not None:\n encoder = partial(pprint_thing_encoded,\n encoding=self.encoding)\n else:\n encoder = pprint_thing\n return encoder(value)\n\n kind = _ensure_decoded(self.kind)\n meta = _ensure_decoded(self.meta)\n if kind == 'datetime64' or kind == 'datetime':\n if isinstance(v, (int, float)):\n v = stringify(v)\n v = _ensure_decoded(v)\n v = Timestamp(v)\n if v.tz is not None:\n v = v.tz_convert('UTC')\n return TermValue(v, v.value, kind)\n elif kind == 'timedelta64' or kind == 'timedelta':\n v = Timedelta(v, unit='s').value\n return TermValue(int(v), v, kind)\n elif meta == 'category':\n metadata = com.values_from_object(self.metadata)\n result = metadata.searchsorted(v, side='left')\n\n # result returns 0 if v is first element or if v is not in metadata\n # check that metadata contains v\n if not result and v not in metadata:\n result = -1\n return TermValue(result, result, 'integer')\n elif kind == 'integer':\n v = int(float(v))\n return TermValue(v, v, kind)\n elif kind == 'float':\n v = float(v)\n return 
TermValue(v, v, kind)\n elif kind == 'bool':\n if isinstance(v, str):\n v = not v.strip().lower() in ['false', 'f', 'no',\n 'n', 'none', '0',\n '[]', '{}', '']\n else:\n v = bool(v)\n return TermValue(v, v, kind)\n elif isinstance(v, str):\n # string quoting\n return TermValue(v, stringify(v), 'string')\n else:\n raise TypeError(\"Cannot compare {v} of type {typ} to {kind} column\"\n .format(v=v, typ=type(v), kind=kind))", "response": "convert the expression that is in the term to something that is in the term"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef evaluate(self):\n\n try:\n self.condition = self.terms.prune(ConditionBinOp)\n except AttributeError:\n raise ValueError(\"cannot process expression [{expr}], [{slf}] \"\n \"is not a valid condition\".format(expr=self.expr,\n slf=self))\n try:\n self.filter = self.terms.prune(FilterBinOp)\n except AttributeError:\n raise ValueError(\"cannot process expression [{expr}], [{slf}] \"\n \"is not a valid filter\".format(expr=self.expr,\n slf=self))\n\n return self.condition, self.filter", "response": "create and return the numexpr condition and filter"} {"instruction": "Can you create a Python 3 function that\nquotes the string if not encoded and return", "response": "def tostring(self, encoding):\n \"\"\" quote the string if not encoded\n else encode and return \"\"\"\n if self.kind == 'string':\n if encoding is not None:\n return self.converted\n return '\"{converted}\"'.format(converted=self.converted)\n elif self.kind == 'float':\n # python 2 str(float) is not always\n # round-trippable so use repr()\n return repr(self.converted)\n return self.converted"} {"instruction": "Explain what the following Python 3 code does\ndef _ensure_decoded(s):\n if isinstance(s, (np.bytes_, bytes)):\n s = s.decode(pd.get_option('display.encoding'))\n return s", "response": "Ensure that the given string is decoded."} {"instruction": "Write a Python 3 script for\nwrapping around numpy. 
result_type which overcomes the NPY_MAXARGS argument limit", "response": "def _result_type_many(*arrays_and_dtypes):\n \"\"\" wrapper around numpy.result_type which overcomes the NPY_MAXARGS (32)\n argument limit \"\"\"\n try:\n return np.result_type(*arrays_and_dtypes)\n except ValueError:\n # we have > NPY_MAXARGS terms in our expression\n return reduce(np.result_type, arrays_and_dtypes)"} {"instruction": "Can you write a function in Python 3 where it\nvalidates that the minimum value of the series is not None.", "response": "def validate_argmin_with_skipna(skipna, args, kwargs):\n \"\"\"\n If 'Series.argmin' is called via the 'numpy' library,\n the third parameter in its signature is 'out', which\n takes either an ndarray or 'None', so check if the\n 'skipna' parameter is either an instance of ndarray or\n is None, since 'skipna' itself should be a boolean\n \"\"\"\n\n skipna, args = process_skipna(skipna, args)\n validate_argmin(args, kwargs)\n return skipna"} {"instruction": "Write a Python 3 function that can\nvalidate the maximum argument of a series.", "response": "def validate_argmax_with_skipna(skipna, args, kwargs):\n \"\"\"\n If 'Series.argmax' is called via the 'numpy' library,\n the third parameter in its signature is 'out', which\n takes either an ndarray or 'None', so check if the\n 'skipna' parameter is either an instance of ndarray or\n is None, since 'skipna' itself should be a boolean\n \"\"\"\n\n skipna, args = process_skipna(skipna, args)\n validate_argmax(args, kwargs)\n return skipna"} {"instruction": "Make a summary of the following Python 3 code\ndef validate_argsort_with_ascending(ascending, args, kwargs):\n\n if is_integer(ascending) or ascending is None:\n args = (ascending,) + args\n ascending = True\n\n validate_argsort_kind(args, kwargs, max_fname_arg_count=3)\n return ascending", "response": "Validate that the passed arguments are valid for the order of the categories in the current node."} {"instruction": "Explain what the 
following Python 3 code does\ndef validate_clip_with_axis(axis, args, kwargs):\n\n if isinstance(axis, ndarray):\n args = (axis,) + args\n axis = None\n\n validate_clip(args, kwargs)\n return axis", "response": "Validate that the first argument to NDFrame. clip is valid."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef validate_cum_func_with_skipna(skipna, args, kwargs, name):\n if not is_bool(skipna):\n args = (skipna,) + args\n skipna = True\n\n validate_cum_func(args, kwargs, fname=name)\n return skipna", "response": "This function is called by the numpy library to check if the skipna parameter is a boolean or not."} {"instruction": "Make a summary of the following Python 3 code\ndef validate_take_with_convert(convert, args, kwargs):\n\n if isinstance(convert, ndarray) or convert is None:\n args = (convert,) + args\n convert = True\n\n validate_take(args, kwargs, max_fname_arg_count=3, method='both')\n return convert", "response": "This function is used to validate that the take function is called with the correct arguments."} {"instruction": "How would you explain what the following Python 3 function does\ndef validate_groupby_func(name, args, kwargs, allowed=None):\n if allowed is None:\n allowed = []\n\n kwargs = set(kwargs) - set(allowed)\n\n if len(args) + len(kwargs) > 0:\n raise UnsupportedFunctionCall((\n \"numpy operations are not valid \"\n \"with groupby. 
Use .groupby(...).\"\n \"{func}() instead\".format(func=name)))", "response": "Validate that the function name is a group by function."} {"instruction": "Write a Python 3 function that can\nvalidate that the function passed in is valid for the resampler.", "response": "def validate_resampler_func(method, args, kwargs):\n \"\"\"\n 'args' and 'kwargs' should be empty because all of\n their necessary parameters are explicitly listed in\n the function signature\n \"\"\"\n if len(args) + len(kwargs) > 0:\n if method in RESAMPLER_NUMPY_OPS:\n raise UnsupportedFunctionCall((\n \"numpy operations are not valid \"\n \"with resample. Use .resample(...).\"\n \"{func}() instead\".format(func=method)))\n else:\n raise TypeError(\"too many arguments passed in\")"} {"instruction": "Explain what the following Python 3 code does\ndef validate_minmax_axis(axis):\n ndim = 1 # hard-coded for Index\n if axis is None:\n return\n if axis >= ndim or (axis < 0 and ndim + axis < 0):\n raise ValueError(\"`axis` must be fewer than the number of \"\n \"dimensions ({ndim})\".format(ndim=ndim))", "response": "Validate that the axis argument passed to min max argmin or argmax is not None."} {"instruction": "How would you implement a function in Python 3 that\nserializes a msgpack object to a file path.", "response": "def to_msgpack(path_or_buf, *args, **kwargs):\n \"\"\"\n msgpack (serialize) object to input file path\n\n THIS IS AN EXPERIMENTAL LIBRARY and the storage format\n may not be stable until a future release.\n\n Parameters\n ----------\n path_or_buf : string File path, buffer-like, or None\n if None, return generated string\n args : an object or objects to serialize\n encoding : encoding for unicode objects\n append : boolean whether to append to an existing msgpack\n (default is False)\n compress : type of compressor (zlib or blosc), default to None (no\n compression)\n \"\"\"\n global compressor\n compressor = kwargs.pop('compress', None)\n append = kwargs.pop('append', None)\n if 
append:\n mode = 'a+b'\n else:\n mode = 'wb'\n\n def writer(fh):\n for a in args:\n fh.write(pack(a, **kwargs))\n\n path_or_buf = _stringify_path(path_or_buf)\n if isinstance(path_or_buf, str):\n with open(path_or_buf, mode) as fh:\n writer(fh)\n elif path_or_buf is None:\n buf = BytesIO()\n writer(buf)\n return buf.getvalue()\n else:\n writer(path_or_buf)"} {"instruction": "Implement a Python 3 function for\nloading a msgpack object from a file path.", "response": "def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs):\n \"\"\"\n Load msgpack pandas object from the specified\n file path\n\n THIS IS AN EXPERIMENTAL LIBRARY and the storage format\n may not be stable until a future release.\n\n Parameters\n ----------\n path_or_buf : string File path, BytesIO like or string\n encoding : Encoding for decoding msgpack str type\n iterator : boolean, if True, return an iterator to the unpacker\n (default is False)\n\n Returns\n -------\n obj : same type as object stored in file\n \"\"\"\n path_or_buf, _, _, should_close = get_filepath_or_buffer(path_or_buf)\n if iterator:\n return Iterator(path_or_buf)\n\n def read(fh):\n unpacked_obj = list(unpack(fh, encoding=encoding, **kwargs))\n if len(unpacked_obj) == 1:\n return unpacked_obj[0]\n\n if should_close:\n try:\n path_or_buf.close()\n except IOError:\n pass\n return unpacked_obj\n\n # see if we have an actual file\n if isinstance(path_or_buf, str):\n try:\n exists = os.path.exists(path_or_buf)\n except (TypeError, ValueError):\n exists = False\n\n if exists:\n with open(path_or_buf, 'rb') as fh:\n return read(fh)\n\n if isinstance(path_or_buf, bytes):\n # treat as a binary-like\n fh = None\n try:\n fh = BytesIO(path_or_buf)\n return read(fh)\n finally:\n if fh is not None:\n fh.close()\n elif hasattr(path_or_buf, 'read') and callable(path_or_buf.read):\n # treat as a buffer like\n return read(path_or_buf)\n\n raise ValueError('path_or_buf needs to be a string file path or file-like')"} 
{"instruction": "Can you write a function in Python 3 where it\nreturns my dtype mapping whether number or name", "response": "def dtype_for(t):\n \"\"\" return my dtype mapping, whether number or name \"\"\"\n if t in dtype_dict:\n return dtype_dict[t]\n return np.typeDict.get(t, t)"} {"instruction": "Can you write a function in Python 3 where it\nconverts strings to complex number instance with specified numpy type.", "response": "def c2f(r, i, ctype_name):\n \"\"\"\n Convert strings to complex number instance with specified numpy type.\n \"\"\"\n\n ftype = c2f_dict[ctype_name]\n return np.typeDict[ctype_name](ftype(r) + 1j * ftype(i))"} {"instruction": "Create a Python 3 function to\nconvert the numpy values to a list", "response": "def convert(values):\n \"\"\" convert the numpy values to a list \"\"\"\n\n dtype = values.dtype\n\n if is_categorical_dtype(values):\n return values\n\n elif is_object_dtype(dtype):\n return values.ravel().tolist()\n\n if needs_i8_conversion(dtype):\n values = values.view('i8')\n v = values.ravel()\n\n if compressor == 'zlib':\n _check_zlib()\n\n # return string arrays like they are\n if dtype == np.object_:\n return v.tolist()\n\n # convert to a bytes array\n v = v.tostring()\n return ExtType(0, zlib.compress(v))\n\n elif compressor == 'blosc':\n _check_blosc()\n\n # return string arrays like they are\n if dtype == np.object_:\n return v.tolist()\n\n # convert to a bytes array\n v = v.tostring()\n return ExtType(0, blosc.compress(v, typesize=dtype.itemsize))\n\n # ndarray (on original dtype)\n return ExtType(0, v.tostring())"} {"instruction": "Explain what the following Python 3 code does\ndef encode(obj):\n tobj = type(obj)\n if isinstance(obj, Index):\n if isinstance(obj, RangeIndex):\n return {'typ': 'range_index',\n 'klass': obj.__class__.__name__,\n 'name': getattr(obj, 'name', None),\n 'start': getattr(obj, '_start', None),\n 'stop': getattr(obj, '_stop', None),\n 'step': getattr(obj, '_step', None)}\n elif isinstance(obj, 
PeriodIndex):\n return {'typ': 'period_index',\n 'klass': obj.__class__.__name__,\n 'name': getattr(obj, 'name', None),\n 'freq': getattr(obj, 'freqstr', None),\n 'dtype': obj.dtype.name,\n 'data': convert(obj.asi8),\n 'compress': compressor}\n elif isinstance(obj, DatetimeIndex):\n tz = getattr(obj, 'tz', None)\n\n # store tz info and data as UTC\n if tz is not None:\n tz = tz.zone\n obj = obj.tz_convert('UTC')\n return {'typ': 'datetime_index',\n 'klass': obj.__class__.__name__,\n 'name': getattr(obj, 'name', None),\n 'dtype': obj.dtype.name,\n 'data': convert(obj.asi8),\n 'freq': getattr(obj, 'freqstr', None),\n 'tz': tz,\n 'compress': compressor}\n elif isinstance(obj, (IntervalIndex, IntervalArray)):\n if isinstance(obj, IntervalIndex):\n typ = 'interval_index'\n else:\n typ = 'interval_array'\n return {'typ': typ,\n 'klass': obj.__class__.__name__,\n 'name': getattr(obj, 'name', None),\n 'left': getattr(obj, 'left', None),\n 'right': getattr(obj, 'right', None),\n 'closed': getattr(obj, 'closed', None)}\n elif isinstance(obj, MultiIndex):\n return {'typ': 'multi_index',\n 'klass': obj.__class__.__name__,\n 'names': getattr(obj, 'names', None),\n 'dtype': obj.dtype.name,\n 'data': convert(obj.values),\n 'compress': compressor}\n else:\n return {'typ': 'index',\n 'klass': obj.__class__.__name__,\n 'name': getattr(obj, 'name', None),\n 'dtype': obj.dtype.name,\n 'data': convert(obj.values),\n 'compress': compressor}\n\n elif isinstance(obj, Categorical):\n return {'typ': 'category',\n 'klass': obj.__class__.__name__,\n 'name': getattr(obj, 'name', None),\n 'codes': obj.codes,\n 'categories': obj.categories,\n 'ordered': obj.ordered,\n 'compress': compressor}\n\n elif isinstance(obj, Series):\n if isinstance(obj, SparseSeries):\n raise NotImplementedError(\n 'msgpack sparse series is not implemented'\n )\n # d = {'typ': 'sparse_series',\n # 'klass': obj.__class__.__name__,\n # 'dtype': obj.dtype.name,\n # 'index': obj.index,\n # 'sp_index': obj.sp_index,\n # 
'sp_values': convert(obj.sp_values),\n # 'compress': compressor}\n # for f in ['name', 'fill_value', 'kind']:\n # d[f] = getattr(obj, f, None)\n # return d\n else:\n return {'typ': 'series',\n 'klass': obj.__class__.__name__,\n 'name': getattr(obj, 'name', None),\n 'index': obj.index,\n 'dtype': obj.dtype.name,\n 'data': convert(obj.values),\n 'compress': compressor}\n elif issubclass(tobj, NDFrame):\n if isinstance(obj, SparseDataFrame):\n raise NotImplementedError(\n 'msgpack sparse frame is not implemented'\n )\n # d = {'typ': 'sparse_dataframe',\n # 'klass': obj.__class__.__name__,\n # 'columns': obj.columns}\n # for f in ['default_fill_value', 'default_kind']:\n # d[f] = getattr(obj, f, None)\n # d['data'] = dict([(name, ss)\n # for name, ss in obj.items()])\n # return d\n else:\n\n data = obj._data\n if not data.is_consolidated():\n data = data.consolidate()\n\n # the block manager\n return {'typ': 'block_manager',\n 'klass': obj.__class__.__name__,\n 'axes': data.axes,\n 'blocks': [{'locs': b.mgr_locs.as_array,\n 'values': convert(b.values),\n 'shape': b.values.shape,\n 'dtype': b.dtype.name,\n 'klass': b.__class__.__name__,\n 'compress': compressor} for b in data.blocks]\n }\n\n elif isinstance(obj, (datetime, date, np.datetime64, timedelta,\n np.timedelta64)) or obj is NaT:\n if isinstance(obj, Timestamp):\n tz = obj.tzinfo\n if tz is not None:\n tz = tz.zone\n freq = obj.freq\n if freq is not None:\n freq = freq.freqstr\n return {'typ': 'timestamp',\n 'value': obj.value,\n 'freq': freq,\n 'tz': tz}\n if obj is NaT:\n return {'typ': 'nat'}\n elif isinstance(obj, np.timedelta64):\n return {'typ': 'timedelta64',\n 'data': obj.view('i8')}\n elif isinstance(obj, timedelta):\n return {'typ': 'timedelta',\n 'data': (obj.days, obj.seconds, obj.microseconds)}\n elif isinstance(obj, np.datetime64):\n return {'typ': 'datetime64',\n 'data': str(obj)}\n elif isinstance(obj, datetime):\n return {'typ': 'datetime',\n 'data': obj.isoformat()}\n elif isinstance(obj, 
date):\n return {'typ': 'date',\n 'data': obj.isoformat()}\n raise Exception(\n \"cannot encode this datetimelike object: {obj}\".format(obj=obj))\n elif isinstance(obj, Period):\n return {'typ': 'period',\n 'ordinal': obj.ordinal,\n 'freq': obj.freqstr}\n elif isinstance(obj, Interval):\n return {'typ': 'interval',\n 'left': obj.left,\n 'right': obj.right,\n 'closed': obj.closed}\n elif isinstance(obj, BlockIndex):\n return {'typ': 'block_index',\n 'klass': obj.__class__.__name__,\n 'blocs': obj.blocs,\n 'blengths': obj.blengths,\n 'length': obj.length}\n elif isinstance(obj, IntIndex):\n return {'typ': 'int_index',\n 'klass': obj.__class__.__name__,\n 'indices': obj.indices,\n 'length': obj.length}\n elif isinstance(obj, np.ndarray):\n return {'typ': 'ndarray',\n 'shape': obj.shape,\n 'ndim': obj.ndim,\n 'dtype': obj.dtype.name,\n 'data': convert(obj),\n 'compress': compressor}\n elif isinstance(obj, np.number):\n if np.iscomplexobj(obj):\n return {'typ': 'np_scalar',\n 'sub_typ': 'np_complex',\n 'dtype': obj.dtype.name,\n 'real': obj.real.__repr__(),\n 'imag': obj.imag.__repr__()}\n else:\n return {'typ': 'np_scalar',\n 'dtype': obj.dtype.name,\n 'data': obj.__repr__()}\n elif isinstance(obj, complex):\n return {'typ': 'np_complex',\n 'real': obj.real.__repr__(),\n 'imag': obj.imag.__repr__()}\n\n return obj", "response": "Encodes an object into a sequence of data structures."} {"instruction": "Write a Python 3 function for\ndecoding a numpy data type into a new object.", "response": "def decode(obj):\n \"\"\"\n Decoder for deserializing numpy data types.\n \"\"\"\n\n typ = obj.get('typ')\n if typ is None:\n return obj\n elif typ == 'timestamp':\n freq = obj['freq'] if 'freq' in obj else obj['offset']\n return Timestamp(obj['value'], tz=obj['tz'], freq=freq)\n elif typ == 'nat':\n return NaT\n elif typ == 'period':\n return Period(ordinal=obj['ordinal'], freq=obj['freq'])\n elif typ == 'index':\n dtype = dtype_for(obj['dtype'])\n data = unconvert(obj['data'], 
dtype,\n obj.get('compress'))\n return Index(data, dtype=dtype, name=obj['name'])\n elif typ == 'range_index':\n return RangeIndex(obj['start'],\n obj['stop'],\n obj['step'],\n name=obj['name'])\n elif typ == 'multi_index':\n dtype = dtype_for(obj['dtype'])\n data = unconvert(obj['data'], dtype,\n obj.get('compress'))\n data = [tuple(x) for x in data]\n return MultiIndex.from_tuples(data, names=obj['names'])\n elif typ == 'period_index':\n data = unconvert(obj['data'], np.int64, obj.get('compress'))\n d = dict(name=obj['name'], freq=obj['freq'])\n freq = d.pop('freq', None)\n return PeriodIndex(PeriodArray(data, freq), **d)\n\n elif typ == 'datetime_index':\n data = unconvert(obj['data'], np.int64, obj.get('compress'))\n d = dict(name=obj['name'], freq=obj['freq'])\n result = DatetimeIndex(data, **d)\n tz = obj['tz']\n\n # reverse tz conversion\n if tz is not None:\n result = result.tz_localize('UTC').tz_convert(tz)\n return result\n\n elif typ in ('interval_index', 'interval_array'):\n return globals()[obj['klass']].from_arrays(obj['left'],\n obj['right'],\n obj['closed'],\n name=obj['name'])\n elif typ == 'category':\n from_codes = globals()[obj['klass']].from_codes\n return from_codes(codes=obj['codes'],\n categories=obj['categories'],\n ordered=obj['ordered'])\n\n elif typ == 'interval':\n return Interval(obj['left'], obj['right'], obj['closed'])\n elif typ == 'series':\n dtype = dtype_for(obj['dtype'])\n pd_dtype = pandas_dtype(dtype)\n\n index = obj['index']\n result = Series(unconvert(obj['data'], dtype, obj['compress']),\n index=index,\n dtype=pd_dtype,\n name=obj['name'])\n return result\n\n elif typ == 'block_manager':\n axes = obj['axes']\n\n def create_block(b):\n values = _safe_reshape(unconvert(\n b['values'], dtype_for(b['dtype']),\n b['compress']), b['shape'])\n\n # locs handles duplicate column names, and should be used instead\n # of items; see GH 9618\n if 'locs' in b:\n placement = b['locs']\n else:\n placement = 
axes[0].get_indexer(b['items'])\n\n if is_datetime64tz_dtype(b['dtype']):\n assert isinstance(values, np.ndarray), type(values)\n assert values.dtype == 'M8[ns]', values.dtype\n values = DatetimeArray(values, dtype=b['dtype'])\n\n return make_block(values=values,\n klass=getattr(internals, b['klass']),\n placement=placement,\n dtype=b['dtype'])\n\n blocks = [create_block(b) for b in obj['blocks']]\n return globals()[obj['klass']](BlockManager(blocks, axes))\n elif typ == 'datetime':\n return parse(obj['data'])\n elif typ == 'datetime64':\n return np.datetime64(parse(obj['data']))\n elif typ == 'date':\n return parse(obj['data']).date()\n elif typ == 'timedelta':\n return timedelta(*obj['data'])\n elif typ == 'timedelta64':\n return np.timedelta64(int(obj['data']))\n # elif typ == 'sparse_series':\n # dtype = dtype_for(obj['dtype'])\n # return SparseSeries(\n # unconvert(obj['sp_values'], dtype, obj['compress']),\n # sparse_index=obj['sp_index'], index=obj['index'],\n # fill_value=obj['fill_value'], kind=obj['kind'], name=obj['name'])\n # elif typ == 'sparse_dataframe':\n # return SparseDataFrame(\n # obj['data'], columns=obj['columns'],\n # default_fill_value=obj['default_fill_value'],\n # default_kind=obj['default_kind']\n # )\n # elif typ == 'sparse_panel':\n # return SparsePanel(\n # obj['data'], items=obj['items'],\n # default_fill_value=obj['default_fill_value'],\n # default_kind=obj['default_kind'])\n elif typ == 'block_index':\n return globals()[obj['klass']](obj['length'], obj['blocs'],\n obj['blengths'])\n elif typ == 'int_index':\n return globals()[obj['klass']](obj['length'], obj['indices'])\n elif typ == 'ndarray':\n return unconvert(obj['data'], np.typeDict[obj['dtype']],\n obj.get('compress')).reshape(obj['shape'])\n elif typ == 'np_scalar':\n if obj.get('sub_typ') == 'np_complex':\n return c2f(obj['real'], obj['imag'], obj['dtype'])\n else:\n dtype = dtype_for(obj['dtype'])\n try:\n return dtype(obj['data'])\n except (ValueError, TypeError):\n return 
dtype.type(obj['data'])\n elif typ == 'np_complex':\n return complex(obj['real'] + '+' + obj['imag'] + 'j')\n elif isinstance(obj, (dict, list, set)):\n return obj\n else:\n return obj"} {"instruction": "Create a Python 3 function to\npack an object and return the packed bytes.", "response": "def pack(o, default=encode,\n encoding='utf-8', unicode_errors='strict', use_single_float=False,\n autoreset=1, use_bin_type=1):\n \"\"\"\n Pack an object and return the packed bytes.\n \"\"\"\n\n return Packer(default=default, encoding=encoding,\n unicode_errors=unicode_errors,\n use_single_float=use_single_float,\n autoreset=autoreset,\n use_bin_type=use_bin_type).pack(o)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef unpack(packed, object_hook=decode,\n list_hook=None, use_list=False, encoding='utf-8',\n unicode_errors='strict', object_pairs_hook=None,\n max_buffer_size=0, ext_hook=ExtType):\n \"\"\"\n Unpack a packed object, return an iterator\n Note: packed lists will be returned as tuples\n \"\"\"\n\n return Unpacker(packed, object_hook=object_hook,\n list_hook=list_hook,\n use_list=use_list, encoding=encoding,\n unicode_errors=unicode_errors,\n object_pairs_hook=object_pairs_hook,\n max_buffer_size=max_buffer_size,\n ext_hook=ext_hook)", "response": "Unpack a packed object into an iterator\n "} {"instruction": "Create a Python 3 function to\nread a JSON string or file - like object into pandas. DataFrame.", "response": "def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None,\n convert_axes=None, convert_dates=True, keep_default_dates=True,\n numpy=False, precise_float=False, date_unit=None, encoding=None,\n lines=False, chunksize=None, compression='infer'):\n \"\"\"\n Convert a JSON string to pandas object.\n\n Parameters\n ----------\n path_or_buf : a valid JSON string or file-like, default: None\n The string could be a URL. Valid URL schemes include http, ftp, s3,\n gcs, and file. For file URLs, a host is expected. 
For instance, a local\n file could be ``file://localhost/path/to/table.json``\n\n orient : string,\n Indication of expected JSON string format.\n Compatible JSON strings can be produced by ``to_json()`` with a\n corresponding orient value.\n The set of possible orients is:\n\n - ``'split'`` : dict like\n ``{index -> [index], columns -> [columns], data -> [values]}``\n - ``'records'`` : list like\n ``[{column -> value}, ... , {column -> value}]``\n - ``'index'`` : dict like ``{index -> {column -> value}}``\n - ``'columns'`` : dict like ``{column -> {index -> value}}``\n - ``'values'`` : just the values array\n\n The allowed and default values depend on the value\n of the `typ` parameter.\n\n * when ``typ == 'series'``,\n\n - allowed orients are ``{'split','records','index'}``\n - default is ``'index'``\n - The Series index must be unique for orient ``'index'``.\n\n * when ``typ == 'frame'``,\n\n - allowed orients are ``{'split','records','index',\n 'columns','values', 'table'}``\n - default is ``'columns'``\n - The DataFrame index must be unique for orients ``'index'`` and\n ``'columns'``.\n - The DataFrame columns must be unique for orients ``'index'``,\n ``'columns'``, and ``'records'``.\n\n .. versionadded:: 0.23.0\n 'table' as an allowed value for the ``orient`` argument\n\n typ : type of object to recover (series or frame), default 'frame'\n dtype : boolean or dict, default None\n If True, infer dtypes; if a dict of column to dtype, then use those;\n if False, then don't infer dtypes at all, applies only to the data.\n\n For all ``orient`` values except ``'table'``, default is True.\n\n .. versionchanged:: 0.25.0\n\n Not applicable for ``orient='table'``.\n\n convert_axes : boolean, default None\n Try to convert the axes to the proper dtypes.\n\n For all ``orient`` values except ``'table'``, default is True.\n\n .. 
versionchanged:: 0.25.0\n\n Not applicable for ``orient='table'``.\n\n convert_dates : boolean, default True\n List of columns to parse for dates; If True, then try to parse\n datelike columns default is True; a column label is datelike if\n\n * it ends with ``'_at'``,\n\n * it ends with ``'_time'``,\n\n * it begins with ``'timestamp'``,\n\n * it is ``'modified'``, or\n\n * it is ``'date'``\n\n keep_default_dates : boolean, default True\n If parsing dates, then parse the default datelike columns\n numpy : boolean, default False\n Direct decoding to numpy arrays. Supports numeric data only, but\n non-numeric column and index labels are supported. Note also that the\n JSON ordering MUST be the same for each term if numpy=True.\n precise_float : boolean, default False\n Set to enable usage of higher precision (strtod) function when\n decoding string to double values. Default (False) is to use fast but\n less precise builtin functionality\n date_unit : string, default None\n The timestamp unit to detect if converting dates. The default behaviour\n is to try and detect the correct precision, but if this is not desired\n then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,\n milliseconds, microseconds or nanoseconds respectively.\n encoding : str, default is 'utf-8'\n The encoding to use to decode py3 bytes.\n\n .. versionadded:: 0.19.0\n\n lines : boolean, default False\n Read the file as a json object per line.\n\n .. versionadded:: 0.19.0\n\n chunksize : integer, default None\n Return JsonReader object for iteration.\n See the `line-delimted json docs\n `_\n for more information on ``chunksize``.\n This can only be passed if `lines=True`.\n If this is None, the file will be read into memory all at once.\n\n .. versionadded:: 0.21.0\n\n compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'\n For on-the-fly decompression of on-disk data. 
If 'infer', then use\n gzip, bz2, zip or xz if path_or_buf is a string ending in\n '.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression\n otherwise. If using 'zip', the ZIP file must contain only one data\n file to be read in. Set to None for no decompression.\n\n .. versionadded:: 0.21.0\n\n Returns\n -------\n result : Series or DataFrame, depending on the value of `typ`.\n\n See Also\n --------\n DataFrame.to_json\n\n Notes\n -----\n Specific to ``orient='table'``, if a :class:`DataFrame` with a literal\n :class:`Index` name of `index` gets written with :func:`to_json`, the\n subsequent read operation will incorrectly set the :class:`Index` name to\n ``None``. This is because `index` is also used by :func:`DataFrame.to_json`\n to denote a missing :class:`Index` name, and the subsequent\n :func:`read_json` operation cannot distinguish between the two. The same\n limitation is encountered with a :class:`MultiIndex` and any names\n beginning with ``'level_'``.\n\n Examples\n --------\n\n >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],\n ... index=['row 1', 'row 2'],\n ... 
columns=['col 1', 'col 2'])\n\n Encoding/decoding a Dataframe using ``'split'`` formatted JSON:\n\n >>> df.to_json(orient='split')\n '{\"columns\":[\"col 1\",\"col 2\"],\n \"index\":[\"row 1\",\"row 2\"],\n \"data\":[[\"a\",\"b\"],[\"c\",\"d\"]]}'\n >>> pd.read_json(_, orient='split')\n col 1 col 2\n row 1 a b\n row 2 c d\n\n Encoding/decoding a Dataframe using ``'index'`` formatted JSON:\n\n >>> df.to_json(orient='index')\n '{\"row 1\":{\"col 1\":\"a\",\"col 2\":\"b\"},\"row 2\":{\"col 1\":\"c\",\"col 2\":\"d\"}}'\n >>> pd.read_json(_, orient='index')\n col 1 col 2\n row 1 a b\n row 2 c d\n\n Encoding/decoding a Dataframe using ``'records'`` formatted JSON.\n Note that index labels are not preserved with this encoding.\n\n >>> df.to_json(orient='records')\n '[{\"col 1\":\"a\",\"col 2\":\"b\"},{\"col 1\":\"c\",\"col 2\":\"d\"}]'\n >>> pd.read_json(_, orient='records')\n col 1 col 2\n 0 a b\n 1 c d\n\n Encoding with Table Schema\n\n >>> df.to_json(orient='table')\n '{\"schema\": {\"fields\": [{\"name\": \"index\", \"type\": \"string\"},\n {\"name\": \"col 1\", \"type\": \"string\"},\n {\"name\": \"col 2\", \"type\": \"string\"}],\n \"primaryKey\": \"index\",\n \"pandas_version\": \"0.20.0\"},\n \"data\": [{\"index\": \"row 1\", \"col 1\": \"a\", \"col 2\": \"b\"},\n {\"index\": \"row 2\", \"col 1\": \"c\", \"col 2\": \"d\"}]}'\n \"\"\"\n\n if orient == 'table' and dtype:\n raise ValueError(\"cannot pass both dtype and orient='table'\")\n if orient == 'table' and convert_axes:\n raise ValueError(\"cannot pass both convert_axes and orient='table'\")\n\n if dtype is None and orient != 'table':\n dtype = True\n if convert_axes is None and orient != 'table':\n convert_axes = True\n\n compression = _infer_compression(path_or_buf, compression)\n filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(\n path_or_buf, encoding=encoding, compression=compression,\n )\n\n json_reader = JsonReader(\n filepath_or_buffer, orient=orient, typ=typ, dtype=dtype,\n 
convert_axes=convert_axes, convert_dates=convert_dates,\n keep_default_dates=keep_default_dates, numpy=numpy,\n precise_float=precise_float, date_unit=date_unit, encoding=encoding,\n lines=lines, chunksize=chunksize, compression=compression,\n )\n\n if chunksize:\n return json_reader\n\n result = json_reader.read()\n if should_close:\n try:\n filepath_or_buffer.close()\n except: # noqa: flake8\n pass\n return result"} {"instruction": "Can you create a Python 3 function that\ntries to format axes if they are datelike.", "response": "def _format_axes(self):\n \"\"\"\n Try to format axes if they are datelike.\n \"\"\"\n if not self.obj.index.is_unique and self.orient in (\n 'index', 'columns'):\n raise ValueError(\"DataFrame index must be unique for orient=\"\n \"'{orient}'.\".format(orient=self.orient))\n if not self.obj.columns.is_unique and self.orient in (\n 'index', 'columns', 'records'):\n raise ValueError(\"DataFrame columns must be unique for orient=\"\n \"'{orient}'.\".format(orient=self.orient))"} {"instruction": "How would you implement a function in Python 3 that\ncombines a list of JSON objects into one JSON object.", "response": "def _combine_lines(self, lines):\n \"\"\"\n Combines a list of JSON objects into one JSON object.\n \"\"\"\n lines = filter(None, map(lambda x: x.strip(), lines))\n return '[' + ','.join(lines) + ']'"} {"instruction": "Create a Python 3 function to\nread the whole JSON input into a pandas object.", "response": "def read(self):\n \"\"\"\n Read the whole JSON input into a pandas object.\n \"\"\"\n if self.lines and self.chunksize:\n obj = concat(self)\n elif self.lines:\n\n data = to_str(self.data)\n obj = self._get_object_parser(\n self._combine_lines(data.split('\\n'))\n )\n else:\n obj = self._get_object_parser(self.data)\n self.close()\n return obj"} {"instruction": "Implement a function in Python 3 to\nparse a json document into a pandas object.", "response": "def _get_object_parser(self, json):\n \"\"\"\n Parses a json 
document into a pandas object.\n \"\"\"\n typ = self.typ\n dtype = self.dtype\n kwargs = {\n \"orient\": self.orient, \"dtype\": self.dtype,\n \"convert_axes\": self.convert_axes,\n \"convert_dates\": self.convert_dates,\n \"keep_default_dates\": self.keep_default_dates, \"numpy\": self.numpy,\n \"precise_float\": self.precise_float, \"date_unit\": self.date_unit\n }\n obj = None\n if typ == 'frame':\n obj = FrameParser(json, **kwargs).parse()\n\n if typ == 'series' or obj is None:\n if not isinstance(dtype, bool):\n kwargs['dtype'] = dtype\n obj = SeriesParser(json, **kwargs).parse()\n\n return obj"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef check_keys_split(self, decoded):\n bad_keys = set(decoded.keys()).difference(set(self._split_keys))\n if bad_keys:\n bad_keys = \", \".join(bad_keys)\n raise ValueError(\"JSON data had unexpected key(s): {bad_keys}\"\n .format(bad_keys=pprint_thing(bad_keys)))", "response": "Checks that the dict has only the appropriate keys for orient = split."} {"instruction": "Implement a Python 3 function for\ntrying to convert axes.", "response": "def _convert_axes(self):\n \"\"\"\n Try to convert axes.\n \"\"\"\n for axis in self.obj._AXIS_NUMBERS.keys():\n new_axis, result = self._try_convert_data(\n axis, self.obj._get_axis(axis), use_dtypes=False,\n convert_dates=True)\n if result:\n setattr(self.obj, axis, new_axis)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _process_converter(self, f, filt=None):\n\n if filt is None:\n filt = lambda col, c: True\n\n needs_new_obj = False\n new_obj = dict()\n for i, (col, c) in enumerate(self.obj.iteritems()):\n if filt(col, c):\n new_data, result = f(col, c)\n if result:\n c = new_data\n needs_new_obj = True\n new_obj[i] = c\n\n if needs_new_obj:\n\n # possibly handle dup columns\n new_obj = DataFrame(new_obj, index=self.obj.index)\n new_obj.columns = self.obj.columns\n self.obj = new_obj", "response": "Take a 
conversion function and possibly recreate the frame."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef format_array(values, formatter, float_format=None, na_rep='NaN',\n digits=None, space=None, justify='right', decimal='.',\n leading_space=None):\n \"\"\"\n Format an array for printing.\n\n Parameters\n ----------\n values\n formatter\n float_format\n na_rep\n digits\n space\n justify\n decimal\n leading_space : bool, optional\n Whether the array should be formatted with a leading space.\n When an array as a column of a Series or DataFrame, we do want\n the leading space to pad between columns.\n\n When formatting an Index subclass\n (e.g. IntervalIndex._format_native_types), we don't want the\n leading space since it should be left-aligned.\n\n Returns\n -------\n List[str]\n \"\"\"\n\n if is_datetime64_dtype(values.dtype):\n fmt_klass = Datetime64Formatter\n elif is_datetime64tz_dtype(values):\n fmt_klass = Datetime64TZFormatter\n elif is_timedelta64_dtype(values.dtype):\n fmt_klass = Timedelta64Formatter\n elif is_extension_array_dtype(values.dtype):\n fmt_klass = ExtensionArrayFormatter\n elif is_float_dtype(values.dtype) or is_complex_dtype(values.dtype):\n fmt_klass = FloatArrayFormatter\n elif is_integer_dtype(values.dtype):\n fmt_klass = IntArrayFormatter\n else:\n fmt_klass = GenericArrayFormatter\n\n if space is None:\n space = get_option(\"display.column_space\")\n\n if float_format is None:\n float_format = get_option(\"display.float_format\")\n\n if digits is None:\n digits = get_option(\"display.precision\")\n\n fmt_obj = fmt_klass(values, digits=digits, na_rep=na_rep,\n float_format=float_format, formatter=formatter,\n space=space, justify=justify, decimal=decimal,\n leading_space=leading_space)\n\n return fmt_obj.get_result()", "response": "Formats an array of values using the given formatter."} {"instruction": "Make a summary of the following Python 3 code\ndef format_percentiles(percentiles):\n\n 
percentiles = np.asarray(percentiles)\n\n # It checks for np.NaN as well\n with np.errstate(invalid='ignore'):\n if not is_numeric_dtype(percentiles) or not np.all(percentiles >= 0) \\\n or not np.all(percentiles <= 1):\n raise ValueError(\"percentiles should all be in the interval [0,1]\")\n\n percentiles = 100 * percentiles\n int_idx = (percentiles.astype(int) == percentiles)\n\n if np.all(int_idx):\n out = percentiles.astype(int).astype(str)\n return [i + '%' for i in out]\n\n unique_pcts = np.unique(percentiles)\n to_begin = unique_pcts[0] if unique_pcts[0] > 0 else None\n to_end = 100 - unique_pcts[-1] if unique_pcts[-1] < 100 else None\n\n # Least precision that keeps percentiles unique after rounding\n prec = -np.floor(np.log10(np.min(\n np.ediff1d(unique_pcts, to_begin=to_begin, to_end=to_end)\n ))).astype(int)\n prec = max(1, prec)\n out = np.empty_like(percentiles, dtype=object)\n out[int_idx] = percentiles[int_idx].astype(int).astype(str)\n out[~int_idx] = percentiles[~int_idx].round(prec).astype(str)\n return [i + '%' for i in out]", "response": "Formats the percentiles of a single resource item into a list of strings."} {"instruction": "Write a Python 3 script for\nreturning a formatter function for a range of timedeltas.", "response": "def _get_format_timedelta64(values, nat_rep='NaT', box=False):\n \"\"\"\n Return a formatter function for a range of timedeltas.\n These will all have the same format argument\n\n If box, then show the return in quotes\n \"\"\"\n\n values_int = values.astype(np.int64)\n\n consider_values = values_int != iNaT\n\n one_day_nanos = (86400 * 1e9)\n even_days = np.logical_and(consider_values,\n values_int % one_day_nanos != 0).sum() == 0\n all_sub_day = np.logical_and(\n consider_values, np.abs(values_int) >= one_day_nanos).sum() == 0\n\n if even_days:\n format = None\n elif all_sub_day:\n format = 'sub_day'\n else:\n format = 'long'\n\n def _formatter(x):\n if x is None or (is_scalar(x) and isna(x)):\n return nat_rep\n\n if 
not isinstance(x, Timedelta):\n x = Timedelta(x)\n result = x._repr_base(format=format)\n if box:\n result = \"'{res}'\".format(res=result)\n return result\n\n return _formatter"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _trim_zeros_complex(str_complexes, na_rep='NaN'):\n def separate_and_trim(str_complex, na_rep):\n num_arr = str_complex.split('+')\n return (_trim_zeros_float([num_arr[0]], na_rep) +\n ['+'] +\n _trim_zeros_float([num_arr[1][:-1]], na_rep) +\n ['j'])\n\n return [''.join(separate_and_trim(x, na_rep)) for x in str_complexes]", "response": "Returns a list of complex numbers that are not zero or NaN."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _trim_zeros_float(str_floats, na_rep='NaN'):\n trimmed = str_floats\n\n def _is_number(x):\n return (x != na_rep and not x.endswith('inf'))\n\n def _cond(values):\n finite = [x for x in values if _is_number(x)]\n return (len(finite) > 0 and all(x.endswith('0') for x in finite) and\n not (any(('e' in x) or ('E' in x) for x in finite)))\n\n while _cond(trimmed):\n trimmed = [x[:-1] if _is_number(x) else x for x in trimmed]\n\n # leave one 0 after the decimal points if need be.\n return [x + \"0\" if x.endswith('.') and _is_number(x) else x\n for x in trimmed]", "response": "Trims zeros from the list of strings in the order they appear in the archive."} {"instruction": "Given the following Python 3 function, write the documentation\ndef set_eng_float_format(accuracy=3, use_eng_prefix=False):\n\n set_option(\"display.float_format\", EngFormatter(accuracy, use_eng_prefix))\n set_option(\"display.column_space\", max(12, accuracy + 9))", "response": "Set default behavior on how float is formatted in DataFrame."} {"instruction": "Create a Python 3 function for\nreturning a list of lengths of all the levels in a single tree.", "response": "def get_level_lengths(levels, sentinel=''):\n \"\"\"For each index in each level the function 
returns lengths of indexes.\n\n Parameters\n ----------\n levels : list of lists\n List of values on for level.\n sentinel : string, optional\n Value which states that no new index starts on there.\n\n Returns\n ----------\n Returns list of maps. For each level returns map of indexes (key is index\n in row and value is length of index).\n \"\"\"\n if len(levels) == 0:\n return []\n\n control = [True] * len(levels[0])\n\n result = []\n for level in levels:\n last_index = 0\n\n lengths = {}\n for i, key in enumerate(level):\n if control[i] and key == sentinel:\n pass\n else:\n control[i] = False\n lengths[last_index] = i - last_index\n last_index = i\n\n lengths[last_index] = len(level) - last_index\n\n result.append(lengths)\n\n return result"} {"instruction": "Create a Python 3 function to\nwrite a list of lines to a buffer.", "response": "def buffer_put_lines(buf, lines):\n \"\"\"\n Appends lines to a buffer.\n\n Parameters\n ----------\n buf\n The buffer to write to\n lines\n The lines to append.\n \"\"\"\n if any(isinstance(x, str) for x in lines):\n lines = [str(x) for x in lines]\n buf.write('\\n'.join(lines))"} {"instruction": "Explain what the following Python 3 code does\ndef len(self, text):\n if not isinstance(text, str):\n return len(text)\n\n return sum(self._EAW_MAP.get(east_asian_width(c), self.ambiguous_width)\n for c in text)", "response": "Calculate display width considering unicode East Asian Width"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _to_str_columns(self):\n frame = self.tr_frame\n # may include levels names also\n\n str_index = self._get_formatted_index(frame)\n\n if not is_list_like(self.header) and not self.header:\n stringified = []\n for i, c in enumerate(frame):\n fmt_values = self._format_col(i)\n fmt_values = _make_fixed_width(fmt_values, self.justify,\n minimum=(self.col_space or 0),\n adj=self.adj)\n stringified.append(fmt_values)\n else:\n if is_list_like(self.header):\n if 
len(self.header) != len(self.columns):\n raise ValueError(('Writing {ncols} cols but got {nalias} '\n 'aliases'\n .format(ncols=len(self.columns),\n nalias=len(self.header))))\n str_columns = [[label] for label in self.header]\n else:\n str_columns = self._get_formatted_column_labels(frame)\n\n if self.show_row_idx_names:\n for x in str_columns:\n x.append('')\n\n stringified = []\n for i, c in enumerate(frame):\n cheader = str_columns[i]\n header_colwidth = max(self.col_space or 0,\n *(self.adj.len(x) for x in cheader))\n fmt_values = self._format_col(i)\n fmt_values = _make_fixed_width(fmt_values, self.justify,\n minimum=header_colwidth,\n adj=self.adj)\n\n max_len = max(max(self.adj.len(x) for x in fmt_values),\n header_colwidth)\n cheader = self.adj.justify(cheader, max_len, mode=self.justify)\n stringified.append(cheader + fmt_values)\n\n strcols = stringified\n if self.index:\n strcols.insert(0, str_index)\n\n # Add ... to signal truncated\n truncate_h = self.truncate_h\n truncate_v = self.truncate_v\n\n if truncate_h:\n col_num = self.tr_col_num\n strcols.insert(self.tr_col_num + 1, [' ...'] * (len(str_index)))\n if truncate_v:\n n_header_rows = len(str_index) - len(frame)\n row_num = self.tr_row_num\n for ix, col in enumerate(strcols):\n # infer from above row\n cwidth = self.adj.len(strcols[ix][row_num])\n is_dot_col = False\n if truncate_h:\n is_dot_col = ix == col_num + 1\n if cwidth > 3 or is_dot_col:\n my_str = '...'\n else:\n my_str = '..'\n\n if ix == 0:\n dot_mode = 'left'\n elif is_dot_col:\n cwidth = 4\n dot_mode = 'right'\n else:\n dot_mode = 'right'\n dot_str = self.adj.justify([my_str], cwidth, mode=dot_mode)[0]\n strcols[ix].insert(row_num + n_header_rows, dot_str)\n return strcols", "response": "Render a DataFrame to a list of columns as strings."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef to_string(self):\n from pandas import Series\n\n frame = self.frame\n\n if len(frame.columns) == 0 or 
len(frame.index) == 0:\n info_line = ('Empty {name}\\nColumns: {col}\\nIndex: {idx}'\n .format(name=type(self.frame).__name__,\n col=pprint_thing(frame.columns),\n idx=pprint_thing(frame.index)))\n text = info_line\n else:\n\n strcols = self._to_str_columns()\n if self.line_width is None: # no need to wrap around just print\n # the whole frame\n text = self.adj.adjoin(1, *strcols)\n elif (not isinstance(self.max_cols, int) or\n self.max_cols > 0): # need to wrap around\n text = self._join_multiline(*strcols)\n else: # max_cols == 0. Try to fit frame to terminal\n text = self.adj.adjoin(1, *strcols).split('\\n')\n max_len = Series(text).str.len().max()\n # plus truncate dot col\n dif = max_len - self.w\n # '+ 1' to avoid too wide repr (GH PR #17023)\n adj_dif = dif + 1\n col_lens = Series([Series(ele).apply(len).max()\n for ele in strcols])\n n_cols = len(col_lens)\n counter = 0\n while adj_dif > 0 and n_cols > 1:\n counter += 1\n mid = int(round(n_cols / 2.))\n mid_ix = col_lens.index[mid]\n col_len = col_lens[mid_ix]\n # adjoin adds one\n adj_dif -= (col_len + 1)\n col_lens = col_lens.drop(mid_ix)\n n_cols = len(col_lens)\n # subtract index column\n max_cols_adj = n_cols - self.index\n # GH-21180. 
Ensure that we print at least two.\n max_cols_adj = max(max_cols_adj, 2)\n self.max_cols_adj = max_cols_adj\n\n # Call again _chk_truncate to cut frame appropriately\n # and then generate string representation\n self._chk_truncate()\n strcols = self._to_str_columns()\n text = self.adj.adjoin(1, *strcols)\n self.buf.writelines(text)\n\n if self.should_show_dimensions:\n self.buf.write(\"\\n\\n[{nrows} rows x {ncols} columns]\"\n .format(nrows=len(frame), ncols=len(frame.columns)))", "response": "Render a DataFrame to a console - friendly tabular output."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef to_latex(self, column_format=None, longtable=False, encoding=None,\n multicolumn=False, multicolumn_format=None, multirow=False):\n \"\"\"\n Render a DataFrame to a LaTeX tabular/longtable environment output.\n \"\"\"\n\n from pandas.io.formats.latex import LatexFormatter\n latex_renderer = LatexFormatter(self, column_format=column_format,\n longtable=longtable,\n multicolumn=multicolumn,\n multicolumn_format=multicolumn_format,\n multirow=multirow)\n\n if encoding is None:\n encoding = 'utf-8'\n\n if hasattr(self.buf, 'write'):\n latex_renderer.write_result(self.buf)\n elif isinstance(self.buf, str):\n import codecs\n with codecs.open(self.buf, 'w', encoding=encoding) as f:\n latex_renderer.write_result(f)\n else:\n raise TypeError('buf is not a file name and it has no write '\n 'method')", "response": "Render a DataFrame to a LaTeX tabular or longtable environment output."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef to_html(self, classes=None, notebook=False, border=None):\n from pandas.io.formats.html import HTMLFormatter, NotebookFormatter\n Klass = NotebookFormatter if notebook else HTMLFormatter\n html = Klass(self, classes=classes, border=border).render()\n if hasattr(self.buf, 'write'):\n buffer_put_lines(self.buf, html)\n elif isinstance(self.buf, str):\n with 
open(self.buf, 'w') as f:\n buffer_put_lines(f, html)\n else:\n raise TypeError('buf is not a file name and it has no write '\n ' method')", "response": "Render a DataFrame to a html table."} {"instruction": "Make a summary of the following Python 3 code\ndef _value_formatter(self, float_format=None, threshold=None):\n\n # the float_format parameter supersedes self.float_format\n if float_format is None:\n float_format = self.float_format\n\n # we are going to compose different functions, to first convert to\n # a string, then replace the decimal symbol, and finally chop according\n # to the threshold\n\n # when there is no float_format, we use str instead of '%g'\n # because str(0.0) = '0.0' while '%g' % 0.0 = '0'\n if float_format:\n def base_formatter(v):\n return float_format(value=v) if notna(v) else self.na_rep\n else:\n def base_formatter(v):\n return str(v) if notna(v) else self.na_rep\n\n if self.decimal != '.':\n def decimal_formatter(v):\n return base_formatter(v).replace('.', self.decimal, 1)\n else:\n decimal_formatter = base_formatter\n\n if threshold is None:\n return decimal_formatter\n\n def formatter(value):\n if notna(value):\n if abs(value) > threshold:\n return decimal_formatter(value)\n else:\n return decimal_formatter(0.0)\n else:\n return self.na_rep\n\n return formatter", "response": "Returns a function to be applied on each value to format it\n "} {"instruction": "Write a Python 3 script to\nreturn the values converted into strings using the parameters given at initialisation as a numpy array", "response": "def get_result_as_array(self):\n \"\"\"\n Returns the float values converted into strings using\n the parameters given at initialisation, as a numpy array\n \"\"\"\n\n if self.formatter is not None:\n return np.array([self.formatter(x) for x in self.values])\n\n if self.fixed_width:\n threshold = get_option(\"display.chop_threshold\")\n else:\n threshold = None\n\n # if we have a fixed_width, we'll need to try different float_format\n 
def format_values_with(float_format):\n formatter = self._value_formatter(float_format, threshold)\n\n # default formatter leaves a space to the left when formatting\n # floats, must be consistent for left-justifying NaNs (GH #25061)\n if self.justify == 'left':\n na_rep = ' ' + self.na_rep\n else:\n na_rep = self.na_rep\n\n # separate the wheat from the chaff\n values = self.values\n is_complex = is_complex_dtype(values)\n mask = isna(values)\n if hasattr(values, 'to_dense'): # sparse numpy ndarray\n values = values.to_dense()\n values = np.array(values, dtype='object')\n values[mask] = na_rep\n imask = (~mask).ravel()\n values.flat[imask] = np.array([formatter(val)\n for val in values.ravel()[imask]])\n\n if self.fixed_width:\n if is_complex:\n return _trim_zeros_complex(values, na_rep)\n else:\n return _trim_zeros_float(values, na_rep)\n\n return values\n\n # There is a special default string when we are fixed-width\n # The default is otherwise to use str instead of a formatting string\n if self.float_format is None:\n if self.fixed_width:\n float_format = partial('{value: .{digits:d}f}'.format,\n digits=self.digits)\n else:\n float_format = self.float_format\n else:\n float_format = lambda value: self.float_format % value\n\n formatted_values = format_values_with(float_format)\n\n if not self.fixed_width:\n return formatted_values\n\n # we need do convert to engineering format if some values are too small\n # and would appear as 0, or if some values are too big and take too\n # much space\n\n if len(formatted_values) > 0:\n maxlen = max(len(x) for x in formatted_values)\n too_long = maxlen > self.digits + 6\n else:\n too_long = False\n\n with np.errstate(invalid='ignore'):\n abs_vals = np.abs(self.values)\n # this is pretty arbitrary for now\n # large values: more that 8 characters including decimal symbol\n # and first digit, hence > 1e6\n has_large_values = (abs_vals > 1e6).any()\n has_small_values = ((abs_vals < 10**(-self.digits)) &\n (abs_vals > 
0)).any()\n\n if has_small_values or (too_long and has_large_values):\n float_format = partial('{value: .{digits:d}e}'.format,\n digits=self.digits)\n formatted_values = format_values_with(float_format)\n\n return formatted_values"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _format_strings(self):\n\n values = self.values\n\n if not isinstance(values, DatetimeIndex):\n values = DatetimeIndex(values)\n\n if self.formatter is not None and callable(self.formatter):\n return [self.formatter(x) for x in values]\n\n fmt_values = format_array_from_datetime(\n values.asi8.ravel(),\n format=_get_format_datetime64_from_values(values,\n self.date_format),\n na_rep=self.nat_rep).reshape(values.shape)\n return fmt_values.tolist()", "response": "return a list of strings that can be used to format the ISO - 8601 data for the current object"} {"instruction": "Make a summary of the following Python 3 code\ndef _format_strings(self):\n\n values = self.values.astype(object)\n is_dates_only = _is_dates_only(values)\n formatter = (self.formatter or\n _get_format_datetime64(is_dates_only,\n date_format=self.date_format))\n fmt_values = [formatter(x) for x in values]\n\n return fmt_values", "response": "return a list of strings that can be used to format the TZ"} {"instruction": "Implement a Python 3 function for\ngiving an Interval or IntervalIndex return the corresponding interval with closed bounds.", "response": "def _get_interval_closed_bounds(interval):\n \"\"\"\n Given an Interval or IntervalIndex, return the corresponding interval with\n closed bounds.\n \"\"\"\n left, right = interval.left, interval.right\n if interval.open_left:\n left = _get_next_label(left)\n if interval.open_right:\n right = _get_prev_label(right)\n return left, right"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _is_valid_endpoint(endpoint):\n return any([is_number(endpoint),\n isinstance(endpoint, Timestamp),\n 
isinstance(endpoint, Timedelta),\n endpoint is None])", "response": "helper for interval_range to check if start and end are valid types"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef save(self):\n # GH21227 internal compression is not used when file-like passed.\n if self.compression and hasattr(self.path_or_buf, 'write'):\n msg = (\"compression has no effect when passing file-like \"\n \"object as input.\")\n warnings.warn(msg, RuntimeWarning, stacklevel=2)\n\n # when zip compression is called.\n is_zip = isinstance(self.path_or_buf, ZipFile) or (\n not hasattr(self.path_or_buf, 'write')\n and self.compression == 'zip')\n\n if is_zip:\n # zipfile doesn't support writing string to archive. uses string\n # buffer to receive csv writing and dump into zip compression\n # file handle. GH21241, GH21118\n f = StringIO()\n close = False\n elif hasattr(self.path_or_buf, 'write'):\n f = self.path_or_buf\n close = False\n else:\n f, handles = _get_handle(self.path_or_buf, self.mode,\n encoding=self.encoding,\n compression=self.compression)\n close = True\n\n try:\n writer_kwargs = dict(lineterminator=self.line_terminator,\n delimiter=self.sep, quoting=self.quoting,\n doublequote=self.doublequote,\n escapechar=self.escapechar,\n quotechar=self.quotechar)\n if self.encoding == 'ascii':\n self.writer = csvlib.writer(f, **writer_kwargs)\n else:\n writer_kwargs['encoding'] = self.encoding\n self.writer = UnicodeWriter(f, **writer_kwargs)\n\n self._save()\n\n finally:\n if is_zip:\n # GH17778 handles zip compression separately.\n buf = f.getvalue()\n if hasattr(self.path_or_buf, 'write'):\n self.path_or_buf.write(buf)\n else:\n f, handles = _get_handle(self.path_or_buf, self.mode,\n encoding=self.encoding,\n compression=self.compression)\n f.write(buf)\n close = True\n if close:\n f.close()\n for _fh in handles:\n _fh.close()", "response": "Save the object to a file."} {"instruction": "How would you code a function in Python 3 
to\nadd delegated names to a class using a class decorator. This provides an alternative usage to directly calling `_add_delegate_accessors` below a class definition. Parameters ---------- delegate : object the class to get methods/properties & doc-strings accessors : Sequence[str] List of accessor to add typ : {'property', 'method'} overwrite : boolean, default False overwrite the method/property in the target class if it exists Returns ------- callable A class decorator. Examples -------- @delegate_names(Categorical, [\"categories\", \"ordered\"], \"property\") class CategoricalAccessor(PandasDelegate): [...]", "response": "def delegate_names(delegate, accessors, typ, overwrite=False):\n \"\"\"\n Add delegated names to a class using a class decorator. This provides\n an alternative usage to directly calling `_add_delegate_accessors`\n below a class definition.\n\n Parameters\n ----------\n delegate : object\n the class to get methods/properties & doc-strings\n accessors : Sequence[str]\n List of accessor to add\n typ : {'property', 'method'}\n overwrite : boolean, default False\n overwrite the method/property in the target class if it exists\n\n Returns\n -------\n callable\n A class decorator.\n\n Examples\n --------\n @delegate_names(Categorical, [\"categories\", \"ordered\"], \"property\")\n class CategoricalAccessor(PandasDelegate):\n [...]\n \"\"\"\n def add_delegate_accessors(cls):\n cls._add_delegate_accessors(delegate, accessors, typ,\n overwrite=overwrite)\n return cls\n\n return add_delegate_accessors"} {"instruction": "Write a Python 3 script to\nreturn a set of additional __dir__ for this object.", "response": "def _dir_additions(self):\n \"\"\"\n Add additional __dir__ for this object.\n \"\"\"\n rv = set()\n for accessor in self._accessors:\n try:\n getattr(self, accessor)\n rv.add(accessor)\n except AttributeError:\n pass\n return rv"} {"instruction": "Can you write a function in Python 3 where it\nadds the methods and properties to the class from 
the delegate class.", "response": "def _add_delegate_accessors(cls, delegate, accessors, typ,\n overwrite=False):\n \"\"\"\n Add accessors to cls from the delegate class.\n\n Parameters\n ----------\n cls : the class to add the methods/properties to\n delegate : the class to get methods/properties & doc-strings\n accessors : string list of accessors to add\n typ : 'property' or 'method'\n overwrite : boolean, default False\n overwrite the method/property in the target class if it exists.\n \"\"\"\n\n def _create_delegator_property(name):\n\n def _getter(self):\n return self._delegate_property_get(name)\n\n def _setter(self, new_values):\n return self._delegate_property_set(name, new_values)\n\n _getter.__name__ = name\n _setter.__name__ = name\n\n return property(fget=_getter, fset=_setter,\n doc=getattr(delegate, name).__doc__)\n\n def _create_delegator_method(name):\n\n def f(self, *args, **kwargs):\n return self._delegate_method(name, *args, **kwargs)\n\n f.__name__ = name\n f.__doc__ = getattr(delegate, name).__doc__\n\n return f\n\n for name in accessors:\n\n if typ == 'property':\n f = _create_delegator_property(name)\n else:\n f = _create_delegator_method(name)\n\n # don't overwrite existing methods/properties\n if overwrite or not hasattr(cls, name):\n setattr(cls, name, f)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _evaluate_standard(op, op_str, a, b, **eval_kwargs):\n if _TEST_MODE:\n _store_test_result(False)\n with np.errstate(all='ignore'):\n return op(a, b)", "response": "Standard evaluation for the given operator."} {"instruction": "How would you code a function in Python 3 to\nreturn a boolean if we WILL be using numexpr", "response": "def _can_use_numexpr(op, op_str, a, b, dtype_check):\n \"\"\" return a boolean if we WILL be using numexpr \"\"\"\n if op_str is not None:\n\n # required min elements (otherwise we are adding overhead)\n if np.prod(a.shape) > _MIN_ELEMENTS:\n\n # check for dtype compatibility\n 
dtypes = set()\n for o in [a, b]:\n if hasattr(o, 'get_dtype_counts'):\n s = o.get_dtype_counts()\n if len(s) > 1:\n return False\n dtypes |= set(s.index)\n elif isinstance(o, np.ndarray):\n dtypes |= {o.dtype.name}\n\n # allowed are a superset\n if not len(dtypes) or _ALLOWED_DTYPES[dtype_check] >= dtypes:\n return True\n\n return False"} {"instruction": "Create a Python 3 function for\nevaluating and return the expression of the op on a and b", "response": "def evaluate(op, op_str, a, b, use_numexpr=True,\n **eval_kwargs):\n \"\"\" evaluate and return the expression of the op on a and b\n\n Parameters\n ----------\n\n op : the actual operand\n op_str: the string version of the op\n a : left operand\n b : right operand\n use_numexpr : whether to try to use numexpr (default True)\n \"\"\"\n\n use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b)\n if use_numexpr:\n return _evaluate(op, op_str, a, b, **eval_kwargs)\n return _evaluate_standard(op, op_str, a, b)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef where(cond, a, b, use_numexpr=True):\n\n if use_numexpr:\n return _where(cond, a, b)\n return _where_standard(cond, a, b)", "response": "evaluate the where condition cond on a and b"} {"instruction": "Implement a Python 3 function for\nwriting the DataFrame to an Excel file.", "response": "def write(self, writer, sheet_name='Sheet1', startrow=0,\n startcol=0, freeze_panes=None, engine=None):\n \"\"\"\n writer : string or ExcelWriter object\n File path or existing ExcelWriter\n sheet_name : string, default 'Sheet1'\n Name of sheet which will contain DataFrame\n startrow :\n upper left cell row to dump data frame\n startcol :\n upper left cell column to dump data frame\n freeze_panes : tuple of integer (length 2), default None\n Specifies the one-based bottommost row and rightmost column that\n is to be frozen\n engine : string, default None\n write engine to use if writer is a path - you can also set this\n via the 
options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``,\n and ``io.excel.xlsm.writer``.\n \"\"\"\n from pandas.io.excel import ExcelWriter\n from pandas.io.common import _stringify_path\n\n if isinstance(writer, ExcelWriter):\n need_save = False\n else:\n writer = ExcelWriter(_stringify_path(writer), engine=engine)\n need_save = True\n\n formatted_cells = self.get_formatted_cells()\n writer.write_cells(formatted_cells, sheet_name,\n startrow=startrow, startcol=startcol,\n freeze_panes=freeze_panes)\n if need_save:\n writer.save()"} {"instruction": "How would you code a function in Python 3 to\nwrite a DataFrame to the feather - format.", "response": "def to_feather(df, path):\n \"\"\"\n Write a DataFrame to the feather-format\n\n Parameters\n ----------\n df : DataFrame\n path : string file path, or file-like object\n\n \"\"\"\n path = _stringify_path(path)\n if not isinstance(df, DataFrame):\n raise ValueError(\"feather only support IO with DataFrames\")\n\n feather = _try_import()[0]\n valid_types = {'string', 'unicode'}\n\n # validate index\n # --------------\n\n # validate that we have only a default index\n # raise on anything else as we don't serialize the index\n\n if not isinstance(df.index, Int64Index):\n raise ValueError(\"feather does not support serializing {} \"\n \"for the index; you can .reset_index()\"\n \"to make the index into column(s)\".format(\n type(df.index)))\n\n if not df.index.equals(RangeIndex.from_range(range(len(df)))):\n raise ValueError(\"feather does not support serializing a \"\n \"non-default index for the index; you \"\n \"can .reset_index() to make the index \"\n \"into column(s)\")\n\n if df.index.name is not None:\n raise ValueError(\"feather does not serialize index meta-data on a \"\n \"default index\")\n\n # validate columns\n # ----------------\n\n # must have value column names (strings only)\n if df.columns.inferred_type not in valid_types:\n raise ValueError(\"feather must have string column names\")\n\n 
feather.write_feather(df, path)"} {"instruction": "How would you explain what the following Python 3 function does\ndef read_feather(path, columns=None, use_threads=True):\n\n feather, pyarrow = _try_import()\n path = _stringify_path(path)\n\n if LooseVersion(pyarrow.__version__) < LooseVersion('0.11.0'):\n int_use_threads = int(use_threads)\n if int_use_threads < 1:\n int_use_threads = 1\n return feather.read_feather(path, columns=columns,\n nthreads=int_use_threads)\n\n return feather.read_feather(path, columns=columns,\n use_threads=bool(use_threads))", "response": "Load a feather - format object from a file path."} {"instruction": "Write a Python 3 function that can\ngenerate a regular range of dates.", "response": "def generate_regular_range(start, end, periods, freq):\n \"\"\"\n Generate a range of dates with the spans between dates described by\n the given `freq` DateOffset.\n\n Parameters\n ----------\n start : Timestamp or None\n first point of produced date range\n end : Timestamp or None\n last point of produced date range\n periods : int\n number of periods in produced date range\n freq : DateOffset\n describes space between dates in produced date range\n\n Returns\n -------\n ndarray[np.int64] representing nanosecond unix timestamps\n \"\"\"\n if isinstance(freq, Tick):\n stride = freq.nanos\n if periods is None:\n b = Timestamp(start).value\n # cannot just use e = Timestamp(end) + 1 because arange breaks when\n # stride is too large, see GH10887\n e = (b + (Timestamp(end).value - b) // stride * stride +\n stride // 2 + 1)\n # end.tz == start.tz by this point due to _generate implementation\n tz = start.tz\n elif start is not None:\n b = Timestamp(start).value\n e = _generate_range_overflow_safe(b, periods, stride, side='start')\n tz = start.tz\n elif end is not None:\n e = Timestamp(end).value + stride\n b = _generate_range_overflow_safe(e, periods, stride, side='end')\n tz = end.tz\n else:\n raise ValueError(\"at least 'start' or 'end' should be 
specified \"\n \"if a 'period' is given.\")\n\n with np.errstate(over=\"raise\"):\n # If the range is sufficiently large, np.arange may overflow\n # and incorrectly return an empty array if not caught.\n try:\n values = np.arange(b, e, stride, dtype=np.int64)\n except FloatingPointError:\n xdr = [b]\n while xdr[-1] != e:\n xdr.append(xdr[-1] + stride)\n values = np.array(xdr[:-1], dtype=np.int64)\n\n else:\n tz = None\n # start and end should have the same timezone by this point\n if start is not None:\n tz = start.tz\n elif end is not None:\n tz = end.tz\n\n xdr = generate_range(start=start, end=end,\n periods=periods, offset=freq)\n\n values = np.array([x.value for x in xdr], dtype=np.int64)\n\n return values, tz"} {"instruction": "How would you implement a function in Python 3 that\ngenerates a new range of the current time - domain.", "response": "def _generate_range_overflow_safe(endpoint, periods, stride, side='start'):\n \"\"\"\n Calculate the second endpoint for passing to np.arange, checking\n to avoid an integer overflow. 
Catch OverflowError and re-raise\n as OutOfBoundsDatetime.\n\n Parameters\n ----------\n endpoint : int\n nanosecond timestamp of the known endpoint of the desired range\n periods : int\n number of periods in the desired range\n stride : int\n nanoseconds between periods in the desired range\n side : {'start', 'end'}\n which end of the range `endpoint` refers to\n\n Returns\n -------\n other_end : int\n\n Raises\n ------\n OutOfBoundsDatetime\n \"\"\"\n # GH#14187 raise instead of incorrectly wrapping around\n assert side in ['start', 'end']\n\n i64max = np.uint64(np.iinfo(np.int64).max)\n msg = ('Cannot generate range with {side}={endpoint} and '\n 'periods={periods}'\n .format(side=side, endpoint=endpoint, periods=periods))\n\n with np.errstate(over=\"raise\"):\n # if periods * strides cannot be multiplied within the *uint64* bounds,\n # we cannot salvage the operation by recursing, so raise\n try:\n addend = np.uint64(periods) * np.uint64(np.abs(stride))\n except FloatingPointError:\n raise OutOfBoundsDatetime(msg)\n\n if np.abs(addend) <= i64max:\n # relatively easy case without casting concerns\n return _generate_range_overflow_safe_signed(\n endpoint, periods, stride, side)\n\n elif ((endpoint > 0 and side == 'start' and stride > 0) or\n (endpoint < 0 and side == 'end' and stride > 0)):\n # no chance of not-overflowing\n raise OutOfBoundsDatetime(msg)\n\n elif (side == 'end' and endpoint > i64max and endpoint - stride <= i64max):\n # in _generate_regular_range we added `stride` thereby overflowing\n # the bounds. 
Adjust to fix this.\n return _generate_range_overflow_safe(endpoint - stride,\n periods - 1, stride, side)\n\n # split into smaller pieces\n mid_periods = periods // 2\n remaining = periods - mid_periods\n assert 0 < remaining < periods, (remaining, periods, endpoint, stride)\n\n midpoint = _generate_range_overflow_safe(endpoint, mid_periods,\n stride, side)\n return _generate_range_overflow_safe(midpoint, remaining, stride, side)"} {"instruction": "Can you implement a function in Python 3 that\nchecks to see if we can set a locale and subsequently get the locale without raising an Exception.", "response": "def can_set_locale(lc, lc_var=locale.LC_ALL):\n \"\"\"\n Check to see if we can set a locale, and subsequently get the locale,\n without raising an Exception.\n\n Parameters\n ----------\n lc : str\n The locale to attempt to set.\n lc_var : int, default `locale.LC_ALL`\n The category of the locale being set.\n\n Returns\n -------\n is_valid : bool\n Whether the passed locale can be set\n \"\"\"\n\n try:\n with set_locale(lc, lc_var=lc_var):\n pass\n except (ValueError, locale.Error):\n # horrible name for a Exception subclass\n return False\n else:\n return True"} {"instruction": "Write a Python 3 script for\nreturning a list of normalized locales that do not throw an Exception when set.", "response": "def _valid_locales(locales, normalize):\n \"\"\"\n Return a list of normalized locales that do not throw an ``Exception``\n when set.\n\n Parameters\n ----------\n locales : str\n A string where each locale is separated by a newline.\n normalize : bool\n Whether to call ``locale.normalize`` on each locale.\n\n Returns\n -------\n valid_locales : list\n A list of valid locales.\n \"\"\"\n if normalize:\n normalizer = lambda x: locale.normalize(x.strip())\n else:\n normalizer = lambda x: x.strip()\n\n return list(filter(can_set_locale, map(normalizer, locales)))"} {"instruction": "Can you implement a function in Python 3 that\ngets all the available locales for a 
given prefix.", "response": "def get_locales(prefix=None, normalize=True,\n locale_getter=_default_locale_getter):\n \"\"\"\n Get all the locales that are available on the system.\n\n Parameters\n ----------\n prefix : str\n If not ``None`` then return only those locales with the prefix\n provided. For example to get all English language locales (those that\n start with ``\"en\"``), pass ``prefix=\"en\"``.\n normalize : bool\n Call ``locale.normalize`` on the resulting list of available locales.\n If ``True``, only locales that can be set without throwing an\n ``Exception`` are returned.\n locale_getter : callable\n The function to use to retrieve the current locales. This should return\n a string with each locale separated by a newline character.\n\n Returns\n -------\n locales : list of strings\n A list of locale strings that can be set with ``locale.setlocale()``.\n For example::\n\n locale.setlocale(locale.LC_ALL, locale_string)\n\n On error will return None (no locale available, e.g. 
Windows)\n\n \"\"\"\n try:\n raw_locales = locale_getter()\n except Exception:\n return None\n\n try:\n # raw_locales is \"\\n\" separated list of locales\n # it may contain non-decodable parts, so split\n # extract what we can and then rejoin.\n raw_locales = raw_locales.split(b'\\n')\n out_locales = []\n for x in raw_locales:\n out_locales.append(str(\n x, encoding=options.display.encoding))\n\n except TypeError:\n pass\n\n if prefix is None:\n return _valid_locales(out_locales, normalize)\n\n pattern = re.compile('{prefix}.*'.format(prefix=prefix))\n found = pattern.findall('\\n'.join(out_locales))\n return _valid_locales(found, normalize)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef ensure_float(arr):\n\n if issubclass(arr.dtype.type, (np.integer, np.bool_)):\n arr = arr.astype(float)\n return arr", "response": "Ensures that an array object has a float dtype if possible."} {"instruction": "Can you implement a function in Python 3 that\nensures that an array - like object is a Categorical.", "response": "def ensure_categorical(arr):\n \"\"\"\n Ensure that an array-like object is a Categorical (if not already).\n\n Parameters\n ----------\n arr : array-like\n The array that we want to convert into a Categorical.\n\n Returns\n -------\n cat_arr : The original array cast as a Categorical. 
If it already\n is a Categorical, we return as is.\n \"\"\"\n\n if not is_categorical(arr):\n from pandas import Categorical\n arr = Categorical(arr)\n return arr"} {"instruction": "Write a Python 3 function for\nensuring that an array of some integer dtype has an int64 dtype if possible.", "response": "def ensure_int64_or_float64(arr, copy=False):\n \"\"\"\n Ensure that an dtype array of some integer dtype\n has an int64 dtype if possible\n If it's not possible, potentially because of overflow,\n convert the array to float64 instead.\n\n Parameters\n ----------\n arr : array-like\n The array whose data type we want to enforce.\n copy: boolean\n Whether to copy the original array or reuse\n it in place, if possible.\n\n Returns\n -------\n out_arr : The input array cast as int64 if\n possible without overflow.\n Otherwise the input array cast to float64.\n \"\"\"\n try:\n return arr.astype('int64', copy=copy, casting='safe')\n except TypeError:\n return arr.astype('float64', copy=copy)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef classes_and_not_datetimelike(*klasses):\n return lambda tipo: (issubclass(tipo, klasses) and\n not issubclass(tipo, (np.datetime64, np.timedelta64)))", "response": "Evaluate if the tipo is a subclass of the klasses\n and not a datetimelike."} {"instruction": "Make a summary of the following Python 3 code\ndef is_sparse(arr):\n from pandas.core.arrays.sparse import SparseDtype\n\n dtype = getattr(arr, 'dtype', arr)\n return isinstance(dtype, SparseDtype)", "response": "Checks whether an array - like is a 1 - D pandas sparse array."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef is_scipy_sparse(arr):\n\n global _is_scipy_sparse\n\n if _is_scipy_sparse is None:\n try:\n from scipy.sparse import issparse as _is_scipy_sparse\n except ImportError:\n _is_scipy_sparse = lambda _: False\n\n return _is_scipy_sparse(arr)", "response": "Check whether an array - 
like is a scipy. sparse. spmatrix instance."} {"instruction": "Write a Python 3 script for\nchecking if an object is a DateOffset or a list - like of DatetOffsets.", "response": "def is_offsetlike(arr_or_obj):\n \"\"\"\n Check if obj or all elements of list-like is DateOffset\n\n Parameters\n ----------\n arr_or_obj : object\n\n Returns\n -------\n boolean\n Whether the object is a DateOffset or listlike of DatetOffsets\n\n Examples\n --------\n >>> is_offsetlike(pd.DateOffset(days=1))\n True\n >>> is_offsetlike('offset')\n False\n >>> is_offsetlike([pd.offsets.Minute(4), pd.offsets.MonthEnd()])\n True\n >>> is_offsetlike(np.array([pd.DateOffset(months=3), pd.Timestamp.now()]))\n False\n \"\"\"\n if isinstance(arr_or_obj, ABCDateOffset):\n return True\n elif (is_list_like(arr_or_obj) and len(arr_or_obj) and\n is_object_dtype(arr_or_obj)):\n return all(isinstance(x, ABCDateOffset) for x in arr_or_obj)\n return False"} {"instruction": "Create a Python 3 function for\nchecking whether an array - like is a periodical index.", "response": "def is_period(arr):\n \"\"\"\n Check whether an array-like is a periodical index.\n\n .. deprecated:: 0.24.0\n\n Parameters\n ----------\n arr : array-like\n The array-like to check.\n\n Returns\n -------\n boolean\n Whether or not the array-like is a periodical index.\n\n Examples\n --------\n >>> is_period([1, 2, 3])\n False\n >>> is_period(pd.Index([1, 2, 3]))\n False\n >>> is_period(pd.PeriodIndex([\"2017-01-01\"], freq=\"D\"))\n True\n \"\"\"\n\n warnings.warn(\"'is_period' is deprecated and will be removed in a future \"\n \"version. 
Use 'is_period_dtype' or is_period_arraylike' \"\n \"instead.\", FutureWarning, stacklevel=2)\n\n return isinstance(arr, ABCPeriodIndex) or is_period_arraylike(arr)"} {"instruction": "How would you explain what the following Python 3 function does\ndef is_string_dtype(arr_or_dtype):\n\n # TODO: gh-15585: consider making the checks stricter.\n def condition(dtype):\n return dtype.kind in ('O', 'S', 'U') and not is_period_dtype(dtype)\n return _is_dtype(arr_or_dtype, condition)", "response": "Checks whether the provided array or dtype is of the string dtype."} {"instruction": "Create a Python 3 function to\ncheck whether an array - like is a periodical array - like or PeriodIndex instance.", "response": "def is_period_arraylike(arr):\n \"\"\"\n Check whether an array-like is a periodical array-like or PeriodIndex.\n\n Parameters\n ----------\n arr : array-like\n The array-like to check.\n\n Returns\n -------\n boolean\n Whether or not the array-like is a periodical array-like or\n PeriodIndex instance.\n\n Examples\n --------\n >>> is_period_arraylike([1, 2, 3])\n False\n >>> is_period_arraylike(pd.Index([1, 2, 3]))\n False\n >>> is_period_arraylike(pd.PeriodIndex([\"2017-01-01\"], freq=\"D\"))\n True\n \"\"\"\n\n if isinstance(arr, (ABCPeriodIndex, ABCPeriodArray)):\n return True\n elif isinstance(arr, (np.ndarray, ABCSeries)):\n return is_period_dtype(arr.dtype)\n return getattr(arr, 'inferred_type', None) == 'period'"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef is_datetime_arraylike(arr):\n\n if isinstance(arr, ABCDatetimeIndex):\n return True\n elif isinstance(arr, (np.ndarray, ABCSeries)):\n return (is_object_dtype(arr.dtype)\n and lib.infer_dtype(arr, skipna=False) == 'datetime')\n return getattr(arr, 'inferred_type', None) == 'datetime'", "response": "Checks whether an array - like is a datetime array - like or DatetimeIndex."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef 
is_datetimelike(arr):\n\n return (is_datetime64_dtype(arr) or is_datetime64tz_dtype(arr) or\n is_timedelta64_dtype(arr) or\n isinstance(arr, ABCPeriodIndex))", "response": "Checks whether an array - like is a datetime - like array - like."} {"instruction": "Create a Python 3 function to\ncheck if two dtypes are equal.", "response": "def is_dtype_equal(source, target):\n \"\"\"\n Check if two dtypes are equal.\n\n Parameters\n ----------\n source : The first dtype to compare\n target : The second dtype to compare\n\n Returns\n ----------\n boolean\n Whether or not the two dtypes are equal.\n\n Examples\n --------\n >>> is_dtype_equal(int, float)\n False\n >>> is_dtype_equal(\"int\", int)\n True\n >>> is_dtype_equal(object, \"category\")\n False\n >>> is_dtype_equal(CategoricalDtype(), \"category\")\n True\n >>> is_dtype_equal(DatetimeTZDtype(), \"datetime64\")\n False\n \"\"\"\n\n try:\n source = _get_dtype(source)\n target = _get_dtype(target)\n return source == target\n except (TypeError, AttributeError):\n\n # invalid comparison\n # object == category will hit this\n return False"} {"instruction": "Write a Python 3 function for\nchecking whether two arrays have compatible dtypes to do a union.", "response": "def is_dtype_union_equal(source, target):\n \"\"\"\n Check whether two arrays have compatible dtypes to do a union.\n numpy types are checked with ``is_dtype_equal``. Extension types are\n checked separately.\n\n Parameters\n ----------\n source : The first dtype to compare\n target : The second dtype to compare\n\n Returns\n ----------\n boolean\n Whether or not the two dtypes are equal.\n\n >>> is_dtype_equal(\"int\", int)\n True\n\n >>> is_dtype_equal(CategoricalDtype(['a', 'b'],\n ... CategoricalDtype(['b', 'c']))\n True\n\n >>> is_dtype_equal(CategoricalDtype(['a', 'b'],\n ... 
CategoricalDtype(['b', 'c'], ordered=True))\n False\n \"\"\"\n source = _get_dtype(source)\n target = _get_dtype(target)\n if is_categorical_dtype(source) and is_categorical_dtype(target):\n # ordered False for both\n return source.ordered is target.ordered\n return is_dtype_equal(source, target)"} {"instruction": "Make a summary of the following Python 3 code\ndef is_datetime64_ns_dtype(arr_or_dtype):\n\n if arr_or_dtype is None:\n return False\n try:\n tipo = _get_dtype(arr_or_dtype)\n except TypeError:\n if is_datetime64tz_dtype(arr_or_dtype):\n tipo = _get_dtype(arr_or_dtype.dtype)\n else:\n return False\n return tipo == _NS_DTYPE or getattr(tipo, 'base', None) == _NS_DTYPE", "response": "Checks whether the provided array or dtype is of the datetime64 [ ns ] dtype."} {"instruction": "Implement a function in Python 3 to\ncheck if a string - like object to a numeric array.", "response": "def is_numeric_v_string_like(a, b):\n \"\"\"\n Check if we are comparing a string-like object to a numeric ndarray.\n\n NumPy doesn't like to compare such objects, especially numeric arrays\n and scalar string-likes.\n\n Parameters\n ----------\n a : array-like, scalar\n The first object to check.\n b : array-like, scalar\n The second object to check.\n\n Returns\n -------\n boolean\n Whether we return a comparing a string-like object to a numeric array.\n\n Examples\n --------\n >>> is_numeric_v_string_like(1, 1)\n False\n >>> is_numeric_v_string_like(\"foo\", \"foo\")\n False\n >>> is_numeric_v_string_like(1, \"foo\") # non-array numeric\n False\n >>> is_numeric_v_string_like(np.array([1]), \"foo\")\n True\n >>> is_numeric_v_string_like(\"foo\", np.array([1])) # symmetric check\n True\n >>> is_numeric_v_string_like(np.array([1, 2]), np.array([\"foo\"]))\n True\n >>> is_numeric_v_string_like(np.array([\"foo\"]), np.array([1, 2]))\n True\n >>> is_numeric_v_string_like(np.array([1]), np.array([2]))\n False\n >>> is_numeric_v_string_like(np.array([\"foo\"]), np.array([\"foo\"]))\n 
False\n \"\"\"\n\n is_a_array = isinstance(a, np.ndarray)\n is_b_array = isinstance(b, np.ndarray)\n\n is_a_numeric_array = is_a_array and is_numeric_dtype(a)\n is_b_numeric_array = is_b_array and is_numeric_dtype(b)\n is_a_string_array = is_a_array and is_string_like_dtype(a)\n is_b_string_array = is_b_array and is_string_like_dtype(b)\n\n is_a_scalar_string_like = not is_a_array and is_string_like(a)\n is_b_scalar_string_like = not is_b_array and is_string_like(b)\n\n return ((is_a_numeric_array and is_b_scalar_string_like) or\n (is_b_numeric_array and is_a_scalar_string_like) or\n (is_a_numeric_array and is_b_string_array) or\n (is_b_numeric_array and is_a_string_array))"} {"instruction": "Implement a Python 3 function for\nchecking if a datetime - like object to a numeric object.", "response": "def is_datetimelike_v_numeric(a, b):\n \"\"\"\n Check if we are comparing a datetime-like object to a numeric object.\n\n By \"numeric,\" we mean an object that is either of an int or float dtype.\n\n Parameters\n ----------\n a : array-like, scalar\n The first object to check.\n b : array-like, scalar\n The second object to check.\n\n Returns\n -------\n boolean\n Whether we return a comparing a datetime-like to a numeric object.\n\n Examples\n --------\n >>> dt = np.datetime64(pd.datetime(2017, 1, 1))\n >>>\n >>> is_datetimelike_v_numeric(1, 1)\n False\n >>> is_datetimelike_v_numeric(dt, dt)\n False\n >>> is_datetimelike_v_numeric(1, dt)\n True\n >>> is_datetimelike_v_numeric(dt, 1) # symmetric check\n True\n >>> is_datetimelike_v_numeric(np.array([dt]), 1)\n True\n >>> is_datetimelike_v_numeric(np.array([1]), dt)\n True\n >>> is_datetimelike_v_numeric(np.array([dt]), np.array([1]))\n True\n >>> is_datetimelike_v_numeric(np.array([1]), np.array([2]))\n False\n >>> is_datetimelike_v_numeric(np.array([dt]), np.array([dt]))\n False\n \"\"\"\n\n if not hasattr(a, 'dtype'):\n a = np.asarray(a)\n if not hasattr(b, 'dtype'):\n b = np.asarray(b)\n\n def is_numeric(x):\n 
\"\"\"\n Check if an object has a numeric dtype (i.e. integer or float).\n \"\"\"\n return is_integer_dtype(x) or is_float_dtype(x)\n\n is_datetimelike = needs_i8_conversion\n return ((is_datetimelike(a) and is_numeric(b)) or\n (is_datetimelike(b) and is_numeric(a)))"} {"instruction": "Make a summary of the following Python 3 code\ndef is_datetimelike_v_object(a, b):\n\n if not hasattr(a, 'dtype'):\n a = np.asarray(a)\n if not hasattr(b, 'dtype'):\n b = np.asarray(b)\n\n is_datetimelike = needs_i8_conversion\n return ((is_datetimelike(a) and is_object_dtype(b)) or\n (is_datetimelike(b) and is_object_dtype(a)))", "response": "Checks if two objects are datetimelike."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef needs_i8_conversion(arr_or_dtype):\n\n if arr_or_dtype is None:\n return False\n return (is_datetime_or_timedelta_dtype(arr_or_dtype) or\n is_datetime64tz_dtype(arr_or_dtype) or\n is_period_dtype(arr_or_dtype))", "response": "Checks whether the array or dtype is of a datetime - like dtype and if so determines if the array or dtype needs conversion."} {"instruction": "Make a summary of the following Python 3 code\ndef is_bool_dtype(arr_or_dtype):\n if arr_or_dtype is None:\n return False\n try:\n dtype = _get_dtype(arr_or_dtype)\n except TypeError:\n return False\n\n if isinstance(arr_or_dtype, CategoricalDtype):\n arr_or_dtype = arr_or_dtype.categories\n # now we use the special definition for Index\n\n if isinstance(arr_or_dtype, ABCIndexClass):\n\n # TODO(jreback)\n # we don't have a boolean Index class\n # so its object, we need to infer to\n # guess this\n return (arr_or_dtype.is_object and\n arr_or_dtype.inferred_type == 'boolean')\n elif is_extension_array_dtype(arr_or_dtype):\n dtype = getattr(arr_or_dtype, 'dtype', arr_or_dtype)\n return dtype._is_boolean\n\n return issubclass(dtype.type, np.bool_)", "response": "Checks whether the provided array or dtype is of a boolean dtype."} {"instruction": "Create a Python 3 
function to\ncheck whether an array - like is of a pandas extension class instance.", "response": "def is_extension_type(arr):\n \"\"\"\n Check whether an array-like is of a pandas extension class instance.\n\n Extension classes include categoricals, pandas sparse objects (i.e.\n classes represented within the pandas library and not ones external\n to it like scipy sparse matrices), and datetime-like arrays.\n\n Parameters\n ----------\n arr : array-like\n The array-like to check.\n\n Returns\n -------\n boolean\n Whether or not the array-like is of a pandas extension class instance.\n\n Examples\n --------\n >>> is_extension_type([1, 2, 3])\n False\n >>> is_extension_type(np.array([1, 2, 3]))\n False\n >>>\n >>> cat = pd.Categorical([1, 2, 3])\n >>>\n >>> is_extension_type(cat)\n True\n >>> is_extension_type(pd.Series(cat))\n True\n >>> is_extension_type(pd.SparseArray([1, 2, 3]))\n True\n >>> is_extension_type(pd.SparseSeries([1, 2, 3]))\n True\n >>>\n >>> from scipy.sparse import bsr_matrix\n >>> is_extension_type(bsr_matrix([1, 2, 3]))\n False\n >>> is_extension_type(pd.DatetimeIndex([1, 2, 3]))\n False\n >>> is_extension_type(pd.DatetimeIndex([1, 2, 3], tz=\"US/Eastern\"))\n True\n >>>\n >>> dtype = DatetimeTZDtype(\"ns\", tz=\"US/Eastern\")\n >>> s = pd.Series([], dtype=dtype)\n >>> is_extension_type(s)\n True\n \"\"\"\n\n if is_categorical(arr):\n return True\n elif is_sparse(arr):\n return True\n elif is_datetime64tz_dtype(arr):\n return True\n return False"} {"instruction": "Can you implement a function in Python 3 that\nchecks if an object is an extension array type.", "response": "def is_extension_array_dtype(arr_or_dtype):\n \"\"\"\n Check if an object is a pandas extension array type.\n\n See the :ref:`Use Guide ` for more.\n\n Parameters\n ----------\n arr_or_dtype : object\n For array-like input, the ``.dtype`` attribute will\n be extracted.\n\n Returns\n -------\n bool\n Whether the `arr_or_dtype` is an extension array type.\n\n Notes\n -----\n This 
checks whether an object implements the pandas extension\n array interface. In pandas, this includes:\n\n * Categorical\n * Sparse\n * Interval\n * Period\n * DatetimeArray\n * TimedeltaArray\n\n Third-party libraries may implement arrays or types satisfying\n this interface as well.\n\n Examples\n --------\n >>> from pandas.api.types import is_extension_array_dtype\n >>> arr = pd.Categorical(['a', 'b'])\n >>> is_extension_array_dtype(arr)\n True\n >>> is_extension_array_dtype(arr.dtype)\n True\n\n >>> arr = np.array(['a', 'b'])\n >>> is_extension_array_dtype(arr.dtype)\n False\n \"\"\"\n dtype = getattr(arr_or_dtype, 'dtype', arr_or_dtype)\n return (isinstance(dtype, ExtensionDtype) or\n registry.find(dtype) is not None)"} {"instruction": "Write a Python 3 script to\nreturn a boolean if the condition is satisfied for the arr_or_dtype.", "response": "def _is_dtype(arr_or_dtype, condition):\n \"\"\"\n Return a boolean if the condition is satisfied for the arr_or_dtype.\n\n Parameters\n ----------\n arr_or_dtype : array-like, str, np.dtype, or ExtensionArrayType\n The array-like or dtype object whose dtype we want to extract.\n condition : callable[Union[np.dtype, ExtensionDtype]]\n\n Returns\n -------\n bool\n\n \"\"\"\n\n if arr_or_dtype is None:\n return False\n try:\n dtype = _get_dtype(arr_or_dtype)\n except (TypeError, ValueError, UnicodeEncodeError):\n return False\n return condition(dtype)"} {"instruction": "Implement a Python 3 function for\nreturning the dtype instance associated with an array - like object.", "response": "def _get_dtype(arr_or_dtype):\n \"\"\"\n Get the dtype instance associated with an array\n or dtype object.\n\n Parameters\n ----------\n arr_or_dtype : array-like\n The array-like or dtype object whose dtype we want to extract.\n\n Returns\n -------\n obj_dtype : The extract dtype instance from the\n passed in array or dtype object.\n\n Raises\n ------\n TypeError : The passed in object is None.\n \"\"\"\n\n if arr_or_dtype is None:\n 
raise TypeError(\"Cannot deduce dtype from null object\")\n\n # fastpath\n elif isinstance(arr_or_dtype, np.dtype):\n return arr_or_dtype\n elif isinstance(arr_or_dtype, type):\n return np.dtype(arr_or_dtype)\n\n # if we have an array-like\n elif hasattr(arr_or_dtype, 'dtype'):\n arr_or_dtype = arr_or_dtype.dtype\n\n return pandas_dtype(arr_or_dtype)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _is_dtype_type(arr_or_dtype, condition):\n\n if arr_or_dtype is None:\n return condition(type(None))\n\n # fastpath\n if isinstance(arr_or_dtype, np.dtype):\n return condition(arr_or_dtype.type)\n elif isinstance(arr_or_dtype, type):\n if issubclass(arr_or_dtype, (PandasExtensionDtype, ExtensionDtype)):\n arr_or_dtype = arr_or_dtype.type\n return condition(np.dtype(arr_or_dtype).type)\n elif arr_or_dtype is None:\n return condition(type(None))\n\n # if we have an array-like\n if hasattr(arr_or_dtype, 'dtype'):\n arr_or_dtype = arr_or_dtype.dtype\n\n # we are not possibly a dtype\n elif is_list_like(arr_or_dtype):\n return condition(type(None))\n\n try:\n tipo = pandas_dtype(arr_or_dtype).type\n except (TypeError, ValueError, UnicodeEncodeError):\n if is_scalar(arr_or_dtype):\n return condition(type(None))\n\n return False\n\n return condition(tipo)", "response": "Returns a boolean if the condition is satisfied for the arr_or_dtype."} {"instruction": "Write a Python 3 script to\ninfer the dtype of a single object.", "response": "def infer_dtype_from_object(dtype):\n \"\"\"\n Get a numpy dtype.type-style object for a dtype object.\n\n This methods also includes handling of the datetime64[ns] and\n datetime64[ns, TZ] objects.\n\n If no dtype can be found, we return ``object``.\n\n Parameters\n ----------\n dtype : dtype, type\n The dtype object whose numpy dtype.type-style\n object we want to extract.\n\n Returns\n -------\n dtype_object : The extracted numpy dtype.type-style object.\n \"\"\"\n\n if isinstance(dtype, type) and 
issubclass(dtype, np.generic):\n # Type object from a dtype\n return dtype\n elif isinstance(dtype, (np.dtype, PandasExtensionDtype, ExtensionDtype)):\n # dtype object\n try:\n _validate_date_like_dtype(dtype)\n except TypeError:\n # Should still pass if we don't have a date-like\n pass\n return dtype.type\n\n try:\n dtype = pandas_dtype(dtype)\n except TypeError:\n pass\n\n if is_extension_array_dtype(dtype):\n return dtype.type\n elif isinstance(dtype, str):\n\n # TODO(jreback)\n # should deprecate these\n if dtype in ['datetimetz', 'datetime64tz']:\n return DatetimeTZDtype.type\n elif dtype in ['period']:\n raise NotImplementedError\n\n if dtype == 'datetime' or dtype == 'timedelta':\n dtype += '64'\n try:\n return infer_dtype_from_object(getattr(np, dtype))\n except (AttributeError, TypeError):\n # Handles cases like _get_dtype(int) i.e.,\n # Python objects that are valid dtypes\n # (unlike user-defined types, in general)\n #\n # TypeError handles the float16 type code of 'e'\n # further handle internal types\n pass\n\n return infer_dtype_from_object(np.dtype(dtype))"} {"instruction": "Can you create a Python 3 function that\nvalidates that the dtype is a date - like dtype.", "response": "def _validate_date_like_dtype(dtype):\n \"\"\"\n Check whether the dtype is a date-like dtype. Raises an error if invalid.\n\n Parameters\n ----------\n dtype : dtype, type\n The dtype to check.\n\n Raises\n ------\n TypeError : The dtype could not be casted to a date-like dtype.\n ValueError : The dtype is an illegal date-like dtype (e.g. 
the\n the frequency provided is too specific)\n \"\"\"\n\n try:\n typ = np.datetime_data(dtype)[0]\n except ValueError as e:\n raise TypeError('{error}'.format(error=e))\n if typ != 'generic' and typ != 'ns':\n msg = '{name!r} is too specific of a frequency, try passing {type!r}'\n raise ValueError(msg.format(name=dtype.name, type=dtype.type.__name__))"} {"instruction": "Write a Python 3 script to\nconvert input into a pandas only dtype object or a numpy dtype object.", "response": "def pandas_dtype(dtype):\n \"\"\"\n Convert input into a pandas only dtype object or a numpy dtype object.\n\n Parameters\n ----------\n dtype : object to be converted\n\n Returns\n -------\n np.dtype or a pandas dtype\n\n Raises\n ------\n TypeError if not a dtype\n \"\"\"\n # short-circuit\n if isinstance(dtype, np.ndarray):\n return dtype.dtype\n elif isinstance(dtype, (np.dtype, PandasExtensionDtype, ExtensionDtype)):\n return dtype\n\n # registered extension types\n result = registry.find(dtype)\n if result is not None:\n return result\n\n # try a numpy dtype\n # raise a consistent TypeError if failed\n try:\n npdtype = np.dtype(dtype)\n except Exception:\n # we don't want to force a repr of the non-string\n if not isinstance(dtype, str):\n raise TypeError(\"data type not understood\")\n raise TypeError(\"data type '{}' not understood\".format(\n dtype))\n\n # Any invalid dtype (such as pd.Timestamp) should raise an error.\n # np.dtype(invalid_type).kind = 0 for such objects. 
However, this will\n # also catch some valid dtypes such as object, np.object_ and 'object'\n # which we safeguard against by catching them earlier and returning\n # np.dtype(valid_dtype) before this condition is evaluated.\n if is_hashable(dtype) and dtype in [object, np.object_, 'object', 'O']:\n # check hashability to avoid errors/DeprecationWarning when we get\n # here and `dtype` is an array\n return npdtype\n elif npdtype.kind == 'O':\n raise TypeError(\"dtype '{}' not understood\".format(dtype))\n\n return npdtype"} {"instruction": "Make a summary of the following Python 3 code\ndef _groupby_and_merge(by, on, left, right, _merge_pieces,\n check_duplicates=True):\n \"\"\"\n groupby & merge; we are always performing a left-by type operation\n\n Parameters\n ----------\n by: field to group\n on: duplicates field\n left: left frame\n right: right frame\n _merge_pieces: function for merging\n check_duplicates: boolean, default True\n should we check & clean duplicates\n \"\"\"\n\n pieces = []\n if not isinstance(by, (list, tuple)):\n by = [by]\n\n lby = left.groupby(by, sort=False)\n\n # if we can groupby the rhs\n # then we can get vastly better perf\n try:\n\n # we will check & remove duplicates if indicated\n if check_duplicates:\n if on is None:\n on = []\n elif not isinstance(on, (list, tuple)):\n on = [on]\n\n if right.duplicated(by + on).any():\n right = right.drop_duplicates(by + on, keep='last')\n rby = right.groupby(by, sort=False)\n except KeyError:\n rby = None\n\n for key, lhs in lby:\n\n if rby is None:\n rhs = right\n else:\n try:\n rhs = right.take(rby.indices[key])\n except KeyError:\n # key doesn't exist in left\n lcols = lhs.columns.tolist()\n cols = lcols + [r for r in right.columns\n if r not in set(lcols)]\n merged = lhs.reindex(columns=cols)\n merged.index = range(len(merged))\n pieces.append(merged)\n continue\n\n merged = _merge_pieces(lhs, rhs)\n\n # make sure join keys are in the merged\n # TODO, should _merge_pieces do this?\n for k in 
by:\n try:\n if k in merged:\n merged[k] = key\n except KeyError:\n pass\n\n pieces.append(merged)\n\n # preserve the original order\n # if we have a missing piece this can be reset\n from pandas.core.reshape.concat import concat\n result = concat(pieces, ignore_index=True)\n result = result.reindex(columns=pieces[0].columns, copy=False)\n return result, lby", "response": "Group by and merge pieces of the items in the tree."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef merge_ordered(left, right, on=None,\n left_on=None, right_on=None,\n left_by=None, right_by=None,\n fill_method=None, suffixes=('_x', '_y'),\n how='outer'):\n \"\"\"Perform merge with optional filling/interpolation designed for ordered\n data like time series data. Optionally perform group-wise merge (see\n examples)\n\n Parameters\n ----------\n left : DataFrame\n right : DataFrame\n on : label or list\n Field names to join on. Must be found in both DataFrames.\n left_on : label or list, or array-like\n Field names to join on in left DataFrame. Can be a vector or list of\n vectors of the length of the DataFrame to use a particular vector as\n the join key instead of columns\n right_on : label or list, or array-like\n Field names to join on in right DataFrame or vector/list of vectors per\n left_on docs\n left_by : column name or list of column names\n Group left DataFrame by group columns and merge piece by piece with\n right DataFrame\n right_by : column name or list of column names\n Group right DataFrame by group columns and merge piece by piece with\n left DataFrame\n fill_method : {'ffill', None}, default None\n Interpolation method for data\n suffixes : Sequence, default is (\"_x\", \"_y\")\n A length-2 sequence where each element is optionally a string\n indicating the suffix to add to overlapping column names in\n `left` and `right` respectively. 
Pass a value of `None` instead\n of a string to indicate that the column name from `left` or\n `right` should be left as-is, with no suffix. At least one of the\n values must not be None.\n\n .. versionchanged:: 0.25.0\n how : {'left', 'right', 'outer', 'inner'}, default 'outer'\n * left: use only keys from left frame (SQL: left outer join)\n * right: use only keys from right frame (SQL: right outer join)\n * outer: use union of keys from both frames (SQL: full outer join)\n * inner: use intersection of keys from both frames (SQL: inner join)\n\n .. versionadded:: 0.19.0\n\n Returns\n -------\n merged : DataFrame\n The output type will the be same as 'left', if it is a subclass\n of DataFrame.\n\n See Also\n --------\n merge\n merge_asof\n\n Examples\n --------\n >>> A >>> B\n key lvalue group key rvalue\n 0 a 1 a 0 b 1\n 1 c 2 a 1 c 2\n 2 e 3 a 2 d 3\n 3 a 1 b\n 4 c 2 b\n 5 e 3 b\n\n >>> merge_ordered(A, B, fill_method='ffill', left_by='group')\n group key lvalue rvalue\n 0 a a 1 NaN\n 1 a b 1 1.0\n 2 a c 2 2.0\n 3 a d 2 3.0\n 4 a e 3 3.0\n 5 b a 1 NaN\n 6 b b 1 1.0\n 7 b c 2 2.0\n 8 b d 2 3.0\n 9 b e 3 3.0\n \"\"\"\n def _merger(x, y):\n # perform the ordered merge operation\n op = _OrderedMerge(x, y, on=on, left_on=left_on, right_on=right_on,\n suffixes=suffixes, fill_method=fill_method,\n how=how)\n return op.get_result()\n\n if left_by is not None and right_by is not None:\n raise ValueError('Can only group either left or right frames')\n elif left_by is not None:\n result, _ = _groupby_and_merge(left_by, on, left, right,\n lambda x, y: _merger(x, y),\n check_duplicates=False)\n elif right_by is not None:\n result, _ = _groupby_and_merge(right_by, on, right, left,\n lambda x, y: _merger(y, x),\n check_duplicates=False)\n else:\n result = _merger(left, right)\n return result", "response": "Perform a merge of two DataFrame objects with optional filling and interpolation designed for ordered time series data like time series data."} {"instruction": "Can you 
create a Python 3 function that\nperforms an asof merge between two DataFrames.", "response": "def merge_asof(left, right, on=None,\n left_on=None, right_on=None,\n left_index=False, right_index=False,\n by=None, left_by=None, right_by=None,\n suffixes=('_x', '_y'),\n tolerance=None,\n allow_exact_matches=True,\n direction='backward'):\n \"\"\"Perform an asof merge. This is similar to a left-join except that we\n match on nearest key rather than equal keys.\n\n Both DataFrames must be sorted by the key.\n\n For each row in the left DataFrame:\n\n - A \"backward\" search selects the last row in the right DataFrame whose\n 'on' key is less than or equal to the left's key.\n\n - A \"forward\" search selects the first row in the right DataFrame whose\n 'on' key is greater than or equal to the left's key.\n\n - A \"nearest\" search selects the row in the right DataFrame whose 'on'\n key is closest in absolute distance to the left's key.\n\n The default is \"backward\" and is compatible in versions below 0.20.0.\n The direction parameter was added in version 0.20.0 and introduces\n \"forward\" and \"nearest\".\n\n Optionally match on equivalent keys with 'by' before searching with 'on'.\n\n .. versionadded:: 0.19.0\n\n Parameters\n ----------\n left : DataFrame\n right : DataFrame\n on : label\n Field name to join on. Must be found in both DataFrames.\n The data MUST be ordered. Furthermore this must be a numeric column,\n such as datetimelike, integer, or float. On or left_on/right_on\n must be given.\n left_on : label\n Field name to join on in left DataFrame.\n right_on : label\n Field name to join on in right DataFrame.\n left_index : boolean\n Use the index of the left DataFrame as the join key.\n\n .. versionadded:: 0.19.2\n\n right_index : boolean\n Use the index of the right DataFrame as the join key.\n\n .. 
versionadded:: 0.19.2\n\n by : column name or list of column names\n Match on these columns before performing merge operation.\n left_by : column name\n Field names to match on in the left DataFrame.\n\n .. versionadded:: 0.19.2\n\n right_by : column name\n Field names to match on in the right DataFrame.\n\n .. versionadded:: 0.19.2\n\n suffixes : 2-length sequence (tuple, list, ...)\n Suffix to apply to overlapping column names in the left and right\n side, respectively.\n tolerance : integer or Timedelta, optional, default None\n Select asof tolerance within this range; must be compatible\n with the merge index.\n allow_exact_matches : boolean, default True\n\n - If True, allow matching with the same 'on' value\n (i.e. less-than-or-equal-to / greater-than-or-equal-to)\n - If False, don't match the same 'on' value\n (i.e., strictly less-than / strictly greater-than)\n\n direction : 'backward' (default), 'forward', or 'nearest'\n Whether to search for prior, subsequent, or closest matches.\n\n .. versionadded:: 0.20.0\n\n Returns\n -------\n merged : DataFrame\n\n See Also\n --------\n merge\n merge_ordered\n\n Examples\n --------\n >>> left = pd.DataFrame({'a': [1, 5, 10], 'left_val': ['a', 'b', 'c']})\n >>> left\n a left_val\n 0 1 a\n 1 5 b\n 2 10 c\n\n >>> right = pd.DataFrame({'a': [1, 2, 3, 6, 7],\n ... 
'right_val': [1, 2, 3, 6, 7]})\n >>> right\n a right_val\n 0 1 1\n 1 2 2\n 2 3 3\n 3 6 6\n 4 7 7\n\n >>> pd.merge_asof(left, right, on='a')\n a left_val right_val\n 0 1 a 1\n 1 5 b 3\n 2 10 c 7\n\n >>> pd.merge_asof(left, right, on='a', allow_exact_matches=False)\n a left_val right_val\n 0 1 a NaN\n 1 5 b 3.0\n 2 10 c 7.0\n\n >>> pd.merge_asof(left, right, on='a', direction='forward')\n a left_val right_val\n 0 1 a 1.0\n 1 5 b 6.0\n 2 10 c NaN\n\n >>> pd.merge_asof(left, right, on='a', direction='nearest')\n a left_val right_val\n 0 1 a 1\n 1 5 b 6\n 2 10 c 7\n\n We can use indexed DataFrames as well.\n\n >>> left = pd.DataFrame({'left_val': ['a', 'b', 'c']}, index=[1, 5, 10])\n >>> left\n left_val\n 1 a\n 5 b\n 10 c\n\n >>> right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7]},\n ... index=[1, 2, 3, 6, 7])\n >>> right\n right_val\n 1 1\n 2 2\n 3 3\n 6 6\n 7 7\n\n >>> pd.merge_asof(left, right, left_index=True, right_index=True)\n left_val right_val\n 1 a 1\n 5 b 3\n 10 c 7\n\n Here is a real-world times-series example\n\n >>> quotes\n time ticker bid ask\n 0 2016-05-25 13:30:00.023 GOOG 720.50 720.93\n 1 2016-05-25 13:30:00.023 MSFT 51.95 51.96\n 2 2016-05-25 13:30:00.030 MSFT 51.97 51.98\n 3 2016-05-25 13:30:00.041 MSFT 51.99 52.00\n 4 2016-05-25 13:30:00.048 GOOG 720.50 720.93\n 5 2016-05-25 13:30:00.049 AAPL 97.99 98.01\n 6 2016-05-25 13:30:00.072 GOOG 720.50 720.88\n 7 2016-05-25 13:30:00.075 MSFT 52.01 52.03\n\n >>> trades\n time ticker price quantity\n 0 2016-05-25 13:30:00.023 MSFT 51.95 75\n 1 2016-05-25 13:30:00.038 MSFT 51.95 155\n 2 2016-05-25 13:30:00.048 GOOG 720.77 100\n 3 2016-05-25 13:30:00.048 GOOG 720.92 100\n 4 2016-05-25 13:30:00.048 AAPL 98.00 100\n\n By default we are taking the asof of the quotes\n\n >>> pd.merge_asof(trades, quotes,\n ... on='time',\n ... 
by='ticker')\n time ticker price quantity bid ask\n 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96\n 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98\n 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93\n 3 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93\n 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN\n\n We only asof within 2ms between the quote time and the trade time\n\n >>> pd.merge_asof(trades, quotes,\n ... on='time',\n ... by='ticker',\n ... tolerance=pd.Timedelta('2ms'))\n time ticker price quantity bid ask\n 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96\n 1 2016-05-25 13:30:00.038 MSFT 51.95 155 NaN NaN\n 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93\n 3 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93\n 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN\n\n We only asof within 10ms between the quote time and the trade time\n and we exclude exact matches on time. However *prior* data will\n propagate forward\n\n >>> pd.merge_asof(trades, quotes,\n ... on='time',\n ... by='ticker',\n ... tolerance=pd.Timedelta('10ms'),\n ... 
allow_exact_matches=False)\n time ticker price quantity bid ask\n 0 2016-05-25 13:30:00.023 MSFT 51.95 75 NaN NaN\n 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98\n 2 2016-05-25 13:30:00.048 GOOG 720.77 100 NaN NaN\n 3 2016-05-25 13:30:00.048 GOOG 720.92 100 NaN NaN\n 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN\n \"\"\"\n op = _AsOfMerge(left, right,\n on=on, left_on=left_on, right_on=right_on,\n left_index=left_index, right_index=right_index,\n by=by, left_by=left_by, right_by=right_by,\n suffixes=suffixes,\n how='asof', tolerance=tolerance,\n allow_exact_matches=allow_exact_matches,\n direction=direction)\n return op.get_result()"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _restore_dropped_levels_multijoin(left, right, dropped_level_names,\n join_index, lindexer, rindexer):\n \"\"\"\n *this is an internal non-public method*\n\n Returns the levels, labels and names of a multi-index to multi-index join.\n Depending on the type of join, this method restores the appropriate\n dropped levels of the joined multi-index.\n The method relies on lidx, rindexer which hold the index positions of\n left and right, where a join was feasible\n\n Parameters\n ----------\n left : MultiIndex\n left index\n right : MultiIndex\n right index\n dropped_level_names : str array\n list of non-common level names\n join_index : MultiIndex\n the index of the join between the\n common levels of left and right\n lindexer : intp array\n left indexer\n rindexer : intp array\n right indexer\n\n Returns\n -------\n levels : list of Index\n levels of combined multiindexes\n labels : intp array\n labels of combined multiindexes\n names : str array\n names of combined multiindexes\n\n \"\"\"\n\n def _convert_to_mulitindex(index):\n if isinstance(index, MultiIndex):\n return index\n else:\n return MultiIndex.from_arrays([index.values],\n names=[index.name])\n\n # For multi-multi joins with one overlapping level,\n # the returned index if of type 
Index\n # Assure that join_index is of type MultiIndex\n # so that dropped levels can be appended\n join_index = _convert_to_mulitindex(join_index)\n\n join_levels = join_index.levels\n join_codes = join_index.codes\n join_names = join_index.names\n\n # lindexer and rindexer hold the indexes where the join occurred\n # for left and right respectively. If left/right is None then\n # the join occurred on all indices of left/right\n if lindexer is None:\n lindexer = range(left.size)\n\n if rindexer is None:\n rindexer = range(right.size)\n\n # Iterate through the levels that must be restored\n for dropped_level_name in dropped_level_names:\n if dropped_level_name in left.names:\n idx = left\n indexer = lindexer\n else:\n idx = right\n indexer = rindexer\n\n # The index of the level name to be restored\n name_idx = idx.names.index(dropped_level_name)\n\n restore_levels = idx.levels[name_idx]\n # Inject -1 in the codes list where a join was not possible\n # IOW indexer[i]=-1\n codes = idx.codes[name_idx]\n restore_codes = algos.take_nd(codes, indexer, fill_value=-1)\n\n join_levels = join_levels + [restore_levels]\n join_codes = join_codes + [restore_codes]\n join_names = join_names + [dropped_level_name]\n\n return join_levels, join_codes, join_names", "response": "This method restores the dropped levels of a multi - index to the appropriate MultiIndex."} {"instruction": "Create a Python 3 function to\nrestore index levels specified as on parameters.", "response": "def _maybe_restore_index_levels(self, result):\n \"\"\"\n Restore index levels specified as `on` parameters\n\n Here we check for cases where `self.left_on` and `self.right_on` pairs\n each reference an index level in their respective DataFrames. The\n joined columns corresponding to these pairs are then restored to the\n index of `result`.\n\n **Note:** This method has side effects. 
It modifies `result` in-place\n\n Parameters\n ----------\n result: DataFrame\n merge result\n\n Returns\n -------\n None\n \"\"\"\n names_to_restore = []\n for name, left_key, right_key in zip(self.join_names,\n self.left_on,\n self.right_on):\n if (self.orig_left._is_level_reference(left_key) and\n self.orig_right._is_level_reference(right_key) and\n name not in result.index.names):\n\n names_to_restore.append(name)\n\n if names_to_restore:\n result.set_index(names_to_restore, inplace=True)"} {"instruction": "Write a Python 3 script to\nreturn the join indexers", "response": "def _get_join_indexers(self):\n \"\"\" return the join indexers \"\"\"\n return _get_join_indexers(self.left_join_keys,\n self.right_join_keys,\n sort=self.sort,\n how=self.how)"} {"instruction": "Implement a Python 3 function for\ncreating a join index by rearranging one index to match another index.", "response": "def _create_join_index(self, index, other_index, indexer,\n other_indexer, how='left'):\n \"\"\"\n Create a join index by rearranging one index to match another\n\n Parameters\n ----------\n index: Index being rearranged\n other_index: Index used to supply values not found in index\n indexer: how to rearrange index\n how: replacement is only necessary if indexer based on other_index\n\n Returns\n -------\n join_index\n \"\"\"\n join_index = index.take(indexer)\n if (self.how in (how, 'outer') and\n not isinstance(other_index, MultiIndex)):\n # if final index requires values in other_index but not target\n # index, indexer may hold missing (-1) values, causing Index.take\n # to take the final value in target index\n mask = indexer == -1\n if np.any(mask):\n # if values missing (-1) from target index,\n # take from other_index instead\n join_list = join_index.to_numpy()\n other_list = other_index.take(other_indexer).to_numpy()\n join_list[mask] = other_list[mask]\n join_index = Index(join_list, dtype=join_index.dtype,\n name=join_index.name)\n return join_index"} {"instruction": 
"Write a Python 3 script for\ngetting the keys that should be used to merge the two entries.", "response": "def _get_merge_keys(self):\n \"\"\"\n Note: has side effects (copy/delete key columns)\n\n Parameters\n ----------\n left\n right\n on\n\n Returns\n -------\n left_keys, right_keys\n \"\"\"\n left_keys = []\n right_keys = []\n join_names = []\n right_drop = []\n left_drop = []\n\n left, right = self.left, self.right\n\n is_lkey = lambda x: is_array_like(x) and len(x) == len(left)\n is_rkey = lambda x: is_array_like(x) and len(x) == len(right)\n\n # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A\n # user could, for example, request 'left_index' and 'left_by'. In a\n # regular pd.merge(), users cannot specify both 'left_index' and\n # 'left_on'. (Instead, users have a MultiIndex). That means the\n # self.left_on in this function is always empty in a pd.merge(), but\n # a pd.merge_asof(left_index=True, left_by=...) will result in a\n # self.left_on array with a None in the middle of it. 
This requires\n # a work-around as designated in the code below.\n # See _validate_specification() for where this happens.\n\n # ugh, spaghetti re #733\n if _any(self.left_on) and _any(self.right_on):\n for lk, rk in zip(self.left_on, self.right_on):\n if is_lkey(lk):\n left_keys.append(lk)\n if is_rkey(rk):\n right_keys.append(rk)\n join_names.append(None) # what to do?\n else:\n if rk is not None:\n right_keys.append(\n right._get_label_or_level_values(rk))\n join_names.append(rk)\n else:\n # work-around for merge_asof(right_index=True)\n right_keys.append(right.index)\n join_names.append(right.index.name)\n else:\n if not is_rkey(rk):\n if rk is not None:\n right_keys.append(\n right._get_label_or_level_values(rk))\n else:\n # work-around for merge_asof(right_index=True)\n right_keys.append(right.index)\n if lk is not None and lk == rk:\n # avoid key upcast in corner case (length-0)\n if len(left) > 0:\n right_drop.append(rk)\n else:\n left_drop.append(lk)\n else:\n right_keys.append(rk)\n if lk is not None:\n left_keys.append(left._get_label_or_level_values(lk))\n join_names.append(lk)\n else:\n # work-around for merge_asof(left_index=True)\n left_keys.append(left.index)\n join_names.append(left.index.name)\n elif _any(self.left_on):\n for k in self.left_on:\n if is_lkey(k):\n left_keys.append(k)\n join_names.append(None)\n else:\n left_keys.append(left._get_label_or_level_values(k))\n join_names.append(k)\n if isinstance(self.right.index, MultiIndex):\n right_keys = [lev._values.take(lev_codes) for lev, lev_codes\n in zip(self.right.index.levels,\n self.right.index.codes)]\n else:\n right_keys = [self.right.index._values]\n elif _any(self.right_on):\n for k in self.right_on:\n if is_rkey(k):\n right_keys.append(k)\n join_names.append(None)\n else:\n right_keys.append(right._get_label_or_level_values(k))\n join_names.append(k)\n if isinstance(self.left.index, MultiIndex):\n left_keys = [lev._values.take(lev_codes) for lev, lev_codes\n in 
zip(self.left.index.levels,\n self.left.index.codes)]\n else:\n left_keys = [self.left.index.values]\n\n if left_drop:\n self.left = self.left._drop_labels_or_levels(left_drop)\n\n if right_drop:\n self.right = self.right._drop_labels_or_levels(right_drop)\n\n return left_keys, right_keys, join_names"} {"instruction": "Can you create a Python 3 function that\nreturns the indexers for the join keys", "response": "def _get_join_indexers(self):\n \"\"\" return the join indexers \"\"\"\n\n def flip(xs):\n \"\"\" unlike np.transpose, this returns an array of tuples \"\"\"\n labels = list(string.ascii_lowercase[:len(xs)])\n dtypes = [x.dtype for x in xs]\n labeled_dtypes = list(zip(labels, dtypes))\n return np.array(lzip(*xs), labeled_dtypes)\n\n # values to compare\n left_values = (self.left.index.values if self.left_index else\n self.left_join_keys[-1])\n right_values = (self.right.index.values if self.right_index else\n self.right_join_keys[-1])\n tolerance = self.tolerance\n\n # we require sortedness and non-null values in the join keys\n msg_sorted = \"{side} keys must be sorted\"\n msg_missings = \"Merge keys contain null values on {side} side\"\n\n if not Index(left_values).is_monotonic:\n if isnull(left_values).any():\n raise ValueError(msg_missings.format(side='left'))\n else:\n raise ValueError(msg_sorted.format(side='left'))\n\n if not Index(right_values).is_monotonic:\n if isnull(right_values).any():\n raise ValueError(msg_missings.format(side='right'))\n else:\n raise ValueError(msg_sorted.format(side='right'))\n\n # initial type conversion as needed\n if needs_i8_conversion(left_values):\n left_values = left_values.view('i8')\n right_values = right_values.view('i8')\n if tolerance is not None:\n tolerance = tolerance.value\n\n # a \"by\" parameter requires special handling\n if self.left_by is not None:\n # remove 'on' parameter from values if one existed\n if self.left_index and self.right_index:\n left_by_values = self.left_join_keys\n right_by_values = 
self.right_join_keys\n else:\n left_by_values = self.left_join_keys[0:-1]\n right_by_values = self.right_join_keys[0:-1]\n\n # get tuple representation of values if more than one\n if len(left_by_values) == 1:\n left_by_values = left_by_values[0]\n right_by_values = right_by_values[0]\n else:\n left_by_values = flip(left_by_values)\n right_by_values = flip(right_by_values)\n\n # upcast 'by' parameter because HashTable is limited\n by_type = _get_cython_type_upcast(left_by_values.dtype)\n by_type_caster = _type_casters[by_type]\n left_by_values = by_type_caster(left_by_values)\n right_by_values = by_type_caster(right_by_values)\n\n # choose appropriate function by type\n func = _asof_by_function(self.direction)\n return func(left_values,\n right_values,\n left_by_values,\n right_by_values,\n self.allow_exact_matches,\n tolerance)\n else:\n # choose appropriate function by type\n func = _asof_function(self.direction)\n return func(left_values,\n right_values,\n self.allow_exact_matches,\n tolerance)"} {"instruction": "Explain what the following Python 3 code does\ndef is_dtype(cls, dtype):\n dtype = getattr(dtype, 'dtype', dtype)\n\n if isinstance(dtype, (ABCSeries, ABCIndexClass,\n ABCDataFrame, np.dtype)):\n # https://github.com/pandas-dev/pandas/issues/22960\n # avoid passing data to `construct_from_string`. 
This could\n # cause a FutureWarning from numpy about failing elementwise\n # comparison from, e.g., comparing DataFrame == 'category'.\n return False\n elif dtype is None:\n return False\n elif isinstance(dtype, cls):\n return True\n try:\n return cls.construct_from_string(dtype) is not None\n except TypeError:\n return False", "response": "Check if we are in the specified dtype."} {"instruction": "Given the following Python 3 function, write the documentation\ndef cat_core(list_of_columns, sep):\n list_with_sep = [sep] * (2 * len(list_of_columns) - 1)\n list_with_sep[::2] = list_of_columns\n return np.sum(list_with_sep, axis=0)", "response": "A utility function for concatenating the columns of a tree with a separator string"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef str_count(arr, pat, flags=0):\n regex = re.compile(pat, flags=flags)\n f = lambda x: len(regex.findall(x))\n return _na_map(f, arr, dtype=int)", "response": "Count occurrences of pattern in each string of the Series or Index."} {"instruction": "Can you implement a function in Python 3 that\ntests if a given pattern or regex is contained within a given string.", "response": "def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True):\n \"\"\"\n Test if pattern or regex is contained within a string of a Series or Index.\n\n Return boolean Series or Index based on whether a given pattern or regex is\n contained within a string of a Series or Index.\n\n Parameters\n ----------\n pat : str\n Character sequence or regular expression.\n case : bool, default True\n If True, case sensitive.\n flags : int, default 0 (no flags)\n Flags to pass through to the re module, e.g. 
re.IGNORECASE.\n na : default NaN\n Fill value for missing values.\n regex : bool, default True\n If True, assumes the pat is a regular expression.\n\n If False, treats the pat as a literal string.\n\n Returns\n -------\n Series or Index of boolean values\n A Series or Index of boolean values indicating whether the\n given pattern is contained within the string of each element\n of the Series or Index.\n\n See Also\n --------\n match : Analogous, but stricter, relying on re.match instead of re.search.\n Series.str.startswith : Test if the start of each string element matches a\n pattern.\n Series.str.endswith : Same as startswith, but tests the end of string.\n\n Examples\n --------\n\n Returning a Series of booleans using only a literal pattern.\n\n >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])\n >>> s1.str.contains('og', regex=False)\n 0 False\n 1 True\n 2 False\n 3 False\n 4 NaN\n dtype: object\n\n Returning an Index of booleans using only a literal pattern.\n\n >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN])\n >>> ind.str.contains('23', regex=False)\n Index([False, False, False, True, nan], dtype='object')\n\n Specifying case sensitivity using `case`.\n\n >>> s1.str.contains('oG', case=True, regex=True)\n 0 False\n 1 False\n 2 False\n 3 False\n 4 NaN\n dtype: object\n\n Specifying `na` to be `False` instead of `NaN` replaces NaN values\n with `False`. 
If Series or Index does not contain NaN values\n the resultant dtype will be `bool`, otherwise, an `object` dtype.\n\n >>> s1.str.contains('og', na=False, regex=True)\n 0 False\n 1 True\n 2 False\n 3 False\n 4 False\n dtype: bool\n\n Returning 'house' or 'dog' when either expression occurs in a string.\n\n >>> s1.str.contains('house|dog', regex=True)\n 0 False\n 1 True\n 2 True\n 3 False\n 4 NaN\n dtype: object\n\n Ignoring case sensitivity using `flags` with regex.\n\n >>> import re\n >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True)\n 0 False\n 1 False\n 2 True\n 3 False\n 4 NaN\n dtype: object\n\n Returning any digit using regular expression.\n\n >>> s1.str.contains('\\\\d', regex=True)\n 0 False\n 1 False\n 2 False\n 3 True\n 4 NaN\n dtype: object\n\n Ensure `pat` is a not a literal pattern when `regex` is set to True.\n Note in the following example one might expect only `s2[1]` and `s2[3]` to\n return `True`. However, '.0' as a regex matches any character\n followed by a 0.\n\n >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35'])\n >>> s2.str.contains('.0', regex=True)\n 0 True\n 1 True\n 2 False\n 3 True\n 4 False\n dtype: bool\n \"\"\"\n if regex:\n if not case:\n flags |= re.IGNORECASE\n\n regex = re.compile(pat, flags=flags)\n\n if regex.groups > 0:\n warnings.warn(\"This pattern has match groups. 
To actually get the\"\n \" groups, use str.extract.\", UserWarning,\n stacklevel=3)\n\n f = lambda x: bool(regex.search(x))\n else:\n if case:\n f = lambda x: pat in x\n else:\n upper_pat = pat.upper()\n f = lambda x: upper_pat in x\n uppered = _na_map(lambda x: x.upper(), arr)\n return _na_map(f, uppered, na, dtype=bool)\n return _na_map(f, arr, na, dtype=bool)"} {"instruction": "How would you code a function in Python 3 to\nreturn a Series of booleans indicating whether each string element in the array starts with pat.", "response": "def str_startswith(arr, pat, na=np.nan):\n \"\"\"\n Test if the start of each string element matches a pattern.\n\n Equivalent to :meth:`str.startswith`.\n\n Parameters\n ----------\n pat : str\n Character sequence. Regular expressions are not accepted.\n na : object, default NaN\n Object shown if element tested is not a string.\n\n Returns\n -------\n Series or Index of bool\n A Series of booleans indicating whether the given pattern matches\n the start of each string element.\n\n See Also\n --------\n str.startswith : Python standard library string method.\n Series.str.endswith : Same as startswith, but tests the end of string.\n Series.str.contains : Tests if string element contains a pattern.\n\n Examples\n --------\n >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan])\n >>> s\n 0 bat\n 1 Bear\n 2 cat\n 3 NaN\n dtype: object\n\n >>> s.str.startswith('b')\n 0 True\n 1 False\n 2 False\n 3 NaN\n dtype: object\n\n Specifying `na` to be `False` instead of `NaN`.\n\n >>> s.str.startswith('b', na=False)\n 0 True\n 1 False\n 2 False\n 3 False\n dtype: bool\n \"\"\"\n f = lambda x: x.startswith(pat)\n return _na_map(f, arr, na, dtype=bool)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef str_endswith(arr, pat, na=np.nan):\n f = lambda x: x.endswith(pat)\n return _na_map(f, arr, na, dtype=bool)", "response": "Returns a Series of booleans indicating whether the end of each string element matches 
pat."} {"instruction": "Implement a function in Python 3 to\nreturn a Series or Index of repeated strings in the Series or Index.", "response": "def str_repeat(arr, repeats):\n \"\"\"\n Duplicate each string in the Series or Index.\n\n Parameters\n ----------\n repeats : int or sequence of int\n Same value for all (int) or different value per (sequence).\n\n Returns\n -------\n Series or Index of object\n Series or Index of repeated string objects specified by\n input parameter repeats.\n\n Examples\n --------\n >>> s = pd.Series(['a', 'b', 'c'])\n >>> s\n 0 a\n 1 b\n 2 c\n dtype: object\n\n Single int repeats string in Series\n\n >>> s.str.repeat(repeats=2)\n 0 aa\n 1 bb\n 2 cc\n dtype: object\n\n Sequence of int repeats corresponding string in Series\n\n >>> s.str.repeat(repeats=[1, 2, 3])\n 0 a\n 1 bb\n 2 ccc\n dtype: object\n \"\"\"\n if is_scalar(repeats):\n def scalar_rep(x):\n try:\n return bytes.__mul__(x, repeats)\n except TypeError:\n return str.__mul__(x, repeats)\n\n return _na_map(scalar_rep, arr)\n else:\n\n def rep(x, r):\n try:\n return bytes.__mul__(x, r)\n except TypeError:\n return str.__mul__(x, r)\n\n repeats = np.asarray(repeats, dtype=object)\n result = libops.vec_binop(com.values_from_object(arr), repeats, rep)\n return result"} {"instruction": "Write a Python 3 function that can\ndetermine if each string matches a regular expression.", "response": "def str_match(arr, pat, case=True, flags=0, na=np.nan):\n \"\"\"\n Determine if each string matches a regular expression.\n\n Parameters\n ----------\n pat : str\n Character sequence or regular expression.\n case : bool, default True\n If True, case sensitive.\n flags : int, default 0 (no flags)\n re module flags, e.g. 
re.IGNORECASE.\n na : default NaN\n Fill value for missing values.\n\n Returns\n -------\n Series/array of boolean values\n\n See Also\n --------\n contains : Analogous, but less strict, relying on re.search instead of\n re.match.\n extract : Extract matched groups.\n \"\"\"\n if not case:\n flags |= re.IGNORECASE\n\n regex = re.compile(pat, flags=flags)\n\n dtype = bool\n f = lambda x: bool(regex.match(x))\n\n return _na_map(f, arr, na, dtype=dtype)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _groups_or_na_fun(regex):\n if regex.groups == 0:\n raise ValueError(\"pattern contains no capture groups\")\n empty_row = [np.nan] * regex.groups\n\n def f(x):\n if not isinstance(x, str):\n return empty_row\n m = regex.search(x)\n if m:\n return [np.nan if item is None else item for item in m.groups()]\n else:\n return empty_row\n return f", "response": "Used in both extract_noexpand and extract_frame"} {"instruction": "Explain what the following Python 3 code does\ndef _str_extract_noexpand(arr, pat, flags=0):\n from pandas import DataFrame, Index\n\n regex = re.compile(pat, flags=flags)\n groups_or_na = _groups_or_na_fun(regex)\n\n if regex.groups == 1:\n result = np.array([groups_or_na(val)[0] for val in arr], dtype=object)\n name = _get_single_group_name(regex)\n else:\n if isinstance(arr, Index):\n raise ValueError(\"only one regex group is supported with Index\")\n name = None\n names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))\n columns = [names.get(1 + i, i) for i in range(regex.groups)]\n if arr.empty:\n result = DataFrame(columns=columns, dtype=object)\n else:\n result = DataFrame(\n [groups_or_na(val) for val in arr],\n columns=columns,\n index=arr.index,\n dtype=object)\n return result, name", "response": "This function is called from str_extract and can return Series DataFrame or Index."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _str_extract_frame(arr, 
pat, flags=0):\n from pandas import DataFrame\n\n regex = re.compile(pat, flags=flags)\n groups_or_na = _groups_or_na_fun(regex)\n names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))\n columns = [names.get(1 + i, i) for i in range(regex.groups)]\n\n if len(arr) == 0:\n return DataFrame(columns=columns, dtype=object)\n try:\n result_index = arr.index\n except AttributeError:\n result_index = None\n return DataFrame(\n [groups_or_na(val) for val in arr],\n columns=columns,\n index=result_index,\n dtype=object)", "response": "This function extracts groups from the\n first match of regular expression pat and returns a DataFrame."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef str_extractall(arr, pat, flags=0):\n\n regex = re.compile(pat, flags=flags)\n # the regex must contain capture groups.\n if regex.groups == 0:\n raise ValueError(\"pattern contains no capture groups\")\n\n if isinstance(arr, ABCIndexClass):\n arr = arr.to_series().reset_index(drop=True)\n\n names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))\n columns = [names.get(1 + i, i) for i in range(regex.groups)]\n match_list = []\n index_list = []\n is_mi = arr.index.nlevels > 1\n\n for subject_key, subject in arr.iteritems():\n if isinstance(subject, str):\n\n if not is_mi:\n subject_key = (subject_key, )\n\n for match_i, match_tuple in enumerate(regex.findall(subject)):\n if isinstance(match_tuple, str):\n match_tuple = (match_tuple,)\n na_tuple = [np.NaN if group == \"\" else group\n for group in match_tuple]\n match_list.append(na_tuple)\n result_key = tuple(subject_key + (match_i, ))\n index_list.append(result_key)\n\n from pandas import MultiIndex\n index = MultiIndex.from_tuples(\n index_list, names=arr.index.names + [\"match\"])\n\n result = arr._constructor_expanddim(match_list, index=index,\n columns=columns)\n return result", "response": "r Extracts groups from all the subject strings in the Series that match the regular 
expression pat."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef str_get_dummies(arr, sep='|'):\n arr = arr.fillna('')\n try:\n arr = sep + arr + sep\n except TypeError:\n arr = sep + arr.astype(str) + sep\n\n tags = set()\n for ts in arr.str.split(sep):\n tags.update(ts)\n tags = sorted(tags - {\"\"})\n\n dummies = np.empty((len(arr), len(tags)), dtype=np.int64)\n\n for i, t in enumerate(tags):\n pat = sep + t + sep\n dummies[:, i] = lib.map_infer(arr.values, lambda x: pat in x)\n return dummies, tags", "response": "Split each string in the Series by sep and return a DataFrame\n Dummy variables corresponding to values of the Series."} {"instruction": "Write a Python 3 script to\nfind all occurrences of pattern or regular expression in the Series/Index. Equivalent to applying :func:`re.findall` to all the elements in the Series/Index. Parameters ---------- pat : str Pattern or regular expression. flags : int, default 0 Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which means no flags). Returns ------- Series/Index of lists of strings All non-overlapping matches of pattern or regular expression in each string of this Series/Index. See Also -------- count : Count occurrences of pattern or regular expression in each string of the Series/Index. extractall : For each string in the Series, extract groups from all matches of regular expression and return a DataFrame with one row for each match and one column for each group. re.findall : The equivalent ``re`` function to all non-overlapping matches of pattern or regular expression in string, as a list of strings. 
Examples -------- >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit']) The search for the pattern 'Monkey' returns one match: >>> s.str.findall('Monkey') 0 [] 1 [Monkey] 2 [] dtype: object On the other hand, the search for the pattern 'MONKEY' doesn't return any match: >>> s.str.findall('MONKEY') 0 [] 1 [] 2 [] dtype: object Flags can be added to the pattern or regular expression. For instance, to find the pattern 'MONKEY' ignoring the case: >>> import re >>> s.str.findall('MONKEY', flags=re.IGNORECASE) 0 [] 1 [Monkey] 2 [] dtype: object When the pattern matches more than one string in the Series, all matches are returned: >>> s.str.findall('on') 0 [on] 1 [on] 2 [] dtype: object Regular expressions are supported too. For instance, the search for all the strings ending with the word 'on' is shown next: >>> s.str.findall('on$') 0 [on] 1 [] 2 [] dtype: object If the pattern is found more than once in the same string, then a list of multiple strings is returned: >>> s.str.findall('b') 0 [] 1 [] 2 [b, b] dtype: object", "response": "def str_findall(arr, pat, flags=0):\n \"\"\"\n Find all occurrences of pattern or regular expression in the Series/Index.\n\n Equivalent to applying :func:`re.findall` to all the elements in the\n Series/Index.\n\n Parameters\n ----------\n pat : str\n Pattern or regular expression.\n flags : int, default 0\n Flags from ``re`` module, e.g. 
`re.IGNORECASE` (default is 0, which\n means no flags).\n\n Returns\n -------\n Series/Index of lists of strings\n All non-overlapping matches of pattern or regular expression in each\n string of this Series/Index.\n\n See Also\n --------\n count : Count occurrences of pattern or regular expression in each string\n of the Series/Index.\n extractall : For each string in the Series, extract groups from all matches\n of regular expression and return a DataFrame with one row for each\n match and one column for each group.\n re.findall : The equivalent ``re`` function to all non-overlapping matches\n of pattern or regular expression in string, as a list of strings.\n\n Examples\n --------\n\n >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit'])\n\n The search for the pattern 'Monkey' returns one match:\n\n >>> s.str.findall('Monkey')\n 0 []\n 1 [Monkey]\n 2 []\n dtype: object\n\n On the other hand, the search for the pattern 'MONKEY' doesn't return any\n match:\n\n >>> s.str.findall('MONKEY')\n 0 []\n 1 []\n 2 []\n dtype: object\n\n Flags can be added to the pattern or regular expression. For instance,\n to find the pattern 'MONKEY' ignoring the case:\n\n >>> import re\n >>> s.str.findall('MONKEY', flags=re.IGNORECASE)\n 0 []\n 1 [Monkey]\n 2 []\n dtype: object\n\n When the pattern matches more than one string in the Series, all matches\n are returned:\n\n >>> s.str.findall('on')\n 0 [on]\n 1 [on]\n 2 []\n dtype: object\n\n Regular expressions are supported too. 
For instance, the search for all the\n strings ending with the word 'on' is shown next:\n\n >>> s.str.findall('on$')\n 0 [on]\n 1 []\n 2 []\n dtype: object\n\n If the pattern is found more than once in the same string, then a list of\n multiple strings is returned:\n\n >>> s.str.findall('b')\n 0 []\n 1 []\n 2 [b, b]\n dtype: object\n \"\"\"\n regex = re.compile(pat, flags=flags)\n return _na_map(regex.findall, arr)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef str_find(arr, sub, start=0, end=None, side='left'):\n\n if not isinstance(sub, str):\n msg = 'expected a string object, not {0}'\n raise TypeError(msg.format(type(sub).__name__))\n\n if side == 'left':\n method = 'find'\n elif side == 'right':\n method = 'rfind'\n else: # pragma: no cover\n raise ValueError('Invalid side')\n\n if end is None:\n f = lambda x: getattr(x, method)(sub, start)\n else:\n f = lambda x: getattr(x, method)(sub, start, end)\n\n return _na_map(f, arr, dtype=int)", "response": "Find the indexes in each string in the Series or Index where the substring is fully contained between [ start end )."} {"instruction": "How would you explain what the following Python 3 function does\ndef str_pad(arr, width, side='left', fillchar=' '):\n if not isinstance(fillchar, str):\n msg = 'fillchar must be a character, not {0}'\n raise TypeError(msg.format(type(fillchar).__name__))\n\n if len(fillchar) != 1:\n raise TypeError('fillchar must be a character, not str')\n\n if not is_integer(width):\n msg = 'width must be of integer type, not {0}'\n raise TypeError(msg.format(type(width).__name__))\n\n if side == 'left':\n f = lambda x: x.rjust(width, fillchar)\n elif side == 'right':\n f = lambda x: x.ljust(width, fillchar)\n elif side == 'both':\n f = lambda x: x.center(width, fillchar)\n else: # pragma: no cover\n raise ValueError('Invalid side')\n\n return _na_map(f, arr)", "response": "Pads the string of the object with the specified width."} {"instruction": "Make a 
summary of the following Python 3 code\ndef str_slice(arr, start=None, stop=None, step=None):\n obj = slice(start, stop, step)\n f = lambda x: x[obj]\n return _na_map(f, arr)", "response": "Slice substrings from each element in the Series or Index."} {"instruction": "Can you implement a function in Python 3 that\nreplaces a positional slice of a string with another value. Parameters ---------- start : int, optional Left index position to use for the slice. If not specified (None), the slice is unbounded on the left, i.e. slice from the start of the string. stop : int, optional Right index position to use for the slice. If not specified (None), the slice is unbounded on the right, i.e. slice until the end of the string. repl : str, optional String for replacement. If not specified (None), the sliced region is replaced with an empty string. Returns ------- Series or Index Same type as the original object. See Also -------- Series.str.slice : Just slicing without replacement. Examples -------- >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde']) >>> s 0 a 1 ab 2 abc 3 abdc 4 abcde dtype: object Specify just `start`, meaning replace `start` until the end of the string with `repl`. >>> s.str.slice_replace(1, repl='X') 0 aX 1 aX 2 aX 3 aX 4 aX dtype: object Specify just `stop`, meaning the start of the string to `stop` is replaced with `repl`, and the rest of the string is included. >>> s.str.slice_replace(stop=2, repl='X') 0 X 1 X 2 Xc 3 Xdc 4 Xcde dtype: object Specify `start` and `stop`, meaning the slice from `start` to `stop` is replaced with `repl`. Everything before or after `start` and `stop` is included as is. >>> s.str.slice_replace(start=1, stop=3, repl='X') 0 aX 1 aX 2 aX 3 aXc 4 aXde dtype: object", "response": "def str_slice_replace(arr, start=None, stop=None, repl=None):\n \"\"\"\n Replace a positional slice of a string with another value.\n\n Parameters\n ----------\n start : int, optional\n Left index position to use for the slice. 
If not specified (None),\n the slice is unbounded on the left, i.e. slice from the start\n of the string.\n stop : int, optional\n Right index position to use for the slice. If not specified (None),\n the slice is unbounded on the right, i.e. slice until the\n end of the string.\n repl : str, optional\n String for replacement. If not specified (None), the sliced region\n is replaced with an empty string.\n\n Returns\n -------\n Series or Index\n Same type as the original object.\n\n See Also\n --------\n Series.str.slice : Just slicing without replacement.\n\n Examples\n --------\n >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde'])\n >>> s\n 0 a\n 1 ab\n 2 abc\n 3 abdc\n 4 abcde\n dtype: object\n\n Specify just `start`, meaning replace `start` until the end of the\n string with `repl`.\n\n >>> s.str.slice_replace(1, repl='X')\n 0 aX\n 1 aX\n 2 aX\n 3 aX\n 4 aX\n dtype: object\n\n Specify just `stop`, meaning the start of the string to `stop` is replaced\n with `repl`, and the rest of the string is included.\n\n >>> s.str.slice_replace(stop=2, repl='X')\n 0 X\n 1 X\n 2 Xc\n 3 Xdc\n 4 Xcde\n dtype: object\n\n Specify `start` and `stop`, meaning the slice from `start` to `stop` is\n replaced with `repl`. Everything before or after `start` and `stop` is\n included as is.\n\n >>> s.str.slice_replace(start=1, stop=3, repl='X')\n 0 aX\n 1 aX\n 2 aX\n 3 aXc\n 4 aXde\n dtype: object\n \"\"\"\n if repl is None:\n repl = ''\n\n def f(x):\n if x[start:stop] == '':\n local_stop = start\n else:\n local_stop = stop\n y = ''\n if start is not None:\n y += x[:start]\n y += repl\n if stop is not None:\n y += x[local_stop:]\n return y\n\n return _na_map(f, arr)"} {"instruction": "Can you implement a function in Python 3 that\nstrips whitespace (including newlines) from each string in the Series/Index. 
Parameters ---------- to_strip : str or unicode side : {'left', 'right', 'both'}, default 'both' Returns ------- Series or Index", "response": "def str_strip(arr, to_strip=None, side='both'):\n \"\"\"\n Strip whitespace (including newlines) from each string in the\n Series/Index.\n\n Parameters\n ----------\n to_strip : str or unicode\n side : {'left', 'right', 'both'}, default 'both'\n\n Returns\n -------\n Series or Index\n \"\"\"\n if side == 'both':\n f = lambda x: x.strip(to_strip)\n elif side == 'left':\n f = lambda x: x.lstrip(to_strip)\n elif side == 'right':\n f = lambda x: x.rstrip(to_strip)\n else: # pragma: no cover\n raise ValueError('Invalid side')\n return _na_map(f, arr)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef str_wrap(arr, width, **kwargs):\n kwargs['width'] = width\n\n tw = textwrap.TextWrapper(**kwargs)\n\n return _na_map(lambda s: '\\n'.join(tw.wrap(s)), arr)", "response": "r Wrap long strings in a Series or Index."} {"instruction": "Can you create a Python 3 function that\nreturns the element from each component in the sequence at specified position.", "response": "def str_get(arr, i):\n \"\"\"\n Extract element from each component at specified position.\n\n Extract element from lists, tuples, or strings in each element in the\n Series/Index.\n\n Parameters\n ----------\n i : int\n Position of element to extract.\n\n Returns\n -------\n Series or Index\n\n Examples\n --------\n >>> s = pd.Series([\"String\",\n ... (1, 2, 3),\n ... [\"a\", \"b\", \"c\"],\n ... 123,\n ... -456,\n ... 
{1: \"Hello\", \"2\": \"World\"}])\n >>> s\n 0 String\n 1 (1, 2, 3)\n 2 [a, b, c]\n 3 123\n 4 -456\n 5 {1: 'Hello', '2': 'World'}\n dtype: object\n\n >>> s.str.get(1)\n 0 t\n 1 2\n 2 b\n 3 NaN\n 4 NaN\n 5 Hello\n dtype: object\n\n >>> s.str.get(-1)\n 0 g\n 1 3\n 2 c\n 3 NaN\n 4 NaN\n 5 None\n dtype: object\n \"\"\"\n def f(x):\n if isinstance(x, dict):\n return x.get(i)\n elif len(x) > i >= -len(x):\n return x[i]\n return np.nan\n return _na_map(f, arr)"} {"instruction": "Make a summary of the following Python 3 code\ndef str_decode(arr, encoding, errors=\"strict\"):\n if encoding in _cpython_optimized_decoders:\n # CPython optimized implementation\n f = lambda x: x.decode(encoding, errors)\n else:\n decoder = codecs.getdecoder(encoding)\n f = lambda x: decoder(x, errors)[0]\n return _na_map(f, arr)", "response": "Decode character string in the Series or Index using specified encoding."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef str_encode(arr, encoding, errors=\"strict\"):\n if encoding in _cpython_optimized_encoders:\n # CPython optimized implementation\n f = lambda x: x.encode(encoding, errors)\n else:\n encoder = codecs.getencoder(encoding)\n f = lambda x: encoder(x, errors)[0]\n return _na_map(f, arr)", "response": "Encode character string in the Series / Index using specified encoding."} {"instruction": "Explain what the following Python 3 code does\ndef copy(source):\n \"Copy a docstring from another source function (if present)\"\n\n def do_copy(target):\n if source.__doc__:\n target.__doc__ = source.__doc__\n return target\n\n return do_copy", "response": "Copy a docstring from another source function ( if present )"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _get_series_list(self, others, ignore_index=False):\n\n # Once str.cat defaults to alignment, this function can be simplified;\n # will not need `ignore_index` and the second boolean output 
anymore\n\n from pandas import Index, Series, DataFrame\n\n # self._orig is either Series or Index\n idx = self._orig if isinstance(self._orig, Index) else self._orig.index\n\n err_msg = ('others must be Series, Index, DataFrame, np.ndarrary or '\n 'list-like (either containing only strings or containing '\n 'only objects of type Series/Index/list-like/np.ndarray)')\n\n # Generally speaking, all objects without an index inherit the index\n # `idx` of the calling Series/Index - i.e. must have matching length.\n # Objects with an index (i.e. Series/Index/DataFrame) keep their own\n # index, *unless* ignore_index is set to True.\n if isinstance(others, Series):\n warn = not others.index.equals(idx)\n # only reconstruct Series when absolutely necessary\n los = [Series(others.values, index=idx)\n if ignore_index and warn else others]\n return (los, warn)\n elif isinstance(others, Index):\n warn = not others.equals(idx)\n los = [Series(others.values,\n index=(idx if ignore_index else others))]\n return (los, warn)\n elif isinstance(others, DataFrame):\n warn = not others.index.equals(idx)\n if ignore_index and warn:\n # without copy, this could change \"others\"\n # that was passed to str.cat\n others = others.copy()\n others.index = idx\n return ([others[x] for x in others], warn)\n elif isinstance(others, np.ndarray) and others.ndim == 2:\n others = DataFrame(others, index=idx)\n return ([others[x] for x in others], False)\n elif is_list_like(others, allow_sets=False):\n others = list(others) # ensure iterators do not get read twice etc\n\n # in case of list-like `others`, all elements must be\n # either one-dimensional list-likes or scalars\n if all(is_list_like(x, allow_sets=False) for x in others):\n los = []\n join_warn = False\n depr_warn = False\n # iterate through list and append list of series for each\n # element (which we check to be one-dimensional and non-nested)\n while others:\n nxt = others.pop(0) # nxt is guaranteed list-like by above\n\n # GH 21950 - 
DeprecationWarning\n # only allowing Series/Index/np.ndarray[1-dim] will greatly\n # simply this function post-deprecation.\n if not (isinstance(nxt, (Series, Index)) or\n (isinstance(nxt, np.ndarray) and nxt.ndim == 1)):\n depr_warn = True\n\n if not isinstance(nxt, (DataFrame, Series,\n Index, np.ndarray)):\n # safety for non-persistent list-likes (e.g. iterators)\n # do not map indexed/typed objects; info needed below\n nxt = list(nxt)\n\n # known types for which we can avoid deep inspection\n no_deep = ((isinstance(nxt, np.ndarray) and nxt.ndim == 1)\n or isinstance(nxt, (Series, Index)))\n # nested list-likes are forbidden:\n # -> elements of nxt must not be list-like\n is_legal = ((no_deep and nxt.dtype == object)\n or all(not is_list_like(x) for x in nxt))\n\n # DataFrame is false positive of is_legal\n # because \"x in df\" returns column names\n if not is_legal or isinstance(nxt, DataFrame):\n raise TypeError(err_msg)\n\n nxt, wnx = self._get_series_list(nxt,\n ignore_index=ignore_index)\n los = los + nxt\n join_warn = join_warn or wnx\n\n if depr_warn:\n warnings.warn('list-likes other than Series, Index, or '\n 'np.ndarray WITHIN another list-like are '\n 'deprecated and will be removed in a future '\n 'version.', FutureWarning, stacklevel=3)\n return (los, join_warn)\n elif all(not is_list_like(x) for x in others):\n return ([Series(others, index=idx)], False)\n raise TypeError(err_msg)", "response": "A method to convert a list of Series or Index objects into a list of Series."} {"instruction": "Make a summary of the following Python 3 code\ndef cat(self, others=None, sep=None, na_rep=None, join=None):\n from pandas import Index, Series, concat\n\n if isinstance(others, str):\n raise ValueError(\"Did you mean to supply a `sep` keyword?\")\n if sep is None:\n sep = ''\n\n if isinstance(self._orig, Index):\n data = Series(self._orig, index=self._orig)\n else: # Series\n data = self._orig\n\n # concatenate Series/Index with itself if no \"others\"\n if 
others is None:\n data = ensure_object(data)\n na_mask = isna(data)\n if na_rep is None and na_mask.any():\n data = data[~na_mask]\n elif na_rep is not None and na_mask.any():\n data = np.where(na_mask, na_rep, data)\n return sep.join(data)\n\n try:\n # turn anything in \"others\" into lists of Series\n others, warn = self._get_series_list(others,\n ignore_index=(join is None))\n except ValueError: # do not catch TypeError raised by _get_series_list\n if join is None:\n raise ValueError('All arrays must be same length, except '\n 'those having an index if `join` is not None')\n else:\n raise ValueError('If `others` contains arrays or lists (or '\n 'other list-likes without an index), these '\n 'must all be of the same length as the '\n 'calling Series/Index.')\n\n if join is None and warn:\n warnings.warn(\"A future version of pandas will perform index \"\n \"alignment when `others` is a Series/Index/\"\n \"DataFrame (or a list-like containing one). To \"\n \"disable alignment (the behavior before v.0.23) and \"\n \"silence this warning, use `.values` on any Series/\"\n \"Index/DataFrame in `others`. To enable alignment \"\n \"and silence this warning, pass `join='left'|\"\n \"'outer'|'inner'|'right'`. 
The future default will \"\n \"be `join='left'`.\", FutureWarning, stacklevel=2)\n\n # if join is None, _get_series_list already force-aligned indexes\n join = 'left' if join is None else join\n\n # align if required\n if any(not data.index.equals(x.index) for x in others):\n # Need to add keys for uniqueness in case of duplicate columns\n others = concat(others, axis=1,\n join=(join if join == 'inner' else 'outer'),\n keys=range(len(others)), sort=False, copy=False)\n data, others = data.align(others, join=join)\n others = [others[x] for x in others] # again list of Series\n\n all_cols = [ensure_object(x) for x in [data] + others]\n na_masks = np.array([isna(x) for x in all_cols])\n union_mask = np.logical_or.reduce(na_masks, axis=0)\n\n if na_rep is None and union_mask.any():\n # no na_rep means NaNs for all rows where any column has a NaN\n # only necessary if there are actually any NaNs\n result = np.empty(len(data), dtype=object)\n np.putmask(result, union_mask, np.nan)\n\n not_masked = ~union_mask\n result[not_masked] = cat_core([x[not_masked] for x in all_cols],\n sep)\n elif na_rep is not None and union_mask.any():\n # fill NaNs with na_rep in case there are actually any NaNs\n all_cols = [np.where(nm, na_rep, col)\n for nm, col in zip(na_masks, all_cols)]\n result = cat_core(all_cols, sep)\n else:\n # no NaNs - can just concatenate\n result = cat_core(all_cols, sep)\n\n if isinstance(self._orig, Index):\n # add dtype for case that result is all-NA\n result = Index(result, dtype=object, name=self._orig.name)\n else: # Series\n result = Series(result, dtype=object, index=data.index,\n name=self._orig.name)\n return result", "response": "Returns a string that is the concatenation of the strings in the Series or Index of the others."} {"instruction": "Write a Python 3 function that can\nreturn a new string with zeros at the left and length of the string.", "response": "def zfill(self, width):\n \"\"\"\n Pad strings in the Series/Index by prepending '0' 
characters.\n\n Strings in the Series/Index are padded with '0' characters on the\n left of the string to reach a total string length `width`. Strings\n in the Series/Index with length greater or equal to `width` are\n unchanged.\n\n Parameters\n ----------\n width : int\n Minimum length of resulting string; strings with length less\n than `width` be prepended with '0' characters.\n\n Returns\n -------\n Series/Index of objects\n\n See Also\n --------\n Series.str.rjust : Fills the left side of strings with an arbitrary\n character.\n Series.str.ljust : Fills the right side of strings with an arbitrary\n character.\n Series.str.pad : Fills the specified sides of strings with an arbitrary\n character.\n Series.str.center : Fills boths sides of strings with an arbitrary\n character.\n\n Notes\n -----\n Differs from :meth:`str.zfill` which has special handling\n for '+'/'-' in the string.\n\n Examples\n --------\n >>> s = pd.Series(['-1', '1', '1000', 10, np.nan])\n >>> s\n 0 -1\n 1 1\n 2 1000\n 3 10\n 4 NaN\n dtype: object\n\n Note that ``10`` and ``NaN`` are not strings, therefore they are\n converted to ``NaN``. The minus sign in ``'-1'`` is treated as a\n regular character and the zero is added to the left of it\n (:meth:`str.zfill` would have moved it to the left). 
``1000``\n remains unchanged as it is longer than `width`.\n\n >>> s.str.zfill(3)\n 0 0-1\n 1 001\n 2 1000\n 3 NaN\n 4 NaN\n dtype: object\n \"\"\"\n result = str_pad(self._parent, width, side='left', fillchar='0')\n return self._wrap_result(result)"} {"instruction": "Create a Python 3 function to\nreturn the Unicode normal form for the strings in the Series / Index.", "response": "def normalize(self, form):\n \"\"\"\n Return the Unicode normal form for the strings in the Series/Index.\n For more information on the forms, see the\n :func:`unicodedata.normalize`.\n\n Parameters\n ----------\n form : {'NFC', 'NFKC', 'NFD', 'NFKD'}\n Unicode form\n\n Returns\n -------\n normalized : Series/Index of objects\n \"\"\"\n import unicodedata\n f = lambda x: unicodedata.normalize(form, x)\n result = _na_map(f, self._parent)\n return self._wrap_result(result)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef get_sys_info():\n \"Returns system information as a dict\"\n\n blob = []\n\n # get full commit hash\n commit = None\n if os.path.isdir(\".git\") and os.path.isdir(\"pandas\"):\n try:\n pipe = subprocess.Popen('git log --format=\"%H\" -n 1'.split(\" \"),\n stdout=subprocess.PIPE,\n stderr=subprocess.PIPE)\n so, serr = pipe.communicate()\n except (OSError, ValueError):\n pass\n else:\n if pipe.returncode == 0:\n commit = so\n try:\n commit = so.decode('utf-8')\n except ValueError:\n pass\n commit = commit.strip().strip('\"')\n\n blob.append(('commit', commit))\n\n try:\n (sysname, nodename, release,\n version, machine, processor) = platform.uname()\n blob.extend([\n (\"python\", '.'.join(map(str, sys.version_info))),\n (\"python-bits\", struct.calcsize(\"P\") * 8),\n (\"OS\", \"{sysname}\".format(sysname=sysname)),\n (\"OS-release\", \"{release}\".format(release=release)),\n # (\"Version\", \"{version}\".format(version=version)),\n (\"machine\", \"{machine}\".format(machine=machine)),\n (\"processor\", 
\"{processor}\".format(processor=processor)),\n (\"byteorder\", \"{byteorder}\".format(byteorder=sys.byteorder)),\n (\"LC_ALL\", \"{lc}\".format(lc=os.environ.get('LC_ALL', \"None\"))),\n (\"LANG\", \"{lang}\".format(lang=os.environ.get('LANG', \"None\"))),\n (\"LOCALE\", '.'.join(map(str, locale.getlocale()))),\n ])\n except (KeyError, ValueError):\n pass\n\n return blob", "response": "Returns system information as a dict"} {"instruction": "How would you explain what the following Python 3 function does\ndef whitelist_method_generator(base, klass, whitelist):\n\n method_wrapper_template = \\\n \"\"\"def %(name)s(%(sig)s) :\n \\\"\"\"\n %(doc)s\n \\\"\"\"\n f = %(self)s.__getattr__('%(name)s')\n return f(%(args)s)\"\"\"\n property_wrapper_template = \\\n \"\"\"@property\ndef %(name)s(self) :\n \\\"\"\"%(doc)s\\\"\"\"\n return self.__getattr__('%(name)s')\"\"\"\n\n for name in whitelist:\n # don't override anything that was explicitly defined\n # in the base class\n if hasattr(base, name):\n continue\n # ugly, but we need the name string itself in the method.\n f = getattr(klass, name)\n doc = f.__doc__\n doc = doc if type(doc) == str else ''\n if isinstance(f, types.MethodType):\n wrapper_template = method_wrapper_template\n decl, args = make_signature(f)\n # pass args by name to f because otherwise\n # GroupBy._make_wrapper won't know whether\n # we passed in an axis parameter.\n args_by_name = ['{0}={0}'.format(arg) for arg in args[1:]]\n params = {'name': name,\n 'doc': doc,\n 'sig': ','.join(decl),\n 'self': args[0],\n 'args': ','.join(args_by_name)}\n else:\n wrapper_template = property_wrapper_template\n params = {'name': name, 'doc': doc}\n yield wrapper_template % params", "response": "Generates a generator that yields all GroupBy member defs for DataFrame or Series names in whitelist."} {"instruction": "How would you code a function in Python 3 to\nreturn a sliced object.", "response": "def _gotitem(self, key, ndim, subset=None):\n \"\"\"\n Sub-classes to 
define. Return a sliced object.\n\n Parameters\n ----------\n key : string / list of selections\n ndim : 1,2\n requested ndim of result\n subset : object, default None\n subset to act on\n \"\"\"\n # create a new object to prevent aliasing\n if subset is None:\n subset = self.obj\n\n # we need to make a shallow copy of ourselves\n # with the same groupby\n kwargs = {attr: getattr(self, attr) for attr in self._attributes}\n\n # Try to select from a DataFrame, falling back to a Series\n try:\n groupby = self._groupby[key]\n except IndexError:\n groupby = self._groupby\n\n self = self.__class__(subset,\n groupby=groupby,\n parent=self,\n **kwargs)\n self._reset_cache()\n if subset.ndim == 2:\n if is_scalar(key) and key in subset or is_list_like(key):\n self._selection = key\n return self"} {"instruction": "Given the following Python 3 function, write the documentation\ndef to_str(s):\n if isinstance(s, bytes):\n s = s.decode('utf-8')\n elif not isinstance(s, str):\n s = str(s)\n return s", "response": "Convert bytes and non - string into Python 3 str\n "} {"instruction": "Write a Python 3 script for\nbinding the name and qualname attributes of the function f to the class name and class name.", "response": "def set_function_name(f, name, cls):\n \"\"\"\n Bind the name/qualname attributes of the function\n \"\"\"\n f.__name__ = name\n f.__qualname__ = '{klass}.{name}'.format(\n klass=cls.__name__,\n name=name)\n f.__module__ = cls.__module__\n return f"} {"instruction": "Can you implement a function in Python 3 that\nraises exception with existing traceback.", "response": "def raise_with_traceback(exc, traceback=Ellipsis):\n \"\"\"\n Raise exception with existing traceback.\n If traceback is not passed, uses sys.exc_info() to get traceback.\n \"\"\"\n if traceback == Ellipsis:\n _, _, traceback = sys.exc_info()\n raise exc.with_traceback(traceback)"} {"instruction": "Create a Python 3 function for\nconverting a style dictionary to an openpyxl style object", "response": 
"def _convert_to_style(cls, style_dict):\n \"\"\"\n converts a style_dict to an openpyxl style object\n Parameters\n ----------\n style_dict : style dictionary to convert\n \"\"\"\n\n from openpyxl.style import Style\n xls_style = Style()\n for key, value in style_dict.items():\n for nk, nv in value.items():\n if key == \"borders\":\n (xls_style.borders.__getattribute__(nk)\n .__setattr__('border_style', nv))\n else:\n xls_style.__getattribute__(key).__setattr__(nk, nv)\n\n return xls_style"} {"instruction": "Can you implement a function in Python 3 that\nconverts a style_dict to a set of kwargs suitable for initializing an openpyxl v2 style object.", "response": "def _convert_to_style_kwargs(cls, style_dict):\n \"\"\"\n Convert a style_dict to a set of kwargs suitable for initializing\n or updating-on-copy an openpyxl v2 style object\n Parameters\n ----------\n style_dict : dict\n A dict with zero or more of the following keys (or their synonyms).\n 'font'\n 'fill'\n 'border' ('borders')\n 'alignment'\n 'number_format'\n 'protection'\n Returns\n -------\n style_kwargs : dict\n A dict with the same, normalized keys as ``style_dict`` but each\n value has been replaced with a native openpyxl style object of the\n appropriate class.\n \"\"\"\n\n _style_key_map = {\n 'borders': 'border',\n }\n\n style_kwargs = {}\n for k, v in style_dict.items():\n if k in _style_key_map:\n k = _style_key_map[k]\n _conv_to_x = getattr(cls, '_convert_to_{k}'.format(k=k),\n lambda x: None)\n new_v = _conv_to_x(v)\n if new_v:\n style_kwargs[k] = new_v\n\n return style_kwargs"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _convert_to_color(cls, color_spec):\n\n from openpyxl.styles import Color\n\n if isinstance(color_spec, str):\n return Color(color_spec)\n else:\n return Color(**color_spec)", "response": "Convert color_spec to an openpyxl v2 Color object."} {"instruction": "Can you write a function in Python 3 where it\nconverts font_dict to an 
openpyxl v2 Font object.", "response": "def _convert_to_font(cls, font_dict):\n \"\"\"\n Convert ``font_dict`` to an openpyxl v2 Font object\n Parameters\n ----------\n font_dict : dict\n A dict with zero or more of the following keys (or their synonyms).\n 'name'\n 'size' ('sz')\n 'bold' ('b')\n 'italic' ('i')\n 'underline' ('u')\n 'strikethrough' ('strike')\n 'color'\n 'vertAlign' ('vertalign')\n 'charset'\n 'scheme'\n 'family'\n 'outline'\n 'shadow'\n 'condense'\n Returns\n -------\n font : openpyxl.styles.Font\n \"\"\"\n\n from openpyxl.styles import Font\n\n _font_key_map = {\n 'sz': 'size',\n 'b': 'bold',\n 'i': 'italic',\n 'u': 'underline',\n 'strike': 'strikethrough',\n 'vertalign': 'vertAlign',\n }\n\n font_kwargs = {}\n for k, v in font_dict.items():\n if k in _font_key_map:\n k = _font_key_map[k]\n if k == 'color':\n v = cls._convert_to_color(v)\n font_kwargs[k] = v\n\n return Font(**font_kwargs)"} {"instruction": "Make a summary of the following Python 3 code\ndef _convert_to_fill(cls, fill_dict):\n\n from openpyxl.styles import PatternFill, GradientFill\n\n _pattern_fill_key_map = {\n 'patternType': 'fill_type',\n 'patterntype': 'fill_type',\n 'fgColor': 'start_color',\n 'fgcolor': 'start_color',\n 'bgColor': 'end_color',\n 'bgcolor': 'end_color',\n }\n\n _gradient_fill_key_map = {\n 'fill_type': 'type',\n }\n\n pfill_kwargs = {}\n gfill_kwargs = {}\n for k, v in fill_dict.items():\n pk = gk = None\n if k in _pattern_fill_key_map:\n pk = _pattern_fill_key_map[k]\n if k in _gradient_fill_key_map:\n gk = _gradient_fill_key_map[k]\n if pk in ['start_color', 'end_color']:\n v = cls._convert_to_color(v)\n if gk == 'stop':\n v = cls._convert_to_stop(v)\n if pk:\n pfill_kwargs[pk] = v\n elif gk:\n gfill_kwargs[gk] = v\n else:\n pfill_kwargs[k] = v\n gfill_kwargs[k] = v\n\n try:\n return PatternFill(**pfill_kwargs)\n except TypeError:\n return GradientFill(**gfill_kwargs)", "response": "Convert a dict of fill attributes to an openpyxl v2 Fill object."} 
{"instruction": "Write a Python 3 function for\nconverting a dict of keys to an openpyxl v2 Side object.", "response": "def _convert_to_side(cls, side_spec):\n \"\"\"\n Convert ``side_spec`` to an openpyxl v2 Side object\n Parameters\n ----------\n side_spec : str, dict\n A string specifying the border style, or a dict with zero or more\n of the following keys (or their synonyms).\n 'style' ('border_style')\n 'color'\n Returns\n -------\n side : openpyxl.styles.Side\n \"\"\"\n\n from openpyxl.styles import Side\n\n _side_key_map = {\n 'border_style': 'style',\n }\n\n if isinstance(side_spec, str):\n return Side(style=side_spec)\n\n side_kwargs = {}\n for k, v in side_spec.items():\n if k in _side_key_map:\n k = _side_key_map[k]\n if k == 'color':\n v = cls._convert_to_color(v)\n side_kwargs[k] = v\n\n return Side(**side_kwargs)"} {"instruction": "Make a summary of the following Python 3 code\ndef _convert_to_border(cls, border_dict):\n\n from openpyxl.styles import Border\n\n _border_key_map = {\n 'diagonalup': 'diagonalUp',\n 'diagonaldown': 'diagonalDown',\n }\n\n border_kwargs = {}\n for k, v in border_dict.items():\n if k in _border_key_map:\n k = _border_key_map[k]\n if k == 'color':\n v = cls._convert_to_color(v)\n if k in ['left', 'right', 'top', 'bottom', 'diagonal']:\n v = cls._convert_to_side(v)\n border_kwargs[k] = v\n\n return Border(**border_kwargs)", "response": "Convert dict to openpyxl v2 Border object"} {"instruction": "Can you write a function in Python 3 where it\nconstructs and return a row or column based frame apply object", "response": "def frame_apply(obj, func, axis=0, broadcast=None,\n raw=False, reduce=None, result_type=None,\n ignore_failures=False,\n args=None, kwds=None):\n \"\"\" construct and return a row or column based frame apply object \"\"\"\n\n axis = obj._get_axis_number(axis)\n if axis == 0:\n klass = FrameRowApply\n elif axis == 1:\n klass = FrameColumnApply\n\n return klass(obj, func, broadcast=broadcast,\n raw=raw, 
reduce=reduce, result_type=result_type,\n ignore_failures=ignore_failures,\n args=args, kwds=kwds)"} {"instruction": "Write a Python 3 function for\ncomputing the result of the function", "response": "def get_result(self):\n \"\"\" compute the results \"\"\"\n\n # dispatch to agg\n if is_list_like(self.f) or is_dict_like(self.f):\n return self.obj.aggregate(self.f, axis=self.axis,\n *self.args, **self.kwds)\n\n # all empty\n if len(self.columns) == 0 and len(self.index) == 0:\n return self.apply_empty_result()\n\n # string dispatch\n if isinstance(self.f, str):\n # Support for `frame.transform('method')`\n # Some methods (shift, etc.) require the axis argument, others\n # don't, so inspect and insert if necessary.\n func = getattr(self.obj, self.f)\n sig = inspect.getfullargspec(func)\n if 'axis' in sig.args:\n self.kwds['axis'] = self.axis\n return func(*self.args, **self.kwds)\n\n # ufunc\n elif isinstance(self.f, np.ufunc):\n with np.errstate(all='ignore'):\n results = self.obj._data.apply('apply', func=self.f)\n return self.obj._constructor(data=results, index=self.index,\n columns=self.columns, copy=False)\n\n # broadcasting\n if self.result_type == 'broadcast':\n return self.apply_broadcast()\n\n # one axis empty\n elif not all(self.obj.shape):\n return self.apply_empty_result()\n\n # raw\n elif self.raw and not self.obj._is_mixed_type:\n return self.apply_raw()\n\n return self.apply_standard()"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef apply_raw(self):\n\n try:\n result = reduction.reduce(self.values, self.f, axis=self.axis)\n except Exception:\n result = np.apply_along_axis(self.f, self.axis, self.values)\n\n # TODO: mixed type case\n if result.ndim == 2:\n return self.obj._constructor(result,\n index=self.index,\n columns=self.columns)\n else:\n return self.obj._constructor_sliced(result,\n index=self.agg_axis)", "response": "apply to the values as a numpy array"} {"instruction": "Write a Python 3 script 
for\nwrapping the results for the rows", "response": "def wrap_results_for_axis(self):\n \"\"\" return the results for the rows \"\"\"\n\n results = self.results\n result = self.obj._constructor(data=results)\n\n if not isinstance(results[0], ABCSeries):\n try:\n result.index = self.res_columns\n except ValueError:\n pass\n\n try:\n result.columns = self.res_index\n except ValueError:\n pass\n\n return result"} {"instruction": "Can you implement a function in Python 3 that\nwraps the results for the columns", "response": "def wrap_results_for_axis(self):\n \"\"\" return the results for the columns \"\"\"\n results = self.results\n\n # we have requested to expand\n if self.result_type == 'expand':\n result = self.infer_to_same_shape()\n\n # we have a non-series and don't want inference\n elif not isinstance(results[0], ABCSeries):\n from pandas import Series\n result = Series(results)\n result.index = self.res_index\n\n # we may want to infer results\n else:\n result = self.infer_to_same_shape()\n\n return result"} {"instruction": "Write a Python 3 function that can\ninfer the results to the same shape as the input object", "response": "def infer_to_same_shape(self):\n \"\"\" infer the results to the same shape as the input object \"\"\"\n results = self.results\n\n result = self.obj._constructor(data=results)\n result = result.T\n\n # set the index\n result.index = self.res_index\n\n # infer dtypes\n result = result.infer_objects()\n\n return result"} {"instruction": "Make a summary of the following Python 3 code\ndef cartesian_product(X):\n msg = \"Input must be a list-like of list-likes\"\n if not is_list_like(X):\n raise TypeError(msg)\n for x in X:\n if not is_list_like(x):\n raise TypeError(msg)\n\n if len(X) == 0:\n return []\n\n lenX = np.fromiter((len(x) for x in X), dtype=np.intp)\n cumprodX = np.cumproduct(lenX)\n\n a = np.roll(cumprodX, 1)\n a[0] = 1\n\n if cumprodX[-1] != 0:\n b = cumprodX[-1] / cumprodX\n else:\n # if any factor is empty, the cartesian 
product is empty\n b = np.zeros_like(cumprodX)\n\n return [np.tile(np.repeat(np.asarray(com.values_from_object(x)), b[i]),\n np.product(a[i]))\n for i, x in enumerate(X)]", "response": "Returns the cartesian product of the input iterables."} {"instruction": "Explain what the following Python 3 code does\ndef _strip_schema(url):\n result = parse_url(url, allow_fragments=False)\n return result.netloc + result.path", "response": "Returns the url without the s3:// part"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef xception(c, k=8, n_middle=8):\n \"Preview version of Xception network. Not tested yet - use at own risk. No pretrained model yet.\"\n layers = [\n conv(3, k*4, 3, 2),\n conv(k*4, k*8, 3),\n ConvSkip(k*8, k*16, act=False),\n ConvSkip(k*16, k*32),\n ConvSkip(k*32, k*91),\n ]\n for i in range(n_middle): layers.append(middle_flow(k*91))\n layers += [\n ConvSkip(k*91,k*128),\n sep_conv(k*128,k*192,act=False),\n sep_conv(k*192,k*256),\n nn.ReLU(),\n nn.AdaptiveAvgPool2d(1),\n Flatten(),\n nn.Linear(k*256,c)\n ]\n return nn.Sequential(*layers)", "response": "Preview version of Xception network. Not tested yet - use at own risk. 
No pretrained model yet."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef get_model(self, opt_fn, emb_sz, n_hid, n_layers, **kwargs):\n m = get_language_model(self.nt, emb_sz, n_hid, n_layers, self.pad_idx, **kwargs)\n model = SingleModel(to_gpu(m))\n return RNN_Learner(self, model, opt_fn=opt_fn)", "response": "Method returns an instance of the RNN_Learner class that wraps an instance of the RNN_Encoder module."} {"instruction": "Make a summary of the following Python 3 code\ndef get_files(path:PathOrStr, extensions:Collection[str]=None, recurse:bool=False,\n include:Optional[Collection[str]]=None)->FilePathList:\n \"Return list of files in `path` that have a suffix in `extensions`; optionally `recurse`.\"\n if recurse:\n res = []\n for i,(p,d,f) in enumerate(os.walk(path)):\n # skip hidden dirs\n if include is not None and i==0: d[:] = [o for o in d if o in include]\n else: d[:] = [o for o in d if not o.startswith('.')]\n res += _get_files(path, p, f, extensions)\n return res\n else:\n f = [o.name for o in os.scandir(path) if o.is_file()]\n return _get_files(path, path, f, extensions)", "response": "Return list of files in path that have a suffix in extensions ; optionally recurse."} {"instruction": "How would you explain what the following Python 3 function does\ndef _databunch_load_empty(cls, path, fname:str='export.pkl'):\n \"Load an empty `DataBunch` from the exported file in `path/fname` with optional `tfms`.\"\n sd = LabelLists.load_empty(path, fn=fname)\n return sd.databunch()", "response": "Load an empty DataBunch from the exported file in path / fname with optional tfms."} {"instruction": "Write a Python 3 script for\napplying processor or self. 
processor to self.", "response": "def process(self, processor:PreProcessors=None):\n \"Apply `processor` or `self.processor` to `self`.\"\n if processor is not None: self.processor = processor\n self.processor = listify(self.processor)\n for p in self.processor: p.process(self)\n return self"} {"instruction": "Write a Python 3 function for\napplying processor or self. processor to item.", "response": "def process_one(self, item:ItemBase, processor:PreProcessors=None):\n \"Apply `processor` or `self.processor` to `item`.\"\n if processor is not None: self.processor = processor\n self.processor = listify(self.processor)\n for p in self.processor: item = p.process_one(item)\n return item"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef reconstruct(self, t:Tensor, x:Tensor=None):\n \"Reconstruct one of the underlying item for its data `t`.\"\n return self[0].reconstruct(t,x) if has_arg(self[0].reconstruct, 'x') else self[0].reconstruct(t)", "response": "Reconstruct one of the underlying item for its data t."} {"instruction": "Explain what the following Python 3 code does\ndef new(self, items:Iterator, processor:PreProcessors=None, **kwargs)->'ItemList':\n \"Create a new `ItemList` from `items`, keeping the same attributes.\"\n processor = ifnone(processor, self.processor)\n copy_d = {o:getattr(self,o) for o in self.copy_new}\n kwargs = {**copy_d, **kwargs}\n return self.__class__(items=items, processor=processor, **kwargs)", "response": "Create a new ItemList from items keeping the same attributes."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef from_folder(cls, path:PathOrStr, extensions:Collection[str]=None, recurse:bool=True,\n include:Optional[Collection[str]]=None, processor:PreProcessors=None, **kwargs)->'ItemList':\n \"\"\"Create an `ItemList` in `path` from the filenames that have a suffix in `extensions`.\n `recurse` determines if we search subfolders.\"\"\"\n path = Path(path)\n 
return cls(get_files(path, extensions, recurse=recurse, include=include), path=path, processor=processor, **kwargs)", "response": "Create an ItemList in path from the files in path that have a suffix in extensions."} {"instruction": "Can you create a Python 3 function that\ncreates an ItemList in path from the inputs in the cols of df.", "response": "def from_df(cls, df:DataFrame, path:PathOrStr='.', cols:IntsOrStrs=0, processor:PreProcessors=None, **kwargs)->'ItemList':\n \"Create an `ItemList` in `path` from the inputs in the `cols` of `df`.\"\n inputs = df.iloc[:,df_names_to_idx(cols, df)]\n assert inputs.isna().sum().sum() == 0, f\"You have NaN values in column(s) {cols} of your dataframe, please fix it.\"\n res = cls(items=_maybe_squeeze(inputs.values), path=path, inner_df=df, processor=processor, **kwargs)\n return res"} {"instruction": "Write a Python 3 function for\ncreating an ItemList in path from the inputs in csv_name.", "response": "def from_csv(cls, path:PathOrStr, csv_name:str, cols:IntsOrStrs=0, delimiter:str=None, header:str='infer',\n processor:PreProcessors=None, **kwargs)->'ItemList':\n \"\"\"Create an `ItemList` in `path` from the inputs in the `cols` of `path/csv_name`\"\"\"\n df = pd.read_csv(Path(path)/csv_name, delimiter=delimiter, header=header)\n return cls.from_df(df, path=path, cols=cols, processor=processor, **kwargs)"} {"instruction": "How would you explain what the following Python 3 function does\ndef use_partial_data(self, sample_pct:float=0.01, seed:int=None)->'ItemList':\n \"Use only a sample of `sample_pct`of the full dataset and an optional `seed`.\"\n if seed is not None: np.random.seed(seed)\n rand_idx = np.random.permutation(range_of(self))\n cut = int(sample_pct * len(self))\n return self[rand_idx[:cut]]", "response": "Use only a sample of sample_pct of the full dataset and an optional seed."} {"instruction": "Write a Python 3 script for\nsaving self. items to fn in self. 
path.", "response": "def to_text(self, fn:str):\n \"Save `self.items` to `fn` in `self.path`.\"\n with open(self.path/fn, 'w') as f: f.writelines([f'{o}\\n' for o in self._relative_item_paths()])"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef filter_by_folder(self, include=None, exclude=None):\n \"Only keep filenames in `include` folder or reject the ones in `exclude`.\"\n include,exclude = listify(include),listify(exclude)\n def _inner(o):\n if isinstance(o, Path): n = o.relative_to(self.path).parts[0]\n else: n = o.split(os.path.sep)[len(str(self.path).split(os.path.sep))]\n if include and not n in include: return False\n if exclude and n in exclude: return False\n return True\n return self.filter_by_func(_inner)", "response": "Only keep filenames in include folder or reject the ones in exclude."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef filter_by_rand(self, p:float, seed:int=None):\n \"Keep random sample of `items` with probability `p` and an optional `seed`.\"\n if seed is not None: np.random.seed(seed)\n return self.filter_by_func(lambda o: rand_bool(p))", "response": "Keep random sample of items with probability p and an optional seed."} {"instruction": "Explain what the following Python 3 code does\ndef split_none(self):\n \"Don't split the data and create an empty validation set.\"\n val = self[[]]\n val.ignore_empty = True\n return self._split(self.path, self, val)", "response": "Don t split the data and create an empty validation set."} {"instruction": "Can you create a Python 3 function that\nsplits the data between train and valid.", "response": "def split_by_list(self, train, valid):\n \"Split the data between `train` and `valid`.\"\n return self._split(self.path, train, valid)"} {"instruction": "Write a Python 3 script to\nsplit the data between train_idx and valid_idx.", "response": "def split_by_idxs(self, train_idx, valid_idx):\n \"Split the data between 
`train_idx` and `valid_idx`.\"\n return self.split_by_list(self[train_idx], self[valid_idx])"} {"instruction": "Can you implement a function in Python 3 that\nsplits the data according to the indexes in valid_idx.", "response": "def split_by_idx(self, valid_idx:Collection[int])->'ItemLists':\n \"Split the data according to the indexes in `valid_idx`.\"\n #train_idx = [i for i in range_of(self.items) if i not in valid_idx]\n train_idx = np.setdiff1d(arange_of(self.items), valid_idx)\n return self.split_by_idxs(train_idx, valid_idx)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef split_by_folder(self, train:str='train', valid:str='valid')->'ItemLists':\n \"Split the data depending on the folder (`train` or `valid`) in which the filenames are.\"\n return self.split_by_idxs(self._get_by_folder(train), self._get_by_folder(valid))", "response": "Split the data depending on the folder ( train or valid ) in which the filenames are."} {"instruction": "Implement a function in Python 3 to\nsplit the items randomly by putting valid_pct in the validation set optional seed can be passed.", "response": "def split_by_rand_pct(self, valid_pct:float=0.2, seed:int=None)->'ItemLists':\n \"Split the items randomly by putting `valid_pct` in the validation set, optional `seed` can be passed.\"\n if valid_pct==0.: return self.split_none()\n if seed is not None: np.random.seed(seed)\n rand_idx = np.random.permutation(range_of(self))\n cut = int(valid_pct * len(self))\n return self.split_by_idx(rand_idx[:cut])"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef split_subsets(self, train_size:float, valid_size:float, seed=None) -> 'ItemLists':\n \"Split the items into train set with size `train_size * n` and valid set with size `valid_size * n`.\"\n assert 0 < train_size < 1\n assert 0 < valid_size < 1\n assert train_size + valid_size <= 1.\n if seed is not None: np.random.seed(seed)\n n = len(self.items)\n rand_idx 
= np.random.permutation(range(n))\n train_cut, valid_cut = int(train_size * n), int(valid_size * n)\n return self.split_by_idxs(rand_idx[:train_cut], rand_idx[-valid_cut:])", "response": "Split the items into train set with size train_size * n and valid set with size valid_size * n."} {"instruction": "How would you code a function in Python 3 to\nsplit the data by result of func which returns True for validation set.", "response": "def split_by_valid_func(self, func:Callable)->'ItemLists':\n \"Split the data by result of `func` (which returns `True` for validation set).\"\n valid_idx = [i for i,o in enumerate(self.items) if func(o)]\n return self.split_by_idx(valid_idx)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef split_by_files(self, valid_names:'ItemList')->'ItemLists':\n \"Split the data by using the names in `valid_names` for validation.\"\n if isinstance(self.items[0], Path): return self.split_by_valid_func(lambda o: o.name in valid_names)\n else: return self.split_by_valid_func(lambda o: os.path.basename(o) in valid_names)", "response": "Split the data by using the names in valid_names for validation."} {"instruction": "Given the following Python 3 function, write the documentation\ndef split_by_fname_file(self, fname:PathOrStr, path:PathOrStr=None)->'ItemLists':\n \"Split the data by using the names in `fname` for the validation set. `path` will override `self.path`.\"\n path = Path(ifnone(path, self.path))\n valid_names = loadtxt_str(path/fname)\n return self.split_by_files(valid_names)", "response": "Split the data by using the names in fname for the validation set. path will override self. path."} {"instruction": "How would you implement a function in Python 3 that\nsplits the data from the col in the dataframe in self. 
inner_df.", "response": "def split_from_df(self, col:IntsOrStrs=2):\n \"Split the data from the `col` in the dataframe in `self.inner_df`.\"\n valid_idx = np.where(self.inner_df.iloc[:,df_names_to_idx(col, self.inner_df)])[0]\n return self.split_by_idx(valid_idx)"} {"instruction": "Implement a function in Python 3 to\nreturn label_cls or guess one from the first element of labels.", "response": "def get_label_cls(self, labels, label_cls:Callable=None, label_delim:str=None, **kwargs):\n \"Return `label_cls` or guess one from the first element of `labels`.\"\n if label_cls is not None: return label_cls\n if self.label_cls is not None: return self.label_cls\n if label_delim is not None: return MultiCategoryList\n it = index_row(labels,0)\n if isinstance(it, (float, np.float32)): return FloatList\n if isinstance(try_int(it), (str, Integral)): return CategoryList\n if isinstance(it, Collection): return MultiCategoryList\n return ItemList"} {"instruction": "Can you implement a function in Python 3 that\nlabels self. 
items with labels.", "response": "def _label_from_list(self, labels:Iterator, label_cls:Callable=None, from_item_lists:bool=False, **kwargs)->'LabelList':\n \"Label `self.items` with `labels`.\"\n if not from_item_lists:\n raise Exception(\"Your data isn't split, if you don't want a validation set, please use `split_none`.\")\n labels = array(labels, dtype=object)\n label_cls = self.get_label_cls(labels, label_cls=label_cls, **kwargs)\n y = label_cls(labels, path=self.path, **kwargs)\n res = self._label_list(x=self, y=y)\n return res"} {"instruction": "Given the following Python 3 function, write the documentation\ndef label_from_df(self, cols:IntsOrStrs=1, label_cls:Callable=None, **kwargs):\n \"Label `self.items` from the values in `cols` in `self.inner_df`.\"\n labels = self.inner_df.iloc[:,df_names_to_idx(cols, self.inner_df)]\n assert labels.isna().sum().sum() == 0, f\"You have NaN values in column(s) {cols} of your dataframe, please fix it.\"\n if is_listy(cols) and len(cols) > 1 and (label_cls is None or label_cls == MultiCategoryList):\n new_kwargs,label_cls = dict(one_hot=True, classes= cols),MultiCategoryList\n kwargs = {**new_kwargs, **kwargs}\n return self._label_from_list(_maybe_squeeze(labels), label_cls=label_cls, **kwargs)", "response": "Label self. items from the values in cols in self. 
inner_df."} {"instruction": "Write a Python 3 function that can\nlabel every item with const.", "response": "def label_const(self, const:Any=0, label_cls:Callable=None, **kwargs)->'LabelList':\n \"Label every item with `const`.\"\n return self.label_from_func(func=lambda o: const, label_cls=label_cls, **kwargs)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef label_empty(self, **kwargs):\n \"Label every item with an `EmptyLabel`.\"\n kwargs['label_cls'] = EmptyLabelList\n return self.label_from_func(func=lambda o: 0., **kwargs)", "response": "Label every item with an EmptyLabel."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef label_from_func(self, func:Callable, label_cls:Callable=None, **kwargs)->'LabelList':\n \"Apply `func` to every input to get its label.\"\n return self._label_from_list([func(o) for o in self.items], label_cls=label_cls, **kwargs)", "response": "Apply func to every input to get its label."} {"instruction": "Make a summary of the following Python 3 code\ndef label_from_folder(self, label_cls:Callable=None, **kwargs)->'LabelList':\n \"Give a label to each filename depending on its folder.\"\n return self.label_from_func(func=lambda o: (o.parts if isinstance(o, Path) else o.split(os.path.sep))[-2],\n label_cls=label_cls, **kwargs)", "response": "Give a label to each filename depending on its folder."} {"instruction": "Implement a Python 3 function for\napplying the re in pat to determine the label of every filename. If full_path search in the full name.", "response": "def label_from_re(self, pat:str, full_path:bool=False, label_cls:Callable=None, **kwargs)->'LabelList':\n \"Apply the re in `pat` to determine the label of every filename. 
If `full_path`, search in the full name.\"\n pat = re.compile(pat)\n def _inner(o):\n s = str((os.path.join(self.path,o) if full_path else o).as_posix())\n res = pat.search(s)\n assert res,f'Failed to find \"{pat}\" in \"{s}\"'\n return res.group(1)\n return self.label_from_func(_inner, label_cls=label_cls, **kwargs)"} {"instruction": "Create a Python 3 function for\ngenerating classes from items by taking the sorted unique values.", "response": "def generate_classes(self, items):\n \"Generate classes from `items` by taking the sorted unique values.\"\n classes = set()\n for c in items: classes = classes.union(set(c))\n classes = list(classes)\n classes.sort()\n return classes"} {"instruction": "How would you explain what the following Python 3 function does\ndef label_from_lists(self, train_labels:Iterator, valid_labels:Iterator, label_cls:Callable=None, **kwargs)->'LabelList':\n \"Use the labels in `train_labels` and `valid_labels` to label the data. `label_cls` will overwrite the default.\"\n label_cls = self.train.get_label_cls(train_labels, label_cls)\n self.train = self.train._label_list(x=self.train, y=label_cls(train_labels, **kwargs))\n self.valid = self.valid._label_list(x=self.valid, y=self.train.y.new(valid_labels, **kwargs))\n self.__class__ = LabelLists\n self.process()\n return self", "response": "Use the labels in train_labels and valid_labels to label the data. 
label_cls will overwrite the default."} {"instruction": "Make a summary of the following Python 3 code\ndef transform(self, tfms:Optional[Tuple[TfmList,TfmList]]=(None,None), **kwargs):\n \"Set `tfms` to be applied to the xs of the train and validation set.\"\n if not tfms: tfms=(None,None)\n assert is_listy(tfms) and len(tfms) == 2, \"Please pass a list of two lists of transforms (train and valid).\"\n self.train.transform(tfms[0], **kwargs)\n self.valid.transform(tfms[1], **kwargs)\n if self.test: self.test.transform(tfms[1], **kwargs)\n return self", "response": "Set tfms to be applied to the xs of the train and validation set."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef transform_y(self, tfms:Optional[Tuple[TfmList,TfmList]]=(None,None), **kwargs):\n \"Set `tfms` to be applied to the ys of the train and validation set.\"\n if not tfms: tfms=(None,None)\n self.train.transform_y(tfms[0], **kwargs)\n self.valid.transform_y(tfms[1], **kwargs)\n if self.test: self.test.transform_y(tfms[1], **kwargs)\n return self", "response": "Set tfms to be applied to the ys of the train and validation set."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef get_processors(self):\n \"Read the default class processors if none have been set.\"\n procs_x,procs_y = listify(self.train.x._processor),listify(self.train.y._processor)\n xp = ifnone(self.train.x.processor, [p(ds=self.train.x) for p in procs_x])\n yp = ifnone(self.train.y.processor, [p(ds=self.train.y) for p in procs_y])\n return xp,yp", "response": "Read the default class processors if none have been set."} {"instruction": "How would you implement a function in Python 3 that\nprocesses the inner datasets.", "response": "def process(self):\n \"Process the inner datasets.\"\n xp,yp = self.get_processors()\n for ds,n in zip(self.lists, ['train','valid','test']): ds.process(xp, yp, name=n)\n #progress_bar clear the outputs so in some case warnings 
issued during processing disappear.\n for ds in self.lists:\n if getattr(ds, 'warn', False): warn(ds.warn)\n return self"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef databunch(self, path:PathOrStr=None, bs:int=64, val_bs:int=None, num_workers:int=defaults.cpus,\n dl_tfms:Optional[Collection[Callable]]=None, device:torch.device=None, collate_fn:Callable=data_collate,\n no_check:bool=False, **kwargs)->'DataBunch':\n \"Create an `DataBunch` from self, `path` will override `self.path`, `kwargs` are passed to `DataBunch.create`.\"\n path = Path(ifnone(path, self.path))\n data = self.x._bunch.create(self.train, self.valid, test_ds=self.test, path=path, bs=bs, val_bs=val_bs,\n num_workers=num_workers, device=device, collate_fn=collate_fn, no_check=no_check, **kwargs)\n if getattr(self, 'normalize', False):#In case a normalization was serialized\n norm = self.normalize\n data.normalize((norm['mean'], norm['std']), do_x=norm['do_x'], do_y=norm['do_y'])\n data.label_list = self\n return data", "response": "Create an DataBunch from self path will override self. path kwargs are passed to DataBunch. 
create."} {"instruction": "Implement a function in Python 3 to\ncreate a LabelLists with empty sets from the serialized state.", "response": "def load_state(cls, path:PathOrStr, state:dict):\n \"Create a `LabelLists` with empty sets from the serialized `state`.\"\n path = Path(path)\n train_ds = LabelList.load_state(path, state)\n valid_ds = LabelList.load_state(path, state)\n return LabelLists(path, train=train_ds, valid=valid_ds)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef load_empty(cls, path:PathOrStr, fn:PathOrStr='export.pkl'):\n \"Create a `LabelLists` with empty sets from the serialized file in `path/fn`.\"\n path = Path(path)\n state = torch.load(open(path/fn, 'rb'))\n return LabelLists.load_state(path, state)", "response": "Create a LabelLists with empty sets from the serialized file in path / fn."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef set_item(self,item):\n \"For inference, will briefly replace the dataset with one that only contains `item`.\"\n self.item = self.x.process_one(item)\n yield None\n self.item = None", "response": "For inference will briefly replace the dataset with one that only contains item."} {"instruction": "Implement a Python 3 function for\ncreating pd. DataFrame containing items from self. x and self. y.", "response": "def to_df(self)->None:\n \"Create `pd.DataFrame` containing `items` from `self.x` and `self.y`.\"\n return pd.DataFrame(dict(x=self.x._relative_item_paths(), y=[str(o) for o in self.y]))"} {"instruction": "Given the following Python 3 function, write the documentation\ndef to_csv(self, dest:str)->None:\n \"Save `self.to_df()` to a CSV file in `self.path`/`dest`.\"\n self.to_df().to_csv(self.path/dest, index=False)", "response": "Save self. to_df to a CSV file in self. 
path / dest."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef get_state(self, **kwargs):\n \"Return the minimal state for export.\"\n state = {'x_cls':self.x.__class__, 'x_proc':self.x.processor,\n 'y_cls':self.y.__class__, 'y_proc':self.y.processor,\n 'tfms':self.tfms, 'tfm_y':self.tfm_y, 'tfmargs':self.tfmargs}\n if hasattr(self, 'tfms_y'): state['tfms_y'] = self.tfms_y\n if hasattr(self, 'tfmargs_y'): state['tfmargs_y'] = self.tfmargs_y\n return {**state, **kwargs}", "response": "Return the minimal state for export."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef export(self, fn:PathOrStr, **kwargs):\n \"Export the minimal state and save it in `fn` to load an empty version for inference.\"\n pickle.dump(self.get_state(**kwargs), open(fn, 'wb'))", "response": "Export the minimal state and save it in fn to load an empty version for inference."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef load_empty(cls, path:PathOrStr, fn:PathOrStr):\n \"Load the state in `fn` to create an empty `LabelList` for inference.\"\n return cls.load_state(path, pickle.load(open(Path(path)/fn, 'rb')))", "response": "Load the state in fn to create an empty LabelList for inference."} {"instruction": "Explain what the following Python 3 code does\ndef load_state(cls, path:PathOrStr, state:dict) -> 'LabelList':\n \"Create a `LabelList` from `state`.\"\n x = state['x_cls']([], path=path, processor=state['x_proc'], ignore_empty=True)\n y = state['y_cls']([], path=path, processor=state['y_proc'], ignore_empty=True)\n res = cls(x, y, tfms=state['tfms'], tfm_y=state['tfm_y'], **state['tfmargs']).process()\n if state.get('tfms_y', False): res.tfms_y = state['tfms_y']\n if state.get('tfmargs_y', False): res.tfmargs_y = state['tfmargs_y']\n if state.get('normalize', False): res.normalize = state['normalize']\n return res", "response": "Create a LabelList from state."} 
{"instruction": "How would you implement a function in Python 3 that\nlaunches the processing on self. x and self. y with xp and yp.", "response": "def process(self, xp:PreProcessor=None, yp:PreProcessor=None, name:str=None):\n \"Launch the processing on `self.x` and `self.y` with `xp` and `yp`.\"\n self.y.process(yp)\n if getattr(self.y, 'filter_missing_y', False):\n filt = array([o is None for o in self.y.items])\n if filt.sum()>0:\n #Warnings are given later since progress_bar might make them disappear.\n self.warn = f\"You are labelling your items with {self.y.__class__.__name__}.\\n\"\n self.warn += f\"Your {name} set contained the following unknown labels, the corresponding items have been discarded.\\n\"\n for p in self.y.processor:\n if len(getattr(p, 'warns', [])) > 0:\n warnings = list(set(p.warns))\n self.warn += ', '.join(warnings[:5])\n if len(warnings) > 5: self.warn += \"...\"\n p.warns = []\n self.x,self.y = self.x[~filt],self.y[~filt]\n self.x.process(xp)\n return self"} {"instruction": "Create a Python 3 function for\nsetting the tfms and tfm_y value to be applied to the inputs and targets.", "response": "def transform(self, tfms:TfmList, tfm_y:bool=None, **kwargs):\n \"Set the `tfms` and `tfm_y` value to be applied to the inputs and targets.\"\n _check_kwargs(self.x, tfms, **kwargs)\n if tfm_y is None: tfm_y = self.tfm_y\n if tfm_y: _check_kwargs(self.y, tfms, **kwargs)\n self.tfms, self.tfmargs = tfms,kwargs\n self.tfm_y, self.tfmargs_y = tfm_y,kwargs\n self.tfms_y = None if tfms is None else list(filter(lambda t: t.use_on_y, listify(tfms)))\n return self"} {"instruction": "Write a Python 3 function for\nsetting tfms to be applied to the targets only.", "response": "def transform_y(self, tfms:TfmList=None, **kwargs):\n \"Set `tfms` to be applied to the targets only.\"\n _check_kwargs(self.y, tfms, **kwargs)\n self.tfm_y=True\n if tfms is None:\n self.tfms_y = list(filter(lambda t: t.use_on_y, listify(self.tfms)))\n self.tfmargs_y = 
{**self.tfmargs, **kwargs}\n else:\n tfms = list(filter(lambda t: t.use_on_y, tfms))\n self.tfms_y,self.tfmargs_y = tfms,kwargs\n return self"} {"instruction": "Can you implement a function in Python 3 that\ncreates a new ItemList from items keeping the same attributes.", "response": "def new(self, item_lists, processor:PreProcessor=None, **kwargs)->'ItemList':\n \"Create a new `ItemList` from `items`, keeping the same attributes.\"\n processor = ifnone(processor, self.processor)\n copy_d = {o:getattr(self,o) for o in self.copy_new}\n kwargs = {**copy_d, **kwargs}\n return self.__class__(item_lists, processor=processor, **kwargs)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef parse_docstring(docstring):\n\n short_description = long_description = return_str = \"\"\n args = []\n\n if docstring:\n docstring = trim(docstring.lstrip(\"\\n\"))\n\n lines = docstring.split(\"\\n\", 1)\n short_description = lines[0]\n\n if len(lines) > 1:\n long_description = lines[1].strip()\n\n params_return_desc = None\n\n match = PARAM_OR_RETURN_REGEX.search(long_description)\n if match:\n long_desc_end = match.start()\n params_return_desc = long_description[long_desc_end:].strip()\n long_description = long_description[:long_desc_end].rstrip()\n\n if params_return_desc:\n args = [\n {\"name\": name, \"doc\": trim(doc), \"field\": field}\n for field, name, doc in NEW_REGEX.findall(params_return_desc)\n ]\n match = RETURN_REGEX.search(params_return_desc)\n if match:\n return_str = reindent(match.group(\"doc\"))\n comments = {p['name']: p['doc'] for p in args}\n return {\n \"short_description\": short_description,\n \"long_description\": long_description,\n \"args\": args,\n \"comments\": comments,\n \"return\": return_str\n }", "response": "Parse the docstring into its components."} {"instruction": "Create a Python 3 function for\nreturning env var value if it s defined and not an empty string or return Unknown", "response": "def get_env(name):\n 
\"Return env var value if it's defined and not an empty string, or return Unknown\"\n res = os.environ.get(name,'')\n return res if len(res) else \"Unknown\""} {"instruction": "How would you explain what the following Python 3 function does\ndef show_install(show_nvidia_smi:bool=False):\n \"Print user's setup information\"\n\n import platform, fastai.version\n\n rep = []\n opt_mods = []\n\n rep.append([\"=== Software ===\", None])\n rep.append([\"python\", platform.python_version()])\n rep.append([\"fastai\", fastai.__version__])\n rep.append([\"fastprogress\", fastprogress.__version__])\n rep.append([\"torch\", torch.__version__])\n\n # nvidia-smi\n cmd = \"nvidia-smi\"\n have_nvidia_smi = False\n try: result = subprocess.run(cmd.split(), shell=False, check=False, stdout=subprocess.PIPE)\n except: pass\n else:\n if result.returncode == 0 and result.stdout: have_nvidia_smi = True\n\n # XXX: if nvidia-smi is not available, another check could be:\n # /proc/driver/nvidia/version on most systems, since it's the\n # currently active version\n\n if have_nvidia_smi:\n smi = result.stdout.decode('utf-8')\n # matching: \"Driver Version: 396.44\"\n match = re.findall(r'Driver Version: +(\\d+\\.\\d+)', smi)\n if match: rep.append([\"nvidia driver\", match[0]])\n\n available = \"available\" if torch.cuda.is_available() else \"**Not available** \"\n rep.append([\"torch cuda\", f\"{torch.version.cuda} / is {available}\"])\n\n # no point reporting on cudnn if cuda is not available, as it\n # seems to be enabled at times even on cpu-only setups\n if torch.cuda.is_available():\n enabled = \"enabled\" if torch.backends.cudnn.enabled else \"**Not enabled** \"\n rep.append([\"torch cudnn\", f\"{torch.backends.cudnn.version()} / is {enabled}\"])\n\n rep.append([\"\\n=== Hardware ===\", None])\n\n # it's possible that torch might not see what nvidia-smi sees?\n gpu_total_mem = []\n nvidia_gpu_cnt = 0\n if have_nvidia_smi:\n try:\n cmd = \"nvidia-smi --query-gpu=memory.total 
--format=csv,nounits,noheader\"\n result = subprocess.run(cmd.split(), shell=False, check=False, stdout=subprocess.PIPE)\n except:\n print(\"have nvidia-smi, but failed to query it\")\n else:\n if result.returncode == 0 and result.stdout:\n output = result.stdout.decode('utf-8')\n gpu_total_mem = [int(x) for x in output.strip().split('\\n')]\n nvidia_gpu_cnt = len(gpu_total_mem)\n\n\n if nvidia_gpu_cnt: rep.append([\"nvidia gpus\", nvidia_gpu_cnt])\n\n torch_gpu_cnt = torch.cuda.device_count()\n if torch_gpu_cnt:\n rep.append([\"torch devices\", torch_gpu_cnt])\n # information for each gpu\n for i in range(torch_gpu_cnt):\n rep.append([f\" - gpu{i}\", (f\"{gpu_total_mem[i]}MB | \" if gpu_total_mem else \"\") + torch.cuda.get_device_name(i)])\n else:\n if nvidia_gpu_cnt:\n rep.append([f\"Have {nvidia_gpu_cnt} GPU(s), but torch can't use them (check nvidia driver)\", None])\n else:\n rep.append([f\"No GPUs available\", None])\n\n\n rep.append([\"\\n=== Environment ===\", None])\n\n rep.append([\"platform\", platform.platform()])\n\n if platform.system() == 'Linux':\n distro = try_import('distro')\n if distro:\n # full distro info\n rep.append([\"distro\", ' '.join(distro.linux_distribution())])\n else:\n opt_mods.append('distro');\n # partial distro info\n rep.append([\"distro\", platform.uname().version])\n\n rep.append([\"conda env\", get_env('CONDA_DEFAULT_ENV')])\n rep.append([\"python\", sys.executable])\n rep.append([\"sys.path\", \"\\n\".join(sys.path)])\n\n print(\"\\n\\n```text\")\n\n keylen = max([len(e[0]) for e in rep if e[1] is not None])\n for e in rep:\n print(f\"{e[0]:{keylen}}\", (f\": {e[1]}\" if e[1] is not None else \"\"))\n\n if have_nvidia_smi:\n if show_nvidia_smi: print(f\"\\n{smi}\")\n else:\n if torch_gpu_cnt: print(\"no nvidia-smi is found\")\n else: print(\"no supported gpus found on this system\")\n\n print(\"```\\n\")\n\n print(\"Please make sure to include opening/closing ``` when you paste into forums/github to make the reports appear 
formatted as code sections.\\n\")\n\n if opt_mods:\n print(\"Optional package(s) to enhance the diagnostics can be installed with:\")\n print(f\"pip install {' '.join(opt_mods)}\")\n print(\"Once installed, re-run this utility to get the additional information\")", "response": "Print user s setup information"} {"instruction": "Given the following Python 3 function, write the documentation\ndef pypi_module_version_is_available(module, version):\n \"Check whether module==version is available on pypi\"\n # returns True/False (or None if failed to execute the check)\n\n # using a hack that when passing \"module==\" w/ no version number to pip\n # it \"fails\" and returns all the available versions in stderr\n try:\n cmd = f\"pip install {module}==\"\n result = subprocess.run(cmd.split(), shell=False, check=False,\n stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n except Exception as e:\n print(f\"Error: {e}\")\n return None\n else:\n if result.returncode == 1 and result.stderr:\n output = result.stderr.decode('utf-8')\n return True if version in output else False\n else:\n print(f\"Some error in {cmd}\")\n return None", "response": "Check whether module == version is available on pypi"} {"instruction": "Write a Python 3 script for\nsuggesting how to improve the setup to speed things up", "response": "def check_perf():\n \"Suggest how to improve the setup to speed things up\"\n\n from PIL import features, Image\n from packaging import version\n\n print(\"Running performance checks.\")\n\n # libjpeg_turbo check\n print(\"\\n*** libjpeg-turbo status\")\n if version.parse(Image.PILLOW_VERSION) >= version.parse(\"5.3.9\"):\n if features.check_feature('libjpeg_turbo'):\n print(\"\u2714 libjpeg-turbo is on\")\n else:\n print(\"\u2718 libjpeg-turbo is not on. It's recommended you install libjpeg-turbo to speed up JPEG decoding. See https://docs.fast.ai/performance.html#libjpeg-turbo\")\n else:\n print(f\"\u2753 libjpeg-turbo's status can't be derived - need Pillow(-SIMD)? 
>= 5.4.0 to tell, current version {Image.PILLOW_VERSION}\")\n # XXX: remove this check/note once Pillow and Pillow-SIMD 5.4.0 is available\n pillow_ver_5_4_is_avail = pypi_module_version_is_available(\"Pillow\", \"5.4.0\")\n if pillow_ver_5_4_is_avail == False:\n print(\"5.4.0 is not yet available, other than the dev version on github, which can be installed via pip from git+https://github.com/python-pillow/Pillow. See https://docs.fast.ai/performance.html#libjpeg-turbo\")\n\n # Pillow-SIMD check\n print(\"\\n*** Pillow-SIMD status\")\n if re.search(r'\\.post\\d+', Image.PILLOW_VERSION):\n print(f\"\u2714 Running Pillow-SIMD {Image.PILLOW_VERSION}\")\n else:\n print(f\"\u2718 Running Pillow {Image.PILLOW_VERSION}; It's recommended you install Pillow-SIMD to speed up image resizing and other operations. See https://docs.fast.ai/performance.html#pillow-simd\")\n\n # CUDA version check\n # compatibility table: k: min nvidia ver is required for v: cuda ver\n # note: windows nvidia driver version is slightly higher, see:\n # https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html\n # note: add new entries if pytorch starts supporting new cudaXX\n nvidia2cuda = {\n \"410.00\": \"10.0\",\n \"384.81\": \"9.0\",\n \"367.48\": \"8.0\",\n }\n print(\"\\n*** CUDA status\")\n if torch.cuda.is_available():\n pynvml = load_pynvml_env()\n nvidia_ver = (pynvml.nvmlSystemGetDriverVersion().decode('utf-8') if platform.system() != \"Darwin\" else \"Cannot be determined on OSX yet\")\n cuda_ver = torch.version.cuda\n max_cuda = \"8.0\"\n for k in sorted(nvidia2cuda.keys()):\n if version.parse(nvidia_ver) > version.parse(k): max_cuda = nvidia2cuda[k]\n if version.parse(str(max_cuda)) <= version.parse(cuda_ver):\n print(f\"\u2714 Running the latest CUDA {cuda_ver} with NVIDIA driver {nvidia_ver}\")\n else:\n print(f\"\u2718 You are running pytorch built against cuda {cuda_ver}, your NVIDIA driver {nvidia_ver} supports cuda10. 
See https://pytorch.org/get-started/locally/ to install pytorch built against the faster CUDA version.\")\n else:\n print(f\"\u2753 Running cpu-only torch version, CUDA check is not relevant\")\n\n print(\"\\nRefer to https://docs.fast.ai/performance.html to make sense out of these checks and suggestions.\")"} {"instruction": "Given the following Python 3 function, write the documentation\ndef annealing_linear(start:Number, end:Number, pct:float)->Number:\n \"Linearly anneal from `start` to `end` as pct goes from 0.0 to 1.0.\"\n return start + pct * (end-start)", "response": "Linearly anneal from start to end as pct goes from 0. 0 to 1. 0."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef annealing_cos(start:Number, end:Number, pct:float)->Number:\n \"Cosine anneal from `start` to `end` as pct goes from 0.0 to 1.0.\"\n cos_out = np.cos(np.pi * pct) + 1\n return end + (start-end)/2 * cos_out", "response": "Cosine anneal from start to end as pct goes from 0. 0 to 1. 0."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef do_annealing_poly(start:Number, end:Number, pct:float, degree:Number)->Number:\n \"Helper function for `anneal_poly`.\"\n return end + (start-end) * (1-pct)**degree", "response": "Helper function for anneal_poly."} {"instruction": "Write a Python 3 function for\ncreating an optim. Optim from opt_func with lr. Set lr on layer_groups.", "response": "def create(cls, opt_func:Union[type,Callable], lr:Union[float,Tuple,List], layer_groups:ModuleList, wd:Floats=0., \n true_wd:bool=False, bn_wd:bool=True)->optim.Optimizer:\n \"Create an `optim.Optimizer` from `opt_func` with `lr`. 
Set lr on `layer_groups`.\"\n split_params = split_no_wd_params(layer_groups)\n opt = opt_func([{'params': p, 'lr':0} for p in split_params])\n opt = cls(opt, wd=wd, true_wd=true_wd, bn_wd=bn_wd)\n opt.lr,opt.opt_func = listify(lr, layer_groups),opt_func\n return opt"} {"instruction": "Create a Python 3 function for\ncreating a new OptimWrapper from self with another layer_groups but the same hyper - parameters.", "response": "def new(self, layer_groups:Collection[nn.Module], split_no_wd:bool=True):\n \"Create a new `OptimWrapper` from `self` with another `layer_groups` but the same hyper-parameters.\"\n opt_func = getattr(self, 'opt_func', self.opt.__class__)\n res = self.create(opt_func, self.lr, layer_groups, wd=self.wd, true_wd=self.true_wd, bn_wd=self.bn_wd)\n res.mom,res.beta = self.mom,self.beta\n return res"} {"instruction": "How would you implement a function in Python 3 that\ncreates a new OptimWrapper from self with another layer_groups but the same hyper - parameters.", "response": "def new_with_params(self, param_groups:Collection[Collection[nn.Parameter]]):\n \"Create a new `OptimWrapper` from `self` with another `layer_groups` but the same hyper-parameters.\"\n opt_func = getattr(self, 'opt_func', self.opt.__class__)\n opt = opt_func([{'params': p, 'lr':0} for p in param_groups])\n opt = self.__class__(opt, wd=self.wd, true_wd=self.true_wd, bn_wd=self.bn_wd)\n opt.lr,opt.opt_func,opt.mom,opt.beta = self.lr,opt_func,self.mom,self.beta\n return opt"} {"instruction": "Can you implement a function in Python 3 that\nsets weight decay and step optimizer.", "response": "def step(self)->None:\n \"Set weight decay and step optimizer.\"\n # weight decay outside of optimizer step (AdamW)\n if self.true_wd:\n for lr,wd,pg1,pg2 in zip(self._lr,self._wd,self.opt.param_groups[::2],self.opt.param_groups[1::2]):\n for p in pg1['params']: p.data.mul_(1 - wd*lr)\n if self.bn_wd:\n for p in pg2['params']: p.data.mul_(1 - wd*lr)\n self.set_val('weight_decay', listify(0, 
self._wd))\n self.opt.step()"} {"instruction": "Given the following Python 3 function, write the documentation\ndef beta(self, val:float)->None:\n \"Set beta (or alpha as makes sense for given optimizer).\"\n if val is None: return\n if 'betas' in self.opt_keys: self.set_val('betas', (self._mom, listify(val, self._beta)))\n elif 'alpha' in self.opt_keys: self.set_val('alpha', listify(val, self._beta))\n self._beta = listify(val, self._beta)", "response": "Set beta ( or alpha as makes sense for given optimizer."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef read_defaults(self)->None:\n \"Read the values inside the optimizer for the hyper-parameters.\"\n self._beta = None\n if 'lr' in self.opt_keys: self._lr = self.read_val('lr')\n if 'momentum' in self.opt_keys: self._mom = self.read_val('momentum')\n if 'alpha' in self.opt_keys: self._beta = self.read_val('alpha')\n if 'betas' in self.opt_keys: self._mom,self._beta = self.read_val('betas')\n if 'weight_decay' in self.opt_keys: self._wd = self.read_val('weight_decay')\n reserved_names = ['params', 'lr', 'momentum', 'alpha', 'betas', 'weight_decay']\n stat_names = [n for n in self.opt_keys if n not in reserved_names]\n self._stats = {n:self.read_val(n) for n in stat_names}", "response": "Read the values inside the optimizer for the hyper - parameters."} {"instruction": "How would you explain what the following Python 3 function does\ndef set_val(self, key:str, val:Any, bn_groups:bool=True)->Any:\n \"Set `val` inside the optimizer dictionary at `key`.\"\n if is_tuple(val): val = [(v1,v2) for v1,v2 in zip(*val)]\n for v,pg1,pg2 in zip(val,self.opt.param_groups[::2],self.opt.param_groups[1::2]):\n pg1[key] = v\n if bn_groups: pg2[key] = v\n return val", "response": "Set val inside the optimizer dictionary at key."} {"instruction": "Write a Python 3 function for\nreading a hyperparameter key in the optimizer dictionary.", "response": "def read_val(self, key:str) -> 
Union[List[float],Tuple[List[float],List[float]]]:\n \"Read a hyperparameter `key` in the optimizer dictionary.\"\n val = [pg[key] for pg in self.opt.param_groups[::2]]\n if is_tuple(val[0]): val = [o[0] for o in val], [o[1] for o in val]\n return val"} {"instruction": "Write a Python 3 function for\nreturning the inner state minus the layer groups.", "response": "def get_state(self):\n \"Return the inner state minus the layer groups.\"\n return {'opt_state':self.opt.state_dict(), 'lr':self._lr, 'wd':self._wd, 'beta':self._beta, 'mom':self._mom,\n 'opt_func':self.opt_func, 'true_wd':self.true_wd, 'bn_wd':self.bn_wd}"} {"instruction": "Can you write a function in Python 3 where it\nreturns the inner state of the Callback minimal or not.", "response": "def get_state(self, minimal:bool=True):\n \"Return the inner state of the `Callback`, `minimal` or not.\"\n to_remove = ['exclude', 'not_min'] + getattr(self, 'exclude', []).copy()\n if minimal: to_remove += getattr(self, 'not_min', []).copy()\n return {k:v for k,v in self.__dict__.items() if k not in to_remove}"} {"instruction": "Explain what the following Python 3 code does\ndef add_value(self, val:float)->None:\n \"Add `val` to calculate updated smoothed value.\"\n self.n += 1\n self.mov_avg = self.beta * self.mov_avg + (1 - self.beta) * val\n self.smooth = self.mov_avg / (1 - self.beta ** self.n)", "response": "Add val to calculate updated smoothed value."} {"instruction": "Implement a function in Python 3 to\nupdate metric computation with last_output and last_target.", "response": "def on_batch_end(self, last_output, last_target, **kwargs):\n \"Update metric computation with `last_output` and `last_target`.\"\n if not is_listy(last_target): last_target=[last_target]\n self.count += last_target[0].size(0)\n val = self.func(last_output, *last_target)\n if self.world:\n val = val.clone()\n dist.all_reduce(val, op=dist.ReduceOp.SUM)\n val /= self.world\n self.val += last_target[0].size(0) * val.detach().cpu()"} 
{"instruction": "Implement a Python 3 function for\nsetting the final result in last_metrics.", "response": "def on_epoch_end(self, last_metrics, **kwargs):\n \"Set the final result in `last_metrics`.\"\n return add_metrics(last_metrics, self.val/self.count)"} {"instruction": "How would you explain what the following Python 3 function does\ndef step(self)->Number:\n \"Return next value along annealed schedule.\"\n self.n += 1\n return self.func(self.start, self.end, self.n/self.n_iter)", "response": "Return next value along annealed schedule."} {"instruction": "Write a Python 3 script to\nbuild anneal schedule for all of the parameters.", "response": "def steps(self, *steps_cfg:StartOptEnd):\n \"Build anneal schedule for all of the parameters.\"\n return [Scheduler(step, n_iter, func=func)\n for (step,(n_iter,func)) in zip(steps_cfg, self.phases)]"} {"instruction": "How would you implement a function in Python 3 that\ninitializes our optimization params based on our annealing schedule.", "response": "def on_train_begin(self, n_epochs:int, epoch:int, **kwargs:Any)->None:\n \"Initialize our optimization params based on our annealing schedule.\"\n res = {'epoch':self.start_epoch} if self.start_epoch is not None else None\n self.start_epoch = ifnone(self.start_epoch, epoch)\n self.tot_epochs = ifnone(self.tot_epochs, n_epochs)\n n = len(self.learn.data.train_dl) * self.tot_epochs\n a1 = int(n * self.pct_start)\n a2 = n-a1\n self.phases = ((a1, annealing_cos), (a2, annealing_cos))\n low_lr = self.lr_max/self.div_factor\n self.lr_scheds = self.steps((low_lr, self.lr_max), (self.lr_max, self.lr_max/self.final_div))\n self.mom_scheds = self.steps(self.moms, (self.moms[1], self.moms[0]))\n self.opt = self.learn.opt\n self.opt.lr,self.opt.mom = self.lr_scheds[0].start,self.mom_scheds[0].start\n self.idx_s = 0\n return res"} {"instruction": "Write a Python 3 function for\ntaking one step forward on the annealing schedule for the optim params.", "response": "def 
on_batch_end(self, train, **kwargs:Any)->None:\n \"Take one step forward on the annealing schedule for the optim params.\"\n if train:\n if self.idx_s >= len(self.lr_scheds): return {'stop_training': True, 'stop_epoch': True}\n self.opt.lr = self.lr_scheds[self.idx_s].step()\n self.opt.mom = self.mom_scheds[self.idx_s].step()\n # when the current schedule is complete we move onto the next\n # schedule. (in 1-cycle there are two schedules)\n if self.lr_scheds[self.idx_s].is_done:\n self.idx_s += 1"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef main(\n gpu:Param(\"GPU to run on\", str)=None,\n lr: Param(\"Learning rate\", float)=1e-3,\n size: Param(\"Size (px: 128,192,224)\", int)=128,\n debias_mom: Param(\"Debias statistics\", bool)=False,\n debias_sqr: Param(\"Debias statistics\", bool)=False,\n opt: Param(\"Optimizer: 'adam','genopt','rms','sgd'\", str)='genopt',\n alpha: Param(\"Alpha\", float)=0.99,\n mom: Param(\"Momentum\", float)=0.9,\n eps: Param(\"epsilon\", float)=1e-7,\n decay: Param(\"Decay AvgStatistic (momentum)\", bool)=False,\n epochs: Param(\"Number of epochs\", int)=5,\n bs: Param(\"Batch size\", int)=128,\n ):\n \"\"\"Distributed training of Imagenette.\n Fastest multi-gpu speed is if you run with: python -m fastai.launch\"\"\"\n\n # Pick one of these\n gpu = setup_distrib(gpu)\n if gpu is None: bs *= torch.cuda.device_count()\n\n moms = (mom,mom)\n stats = [\n AvgStatistic('momentum', mom, scope=StatScope.Weight, decay=decay, debias=debias_mom),\n AvgSquare ('alpha', alpha, scope=StatScope.Weight, debias=debias_sqr),\n ConstStatistic('eps', eps), ConstStatistic('clip', 0.001),\n ]\n if opt=='adam' : opt_func = partial(optim.Adam, betas=(mom,alpha), eps=eps)\n elif opt=='rms' : opt_func = partial(optim.RMSprop, alpha=alpha)\n elif opt=='genopt': opt_func = partial(GeneralOptimizer, on_step=on_step, stats=stats)\n else: raise Exception(f'unknown opt: {opt}')\n\n #opt_func = optim.SGD\n #learn = (cnn_learner(data, 
models.xresnet50, pretrained=False, concat_pool=False, lin_ftrs=[], split_on=bn_and_final,\n print(f'lr: {lr}; size: {size}; debias_mom: {debias_mom}; debias_sqr: {debias_sqr}; opt: {opt}; alpha: {alpha}; mom: {mom}; eps: {eps}; decay: {decay}')\n print('imagenette')\n get_learner(lr, size, False, bs, opt_func, gpu, epochs)\n gc.collect()\n\n print('imagewoof')\n get_learner(lr, size, True, bs, opt_func, gpu, epochs)", "response": "This function is the main entry point for Imagenette training."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef basic_generator(in_size:int, n_channels:int, noise_sz:int=100, n_features:int=64, n_extra_layers=0, **conv_kwargs):\n \"A basic generator from `noise_sz` to images `n_channels` x `in_size` x `in_size`.\"\n cur_size, cur_ftrs = 4, n_features//2\n while cur_size < in_size: cur_size *= 2; cur_ftrs *= 2\n layers = [conv_layer(noise_sz, cur_ftrs, 4, 1, transpose=True, **conv_kwargs)]\n cur_size = 4\n while cur_size < in_size // 2:\n layers.append(conv_layer(cur_ftrs, cur_ftrs//2, 4, 2, 1, transpose=True, **conv_kwargs))\n cur_ftrs //= 2; cur_size *= 2\n layers += [conv_layer(cur_ftrs, cur_ftrs, 3, 1, 1, transpose=True, **conv_kwargs) for _ in range(n_extra_layers)]\n layers += [conv2d_trans(cur_ftrs, n_channels, 4, 2, 1, bias=False), nn.Tanh()]\n return nn.Sequential(*layers)", "response": "A basic generator from noise_sz to images n_channels x in_size x in_size."} {"instruction": "Write a Python 3 function for\ndefining loss functions for a GAN from loss_gen and loss_crit.", "response": "def gan_loss_from_func(loss_gen, loss_crit, weights_gen:Tuple[float,float]=None):\n \"Define loss functions for a GAN from `loss_gen` and `loss_crit`.\"\n def _loss_G(fake_pred, output, target, weights_gen=weights_gen):\n ones = fake_pred.new_ones(fake_pred.shape[0])\n weights_gen = ifnone(weights_gen, (1.,1.))\n return weights_gen[0] * loss_crit(fake_pred, ones) + weights_gen[1] * loss_gen(output, 
target)\n\n def _loss_C(real_pred, fake_pred):\n ones = real_pred.new_ones (real_pred.shape[0])\n zeros = fake_pred.new_zeros(fake_pred.shape[0])\n return (loss_crit(real_pred, ones) + loss_crit(fake_pred, zeros)) / 2\n\n return _loss_G, _loss_C"} {"instruction": "Given the following Python 3 function, write the documentation\ndef gan_critic(n_channels:int=3, nf:int=128, n_blocks:int=3, p:int=0.15):\n \"Critic to train a `GAN`.\"\n layers = [\n _conv(n_channels, nf, ks=4, stride=2),\n nn.Dropout2d(p/2),\n res_block(nf, dense=True,**_conv_args)]\n nf *= 2 # after dense block\n for i in range(n_blocks):\n layers += [\n nn.Dropout2d(p),\n _conv(nf, nf*2, ks=4, stride=2, self_attention=(i==0))]\n nf *= 2\n layers += [\n _conv(nf, 1, ks=4, bias=False, padding=0, use_activ=False),\n Flatten()]\n return nn.Sequential(*layers)", "response": "Critic to train a GAN."} {"instruction": "Given the following Python 3 function, write the documentation\ndef accuracy_thresh_expand(y_pred:Tensor, y_true:Tensor, thresh:float=0.5, sigmoid:bool=True)->Rank0Tensor:\n \"Compute accuracy after expanding `y_true` to the size of `y_pred`.\"\n if sigmoid: y_pred = y_pred.sigmoid()\n return ((y_pred>thresh)==y_true[:,None].expand_as(y_pred).byte()).float().mean()", "response": "Compute accuracy after expanding y_true to the size of y_pred."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef switch(self, gen_mode:bool=None):\n \"Put the model in generator mode if `gen_mode`, in critic mode otherwise.\"\n self.gen_mode = (not self.gen_mode) if gen_mode is None else gen_mode", "response": "Put the model in generator mode if gen_mode in critic mode otherwise."} {"instruction": "Given the following Python 3 function, write the documentation\ndef generator(self, output, target):\n \"Evaluate the `output` with the critic then uses `self.loss_funcG` to combine it with `target`.\"\n fake_pred = self.gan_model.critic(output)\n return self.loss_funcG(fake_pred, 
target, output)", "response": "Evaluate the output with the critic then uses self. loss_funcG to combine it with target."} {"instruction": "Make a summary of the following Python 3 code\ndef critic(self, real_pred, input):\n \"Create some `fake_pred` with the generator from `input` and compare them to `real_pred` in `self.loss_funcD`.\"\n fake = self.gan_model.generator(input.requires_grad_(False)).requires_grad_(True)\n fake_pred = self.gan_model.critic(fake)\n return self.loss_funcC(real_pred, fake_pred)", "response": "Create some fake_pred with the generator from input and compare them to real_pred in self. loss_funcD."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef on_train_begin(self, **kwargs):\n \"Create the optimizers for the generator and critic if necessary, initialize smootheners.\"\n if not getattr(self,'opt_gen',None):\n self.opt_gen = self.opt.new([nn.Sequential(*flatten_model(self.generator))])\n else: self.opt_gen.lr,self.opt_gen.wd = self.opt.lr,self.opt.wd\n if not getattr(self,'opt_critic',None):\n self.opt_critic = self.opt.new([nn.Sequential(*flatten_model(self.critic))])\n else: self.opt_critic.lr,self.opt_critic.wd = self.opt.lr,self.opt.wd\n self.gen_mode = self.gen_first\n self.switch(self.gen_mode)\n self.closses,self.glosses = [],[]\n self.smoothenerG,self.smoothenerC = SmoothenValue(self.beta),SmoothenValue(self.beta)\n #self.recorder.no_val=True\n self.recorder.add_metric_names(['gen_loss', 'disc_loss'])\n self.imgs,self.titles = [],[]", "response": "Create the optimizers for the generator and critic if necessary initialize smootheners."} {"instruction": "Create a Python 3 function to\nclamp the weights with self. 
clip if it s not None return the correct input.", "response": "def on_batch_begin(self, last_input, last_target, **kwargs):\n \"Clamp the weights with `self.clip` if it's not None, return the correct input.\"\n if self.clip is not None:\n for p in self.critic.parameters(): p.data.clamp_(-self.clip, self.clip)\n return {'last_input':last_input,'last_target':last_target} if self.gen_mode else {'last_input':last_target,'last_target':last_input}"} {"instruction": "Write a Python 3 function for\nrecording last_loss in the proper list.", "response": "def on_backward_begin(self, last_loss, last_output, **kwargs):\n \"Record `last_loss` in the proper list.\"\n last_loss = last_loss.detach().cpu()\n if self.gen_mode:\n self.smoothenerG.add_value(last_loss)\n self.glosses.append(self.smoothenerG.smooth)\n self.last_gen = last_output.detach().cpu()\n else:\n self.smoothenerC.add_value(last_loss)\n self.closses.append(self.smoothenerC.smooth)"} {"instruction": "Write a Python 3 script to\nput the various losses in the recorder and show a sample image.", "response": "def on_epoch_end(self, pbar, epoch, last_metrics, **kwargs):\n \"Put the various losses in the recorder and show a sample image.\"\n if not hasattr(self, 'last_gen') or not self.show_img: return\n data = self.learn.data\n img = self.last_gen[0]\n norm = getattr(data,'norm',False)\n if norm and norm.keywords.get('do_y',False): img = data.denorm(img)\n img = data.train_ds.y.reconstruct(img)\n self.imgs.append(img)\n self.titles.append(f'Epoch {epoch}')\n pbar.show_imgs(self.imgs, self.titles)\n return add_metrics(last_metrics, [getattr(self.smoothenerG,'smooth',None),getattr(self.smoothenerC,'smooth',None)])"} {"instruction": "Write a Python 3 script to\nswitch the model if gen_mode is provided in the desired mode.", "response": "def switch(self, gen_mode:bool=None):\n \"Switch the model, if `gen_mode` is provided, in the desired mode.\"\n self.gen_mode = (not self.gen_mode) if gen_mode is None else gen_mode\n 
self.opt.opt = self.opt_gen.opt if self.gen_mode else self.opt_critic.opt\n self._set_trainable()\n self.model.switch(gen_mode)\n self.loss_func.switch(gen_mode)"} {"instruction": "Can you write a function in Python 3 where it\nswitches the model if necessary.", "response": "def on_batch_end(self, iteration, **kwargs):\n \"Switch the model if necessary.\"\n if self.learn.gan_trainer.gen_mode:\n self.n_g += 1\n n_iter,n_in,n_out = self.n_gen,self.n_c,self.n_g\n else:\n self.n_c += 1\n n_iter,n_in,n_out = self.n_crit,self.n_g,self.n_c\n target = n_iter if isinstance(n_iter, int) else n_iter(n_in)\n if target == n_out:\n self.learn.gan_trainer.switch()\n self.n_c,self.n_g = 0,0"} {"instruction": "Implement a function in Python 3 to\ncreate a GAN from learn_gen and learn_crit.", "response": "def from_learners(cls, learn_gen:Learner, learn_crit:Learner, switcher:Callback=None,\n weights_gen:Tuple[float,float]=None, **learn_kwargs):\n \"Create a GAN from `learn_gen` and `learn_crit`.\"\n losses = gan_loss_from_func(learn_gen.loss_func, learn_crit.loss_func, weights_gen=weights_gen)\n return cls(learn_gen.data, learn_gen.model, learn_crit.model, *losses, switcher=switcher, **learn_kwargs)"} {"instruction": "Write a Python 3 script to\ncreate a WGAN from data generator and critic.", "response": "def wgan(cls, data:DataBunch, generator:nn.Module, critic:nn.Module, switcher:Callback=None, clip:float=0.01, **learn_kwargs):\n \"Create a WGAN from `data`, `generator` and `critic`.\"\n return cls(data, generator, critic, NoopLoss(), WassersteinLoss(), switcher=switcher, clip=clip, **learn_kwargs)"} {"instruction": "How would you implement a function in Python 3 that\nshows ys ( target images ) on a figure of figsize.", "response": "def show_xys(self, xs, ys, imgsize:int=4, figsize:Optional[Tuple[int,int]]=None, **kwargs):\n \"Shows `ys` (target images) on a figure of `figsize`.\"\n super().show_xys(ys, xs, imgsize=imgsize, figsize=figsize, **kwargs)"} {"instruction": "Can you 
generate the documentation for the following Python 3 function\ndef on_batch_begin(self, train, **kwargs):\n \"Multiply the current lr if necessary.\"\n if not self.learn.gan_trainer.gen_mode and train: self.learn.opt.lr *= self.mult_lr", "response": "Multiply the current lr if necessary."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef on_step_end(self, **kwargs):\n \"Put the LR back to its value if necessary.\"\n if not self.learn.gan_trainer.gen_mode: self.learn.opt.lr /= self.mult_lr", "response": "Put the LR back to its value if necessary."} {"instruction": "Can you write a function in Python 3 where it\ngets the indexes of the layers where the size of the activation changes.", "response": "def _get_sfs_idxs(sizes:Sizes) -> List[int]:\n \"Get the indexes of the layers where the size of the activation changes.\"\n feature_szs = [size[-1] for size in sizes]\n sfs_idxs = list(np.where(np.array(feature_szs[:-1]) != np.array(feature_szs[1:]))[0])\n if feature_szs[0] != feature_szs[1]: sfs_idxs = [0] + sfs_idxs\n return sfs_idxs"} {"instruction": "Write a Python 3 script to\ndownload n_images images on Google matching search_term and size requirements and return a list of FilePathList objects.", "response": "def download_google_images(path:PathOrStr, search_term:str, size:str='>400*300', n_images:int=10, format:str='jpg',\n max_workers:int=defaults.cpus, timeout:int=4) -> FilePathList:\n \"\"\"\n Search for `n_images` images on Google, matching `search_term` and `size` requirements,\n download them into `path`/`search_term` and verify them, using `max_workers` threads.\n \"\"\"\n label_path = Path(path)/search_term\n search_url = _search_url(search_term, size=size, format=format)\n if n_images <= 100: img_tuples = _fetch_img_tuples(search_url, format=format, n_images=n_images)\n else: img_tuples = _fetch_img_tuples_webdriver(search_url, format=format, n_images=n_images)\n downloaded_images = _download_images(label_path, img_tuples, 
max_workers=max_workers, timeout=timeout)\n if len(downloaded_images) == 0: raise RuntimeError(f\"Couldn't download any images.\")\n verify_images(label_path, max_workers=max_workers)\n return get_image_files(label_path)"} {"instruction": "Create a Python 3 function for\nbuilding Google Images Search Url params and return them as a string.", "response": "def _url_params(size:str='>400*300', format:str='jpg') -> str:\n \"Build Google Images Search Url params and return them as a string.\"\n _fmts = {'jpg':'ift:jpg','gif':'ift:gif','png':'ift:png','bmp':'ift:bmp', 'svg':'ift:svg','webp':'webp','ico':'ift:ico'}\n if size not in _img_sizes: \n raise RuntimeError(f\"\"\"Unexpected size argument value: {size}.\n See `widgets.image_downloader._img_sizes` for supported sizes.\"\"\") \n if format not in _fmts: \n raise RuntimeError(f\"Unexpected image file format: {format}. Use jpg, gif, png, bmp, svg, webp, or ico.\")\n return \"&tbs=\" + _img_sizes[size] + \",\" + _fmts[format]"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef _search_url(search_term:str, size:str='>400*300', format:str='jpg') -> str:\n \"Return a Google Images Search URL for a given search term.\"\n return ('https://www.google.com/search?q=' + quote(search_term) +\n '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch' +\n _url_params(size, format) + '&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg')", "response": "Return a Google Images Search URL for a given search term."} {"instruction": "Write a Python 3 script to\nparse the Google Images Search for urls and return the image metadata as tuples ( fname url ).", "response": "def _fetch_img_tuples(url:str, format:str='jpg', n_images:int=10) -> list:\n \"Parse the Google Images Search for urls and return the image metadata as tuples (fname, url).\"\n headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}\n html = requests.get(url, 
headers=headers).text\n return _html_to_img_tuples(html, format=format, n_images=n_images)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _html_to_img_tuples(html:str, format:str='jpg', n_images:int=10) -> list: \n \"Parse the google images html to img tuples containining `(fname, url)`\"\n bs = BeautifulSoup(html, 'html.parser')\n img_tags = bs.find_all('div', {'class': 'rg_meta'})\n metadata_dicts = (json.loads(e.text) for e in img_tags)\n img_tuples = ((_img_fname(d['ou']), d['ou']) for d in metadata_dicts if d['ity'] == format)\n return list(itertools.islice(img_tuples, n_images))", "response": "Parse the google images html to img tuples containining ( fname url )"} {"instruction": "How would you explain what the following Python 3 function does\ndef _fetch_img_tuples_webdriver(url:str, format:str='jpg', n_images:int=150) -> list:\n try:\n from selenium import webdriver\n from selenium.webdriver.common.keys import Keys\n except:\n print(\"\"\"Looks like you're trying to download > 100 images and `selenium`\n is not installed. Try running `pip install selenium` to fix this. \n You'll also need chrome and `chromedriver` installed.\"\"\")\n options = webdriver.ChromeOptions()\n options.add_argument(\"--headless\")\n try: driver = webdriver.Chrome(chrome_options=options)\n except: print(\"\"\"Error initializing chromedriver. 
\n Check if it's in your path by running `which chromedriver`\"\"\")\n driver.set_window_size(1440, 900)\n driver.get(url)\n\n for i in range(n_images // 100 + 1):\n driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight)\")\n time.sleep(0.5 + random.random()/2.0)\n\n n_available = len(driver.find_elements_by_css_selector(\"div.rg_meta\"))\n if n_available < n_images:\n raise ValueError(f\"Requested {n_images} images, but only found {n_available}.\")\n\n html = driver.page_source\n driver.close()\n return _html_to_img_tuples(html, format=format, n_images=n_images)", "response": "Fetch the Google Images from the given URL and return the image metadata as tuples."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _download_images(label_path:PathOrStr, img_tuples:list, max_workers:int=defaults.cpus, timeout:int=4) -> FilePathList:\n os.makedirs(Path(label_path), exist_ok=True)\n parallel( partial(_download_single_image, label_path, timeout=timeout), img_tuples, max_workers=max_workers)\n return get_image_files(label_path)", "response": "Downloads images in img_tuples to label_path."} {"instruction": "Create a Python 3 function to\ndownload a single image from Google Search results to label_path.", "response": "def _download_single_image(label_path:Path, img_tuple:tuple, i:int, timeout:int=4) -> None:\n \"\"\"\n Downloads a single image from Google Search results to `label_path`\n given an `img_tuple` that contains `(fname, url)` of an image to download.\n `i` is just an iteration number `int`. 
\n \"\"\"\n suffix = re.findall(r'\\.\\w+?(?=(?:\\?|$))', img_tuple[1])\n suffix = suffix[0].lower() if len(suffix)>0 else '.jpg'\n fname = f\"{i:08d}{suffix}\"\n download_url(img_tuple[1], label_path/fname, timeout=timeout)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _init_ui(self) -> VBox:\n \"Initialize the widget UI and return the UI.\"\n self._search_input = Text(placeholder=\"What images to search for?\")\n self._count_input = BoundedIntText(placeholder=\"How many pics?\", value=10, min=1, max=5000, step=1,\n layout=Layout(width='60px'))\n self._size_input = Dropdown(options= _img_sizes.keys(), value='>400*300', layout=Layout(width='120px'))\n self._download_button = Button(description=\"Search & Download\", icon=\"download\", layout=Layout(width='200px'))\n self._download_button.on_click(self.on_download_button_click)\n self._output = Output()\n self._controls_pane = HBox([self._search_input, self._count_input, self._size_input, self._download_button],\n layout=Layout(width='auto', height='40px'))\n self._heading = \"\"\n self._download_complete_heading = \"

Download complete. Here are a few images

\"\n self._preview_header = widgets.HTML(self._heading, layout=Layout(height='60px'))\n self._img_pane = Box(layout=Layout(display='inline'))\n return VBox([self._controls_pane, self._preview_header, self._img_pane])", "response": "Initialize the widget UI and return the UI."} {"instruction": "Can you implement a function in Python 3 that\nclears the widget s images preview pane.", "response": "def clear_imgs(self) -> None:\n \"Clear the widget's images preview pane.\"\n self._preview_header.value = self._heading\n self._img_pane.children = tuple()"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef validate_search_input(self) -> bool:\n \"Check if input value is empty.\"\n input = self._search_input\n if input.value == str(): input.layout = Layout(border=\"solid 2px red\", height='auto')\n else: self._search_input.layout = Layout()\n return input.value != str()", "response": "Check if input value is empty."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef on_download_button_click(self, btn) -> None:\n \"Download button click handler: validate search term and download images.\"\n term = self._search_input.value\n limit = int(self._count_input.value)\n size = self._size_input.value\n if not self.validate_search_input(): return\n self.clear_imgs()\n downloaded_images = download_google_images(self._path, term, n_images=limit, size=size)\n self.display_images_widgets(downloaded_images[:min(limit, 12)])\n self._preview_header.value = self._download_complete_heading\n self.render()", "response": "Download button click handler : validate search term and download images."} {"instruction": "Create a Python 3 function to\ndisplay a few preview images in the notebook", "response": "def display_images_widgets(self, fnames:list) -> None:\n \"Display a few preview images in the notebook\"\n imgs = [widgets.Image(value=open(f, 'rb').read(), width='200px') for f in fnames]\n self._img_pane.children 
= tuple(imgs)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef on_train_begin(self, pbar, **kwargs:Any)->None:\n \"Initialize optimizer and learner hyperparameters.\"\n setattr(pbar, 'clean_on_interrupt', True)\n self.learn.save('tmp')\n self.opt = self.learn.opt\n self.opt.lr = self.sched.start\n self.stop,self.best_loss = False,0.\n return {'skip_validate': True}", "response": "Initialize optimizer and learner hyperparameters."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef on_batch_end(self, iteration:int, smooth_loss:TensorOrNumber, **kwargs:Any)->None:\n \"Determine if loss has runaway and we should stop.\"\n if iteration==0 or smooth_loss < self.best_loss: self.best_loss = smooth_loss\n self.opt.lr = self.sched.step()\n if self.sched.is_done or (self.stop_div and (smooth_loss > 4*self.best_loss or torch.isnan(smooth_loss))):\n #We use the smoothed loss to decide on the stopping since it's less shaky.\n return {'stop_epoch': True, 'stop_training': True}", "response": "Determine if loss has runaway and we should stop."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef on_train_end(self, **kwargs:Any)->None:\n \"Cleanup learn model weights disturbed during LRFinder exploration.\"\n self.learn.load('tmp', purge=False)\n if hasattr(self.learn.model, 'reset'): self.learn.model.reset()\n for cb in self.callbacks:\n if hasattr(cb, 'reset'): cb.reset()\n print('LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.')", "response": "Cleanup learn model weights disturbed during LRFinder exploration."} {"instruction": "Implement a Python 3 function for\napplying a dropout mask to the object holding the specified size.", "response": "def dropout_mask(x, sz, dropout):\n \"\"\" Applies a dropout mask whose size is determined by passed argument 'sz'.\n Args:\n x (nn.Variable): A torch Variable object\n sz (tuple(int, int, int)): 
The expected size of the new tensor\n dropout (float): The dropout fraction to apply\n\n This method uses the bernoulli distribution to decide which activations to keep.\n Additionally, the sampled activations is rescaled is using the factor 1/(1 - dropout).\n\n In the example given below, one can see that approximately .8 fraction of the\n returned tensors are zero. Rescaling with the factor 1/(1 - 0.8) returns a tensor\n with 5's in the unit places.\n\n The official link to the pytorch bernoulli function is here:\n http://pytorch.org/docs/master/torch.html#torch.bernoulli\n\n Examples:\n >>> a_Var = torch.autograd.Variable(torch.Tensor(2, 3, 4).uniform_(0, 1), requires_grad=False)\n >>> a_Var\n Variable containing:\n (0 ,.,.) =\n 0.6890 0.5412 0.4303 0.8918\n 0.3871 0.7944 0.0791 0.5979\n 0.4575 0.7036 0.6186 0.7217\n (1 ,.,.) =\n 0.8354 0.1690 0.1734 0.8099\n 0.6002 0.2602 0.7907 0.4446\n 0.5877 0.7464 0.4257 0.3386\n [torch.FloatTensor of size 2x3x4]\n >>> a_mask = dropout_mask(a_Var.data, (1,a_Var.size(1),a_Var.size(2)), dropout=0.8)\n >>> a_mask\n (0 ,.,.) 
=\n 0 5 0 0\n 0 0 0 5\n 5 0 5 0\n [torch.FloatTensor of size 1x3x4]\n \"\"\"\n return x.new(*sz).bernoulli_(1-dropout)/(1-dropout)"} {"instruction": "Create a Python 3 function for\nsetting the weights of the current module to the weights of the new module.", "response": "def _setweights(self):\n \"\"\" Uses pytorch's built-in dropout function to apply dropout to the parameters of\n the wrapped module.\n\n Args:\n None\n Returns:\n None\n \"\"\"\n for name_w in self.weights:\n raw_w = getattr(self.module, name_w + '_raw')\n w = torch.nn.functional.dropout(raw_w, p=self.dropout, training=self.training)\n if hasattr(self.module, name_w):\n delattr(self.module, name_w)\n setattr(self.module, name_w, w)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef load_data(path:PathOrStr, file:PathLikeOrBinaryStream='data_save.pkl', bs:int=64, val_bs:int=None, num_workers:int=defaults.cpus,\n dl_tfms:Optional[Collection[Callable]]=None, device:torch.device=None, collate_fn:Callable=data_collate,\n no_check:bool=False, **kwargs)->DataBunch:\n \"Load a saved `DataBunch` from `path/file`. `file` can be file-like (file or buffer)\"\n source = Path(path)/file if is_pathlike(file) else file\n ll = torch.load(source, map_location='cpu') if defaults.device == torch.device('cpu') else torch.load(source)\n return ll.databunch(path=path, bs=bs, val_bs=val_bs, num_workers=num_workers, dl_tfms=dl_tfms, device=device,\n collate_fn=collate_fn, no_check=no_check, **kwargs)", "response": "Load a saved DataBunch from path / file. 
file can be file - like ( file or buffer )"} {"instruction": "Given the following Python 3 function, write the documentation\ndef create(cls, train_ds:Dataset, valid_ds:Dataset, test_ds:Optional[Dataset]=None, path:PathOrStr='.', bs:int=64,\n val_bs:int=None, num_workers:int=defaults.cpus, dl_tfms:Optional[Collection[Callable]]=None,\n device:torch.device=None, collate_fn:Callable=data_collate, no_check:bool=False, **dl_kwargs)->'DataBunch':\n \"Create a `DataBunch` from `train_ds`, `valid_ds` and maybe `test_ds` with a batch size of `bs`. Passes `**dl_kwargs` to `DataLoader()`\"\n datasets = cls._init_ds(train_ds, valid_ds, test_ds)\n val_bs = ifnone(val_bs, bs)\n dls = [DataLoader(d, b, shuffle=s, drop_last=s, num_workers=num_workers, **dl_kwargs) for d,b,s in\n zip(datasets, (bs,val_bs,val_bs,val_bs), (True,False,False,False)) if d is not None]\n return cls(*dls, path=path, device=device, dl_tfms=dl_tfms, collate_fn=collate_fn, no_check=no_check)", "response": "Create a DataBunch from train_ds valid_ds and maybe test_ds with a batch size of bs. Passes **dl_kwargs to DataLoader ()"} {"instruction": "Can you write a function in Python 3 where it\nreturns appropriate Dataset for validation training or test.", "response": "def dl(self, ds_type:DatasetType=DatasetType.Valid)->DeviceDataLoader:\n \"Returns appropriate `Dataset` for validation, training, or test (`ds_type`).\"\n #TODO: refactor\n return (self.train_dl if ds_type == DatasetType.Train else\n self.test_dl if ds_type == DatasetType.Test else\n self.valid_dl if ds_type == DatasetType.Valid else\n self.single_dl if ds_type == DatasetType.Single else\n self.fix_dl)"} {"instruction": "Can you create a Python 3 function that\nreturns a list of all DeviceDataLoaders. If you need a specific DeviceDataLoader access via the relevant property ( train_dl valid_dl single_dl. 
If you need a specific DeviceDataLoader access via the relevant property ( train_dl valid_dl single_dl as the index of DLs in this list is not guaranteed to remain constant.", "response": "def dls(self)->List[DeviceDataLoader]:\n \"Returns a list of all DeviceDataLoaders. If you need a specific DeviceDataLoader, access via the relevant property (`train_dl`, `valid_dl`, etc) as the index of DLs in this list is not guaranteed to remain constant.\"\n res = [self.train_dl, self.fix_dl, self.single_dl]\n # Preserve the original ordering of Train, Valid, Fix, Single, Test Data Loaders\n # (Unknown/not verified as of 1.0.47 whether there are other methods explicitly using DLs their list index)\n if self.valid_dl: res.insert(1, self.valid_dl)\n return res if not self.test_dl else res + [self.test_dl]"} {"instruction": "Implement a Python 3 function for\nsaving the DataBunch in self. path file. file can be file - like ( file or buffer )", "response": "def save(self, file:PathLikeOrBinaryStream= 'data_save.pkl')->None:\n \"Save the `DataBunch` in `self.path/file`. `file` can be file-like (file or buffer)\"\n if not getattr(self, 'label_list', False):\n warn(\"Serializing the `DataBunch` only works when you created it using the data block API.\")\n return\n try_save(self.label_list, self.path, file)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef one_batch(self, ds_type:DatasetType=DatasetType.Train, detach:bool=True, denorm:bool=True, cpu:bool=True)->Collection[Tensor]:\n \"Get one batch from the data loader of `ds_type`. 
Optionally `detach` and `denorm`.\"\n dl = self.dl(ds_type)\n w = self.num_workers\n self.num_workers = 0\n try: x,y = next(iter(dl))\n finally: self.num_workers = w\n if detach: x,y = to_detach(x,cpu=cpu),to_detach(y,cpu=cpu)\n norm = getattr(self,'norm',False)\n if denorm and norm:\n x = self.denorm(x)\n if norm.keywords.get('do_y',False): y = self.denorm(y, do_x=True)\n return x,y", "response": "Get one batch from the data loader of ds_type. Optionally detach and denorm."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef one_item(self, item, detach:bool=False, denorm:bool=False, cpu:bool=False):\n \"Get `item` into a batch. Optionally `detach` and `denorm`.\"\n ds = self.single_ds\n with ds.set_item(item):\n return self.one_batch(ds_type=DatasetType.Single, detach=detach, denorm=denorm, cpu=cpu)", "response": "Get item into a batch. Optionally detach and denorm."} {"instruction": "Can you implement a function in Python 3 that\nshows a batch of data in ds_type on a few rows.", "response": "def show_batch(self, rows:int=5, ds_type:DatasetType=DatasetType.Train, reverse:bool=False, **kwargs)->None:\n \"Show a batch of data in `ds_type` on a few `rows`.\"\n x,y = self.one_batch(ds_type, True, True)\n if reverse: x,y = x.flip(0),y.flip(0)\n n_items = rows **2 if self.train_ds.x._square_show else rows\n if self.dl(ds_type).batch_size < n_items: n_items = self.dl(ds_type).batch_size\n xs = [self.train_ds.x.reconstruct(grab_idx(x, i)) for i in range(n_items)]\n #TODO: get rid of has_arg if possible\n if has_arg(self.train_ds.y.reconstruct, 'x'):\n ys = [self.train_ds.y.reconstruct(grab_idx(y, i), x=x) for i,x in enumerate(xs)]\n else : ys = [self.train_ds.y.reconstruct(grab_idx(y, i)) for i in range(n_items)]\n self.train_ds.x.show_xys(xs, ys, **kwargs)"} {"instruction": "How would you code a function in Python 3 to\nexport the minimal state of self for inference in self. path / file. 
file can be file - like ( file or buffer )", "response": "def export(self, file:PathLikeOrBinaryStream='export.pkl'):\n \"Export the minimal state of `self` for inference in `self.path/file`. `file` can be file-like (file or buffer)\"\n xtra = dict(normalize=self.norm.keywords) if getattr(self, 'norm', False) else {}\n try_save(self.valid_ds.get_state(**xtra), self.path, file)"} {"instruction": "Explain what the following Python 3 code does\ndef sanity_check(self):\n \"Check the underlying data in the training set can be properly loaded.\"\n final_message = \"You can deactivate this warning by passing `no_check=True`.\"\n if not hasattr(self.train_ds, 'items') or len(self.train_ds.items) == 0 or not hasattr(self.train_dl, 'batch_sampler'): return\n if len(self.train_dl) == 0:\n warn(f\"\"\"Your training dataloader is empty, you have only {len(self.train_dl.dataset)} items in your training set.\n Your batch size is {self.train_dl.batch_size}, you should lower it.\"\"\")\n print(final_message)\n return\n idx = next(iter(self.train_dl.batch_sampler))\n samples,fails = [],[]\n for i in idx:\n try: samples.append(self.train_dl.dataset[i])\n except: fails.append(i)\n if len(fails) > 0:\n warn_msg = \"There seems to be something wrong with your dataset, for example, in the first batch can't access\"\n if len(fails) == len(idx):\n warn_msg += f\" any element of self.train_ds.\\nTried: {show_some(idx)}\"\n else:\n warn_msg += f\" these elements in self.train_ds: {show_some(fails)}\"\n warn(warn_msg)\n print(final_message)\n return\n try: batch = self.collate_fn(samples)\n except:\n message = \"It's not possible to collate samples of your dataset together in a batch.\"\n try:\n shapes = [[o[i].data.shape for o in samples] for i in range(2)]\n message += f'\\nShapes of the inputs/targets:\\n{shapes}'\n except: pass\n warn(message)\n print(final_message)", "response": "Check the underlying data in the training set can be properly loaded."} {"instruction": "Make a summary of 
the following Python 3 code\ndef one_cycle_scheduler(lr_max:float, **kwargs:Any)->OneCycleScheduler:\n \"Instantiate a `OneCycleScheduler` with `lr_max`.\"\n return partial(OneCycleScheduler, lr_max=lr_max, **kwargs)", "response": "Instantiate a OneCycleScheduler with lr_max."} {"instruction": "Create a Python 3 function for\nfitting a model following the 1cycle policy.", "response": "def fit_one_cycle(learn:Learner, cyc_len:int, max_lr:Union[Floats,slice]=defaults.lr,\n moms:Tuple[float,float]=(0.95,0.85), div_factor:float=25., pct_start:float=0.3, final_div:float=None,\n wd:float=None, callbacks:Optional[CallbackList]=None, tot_epochs:int=None, start_epoch:int=None)->None:\n \"Fit a model following the 1cycle policy.\"\n max_lr = learn.lr_range(max_lr)\n callbacks = listify(callbacks)\n callbacks.append(OneCycleScheduler(learn, max_lr, moms=moms, div_factor=div_factor, pct_start=pct_start,\n final_div=final_div, tot_epochs=tot_epochs, start_epoch=start_epoch))\n learn.fit(cyc_len, max_lr, wd=wd, callbacks=callbacks)"} {"instruction": "Can you implement a function in Python 3 that\nexplores lr from start_lr to end_lr over num_it iterations in learn. If stop_div stops when loss diverges.", "response": "def lr_find(learn:Learner, start_lr:Floats=1e-7, end_lr:Floats=10, num_it:int=100, stop_div:bool=True, wd:float=None):\n \"Explore lr from `start_lr` to `end_lr` over `num_it` iterations in `learn`. 
If `stop_div`, stops when loss diverges.\"\n start_lr = learn.lr_range(start_lr)\n start_lr = np.array(start_lr) if is_listy(start_lr) else start_lr\n end_lr = learn.lr_range(end_lr)\n end_lr = np.array(end_lr) if is_listy(end_lr) else end_lr\n cb = LRFinder(learn, start_lr, end_lr, num_it, stop_div)\n epochs = int(np.ceil(num_it/len(learn.data.train_dl)))\n learn.fit(epochs, start_lr, callbacks=[cb], wd=wd)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef to_fp16(learn:Learner, loss_scale:float=None, max_noskip:int=1000, dynamic:bool=True, clip:float=None,\n flat_master:bool=False, max_scale:float=2**24)->Learner:\n \"Put `learn` in FP16 precision mode.\"\n learn.to_fp32()\n learn.model = model2half(learn.model)\n learn.data.add_tfm(batch_to_half)\n learn.mp_cb = MixedPrecision(learn, loss_scale=loss_scale, max_noskip=max_noskip, dynamic=dynamic, clip=clip, \n flat_master=flat_master, max_scale=max_scale)\n learn.callbacks.append(learn.mp_cb)\n return learn", "response": "Put learn in FP16 precision mode."} {"instruction": "Can you implement a function in Python 3 that\nputs learn back to FP32 precision mode.", "response": "def to_fp32(learn:Learner):\n \"Put `learn` back to FP32 precision mode.\"\n learn.data.remove_tfm(batch_to_half)\n for cb in learn.callbacks: \n if isinstance(cb, MixedPrecision): learn.callbacks.remove(cb)\n learn.model = learn.model.float()\n return learn"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef mixup(learn:Learner, alpha:float=0.4, stack_x:bool=False, stack_y:bool=True) -> Learner:\n \"Add mixup https://arxiv.org/abs/1710.09412 to `learn`.\"\n learn.callback_fns.append(partial(MixUpCallback, alpha=alpha, stack_x=stack_x, stack_y=stack_y))\n return learn", "response": "Add mixup https://arxiv. 
org / abs / 1710.09412 to learn."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef clip_grad(learn:Learner, clip:float=0.1)->Learner:\n \"Add gradient clipping of `clip` during training.\"\n learn.callback_fns.append(partial(GradientClipping, clip=clip))\n return learn", "response": "Add gradient clipping of clip during training."} {"instruction": "Make a summary of the following Python 3 code\ndef _learner_interpret(learn:Learner, ds_type:DatasetType=DatasetType.Valid):\n \"Create a `ClassificationInterpretation` object from `learner` on `ds_type` with `tta`.\"\n return ClassificationInterpretation.from_learner(learn, ds_type=ds_type)", "response": "Create a ClassificationInterpretation object from learner on ds_type with tta."} {"instruction": "Write a Python 3 script for\nclipping the gradient before the optimizer step.", "response": "def on_backward_end(self, **kwargs):\n \"Clip the gradient before the optimizer step.\"\n if self.clip: nn.utils.clip_grad_norm_(self.learn.model.parameters(), self.clip)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef on_train_begin(self, **kwargs):\n \"check if loss is reduction\"\n if hasattr(self.loss_func, \"reduction\") and (self.loss_func.reduction != \"sum\"):\n warn(\"For better gradients consider 'reduction=sum'\")", "response": "check if loss is reduction"} {"instruction": "Make a summary of the following Python 3 code\ndef on_batch_begin(self, last_input, last_target, **kwargs):\n \"accumulate samples and batches\"\n self.acc_samples += last_input.shape[0]\n self.acc_batches += 1", "response": "accumulate samples and batches"} {"instruction": "Can you implement a function in Python 3 that\naccumulates step and reset samples True will result in no stepping", "response": "def on_backward_end(self, **kwargs):\n \"accumulated step and reset samples, True will result in no stepping\"\n if (self.acc_batches % self.n_step) == 0:\n for p in 
(self.learn.model.parameters()):\n if p.requires_grad: p.grad.div_(self.acc_samples)\n self.acc_samples = 0\n else: return {'skip_step':True, 'skip_zero':True}"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef on_epoch_end(self, **kwargs):\n \"step the rest of the accumulated grads if not perfectly divisible\"\n for p in (self.learn.model.parameters()):\n if p.requires_grad: p.grad.div_(self.acc_samples)\n if not self.drop_last: self.learn.opt.step()\n self.learn.opt.zero_grad()", "response": "step the rest of the accumulated grads if not perfectly divisible"} {"instruction": "Write a Python 3 script to\ncreate an instance of ClassificationInterpretation", "response": "def from_learner(cls, learn: Learner, ds_type:DatasetType=DatasetType.Valid):\n \"Create an instance of `ClassificationInterpretation`\"\n preds = learn.get_preds(ds_type=ds_type, with_loss=True)\n return cls(learn, *preds)"} {"instruction": "Write a Python 3 function that can\nplot the confusion matrix with title and using cmap.", "response": "def plot_confusion_matrix(self, normalize:bool=False, title:str='Confusion matrix', cmap:Any=\"Blues\", slice_size:int=1,\n norm_dec:int=2, plot_txt:bool=True, return_fig:bool=None, **kwargs)->Optional[plt.Figure]:\n \"Plot the confusion matrix, with `title` and using `cmap`.\"\n # This function is mainly copied from the sklearn docs\n cm = self.confusion_matrix(slice_size=slice_size)\n if normalize: cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n fig = plt.figure(**kwargs)\n plt.imshow(cm, interpolation='nearest', cmap=cmap)\n plt.title(title)\n tick_marks = np.arange(self.data.c)\n plt.xticks(tick_marks, self.data.y.classes, rotation=90)\n plt.yticks(tick_marks, self.data.y.classes, rotation=0)\n\n if plot_txt:\n thresh = cm.max() / 2.\n for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n coeff = f'{cm[i, j]:.{norm_dec}f}' if normalize else f'{cm[i, j]}'\n plt.text(j, i, coeff, 
horizontalalignment=\"center\", verticalalignment=\"center\", color=\"white\" if cm[i, j] > thresh else \"black\")\n\n plt.tight_layout()\n plt.ylabel('Actual')\n plt.xlabel('Predicted')\n plt.grid(False)\n if ifnone(return_fig, defaults.return_fig): return fig"} {"instruction": "Can you create a Python 3 function that\nsorts descending list of largest non - diagonal entries of confusion matrix presented as actual predicted number of occurrences.", "response": "def most_confused(self, min_val:int=1, slice_size:int=1)->Collection[Tuple[str,str,int]]:\n \"Sorted descending list of largest non-diagonal entries of confusion matrix, presented as actual, predicted, number of occurrences.\"\n cm = self.confusion_matrix(slice_size=slice_size)\n np.fill_diagonal(cm, 0)\n res = [(self.data.classes[i],self.data.classes[j],cm[i,j])\n for i,j in zip(*np.where(cm>=min_val))]\n return sorted(res, key=itemgetter(2), reverse=True)"} {"instruction": "Write a Python 3 script for\ncalculating the F - beta score.", "response": "def fbeta(log_preds, targs, beta, thresh=0.5, epsilon=1e-8):\n \"\"\"Calculates the F-beta score (the weighted harmonic mean of precision and recall).\n This is the micro averaged version where the true positives, false negatives and\n false positives are calculated globally (as opposed to on a per label basis).\n\n beta == 1 places equal weight on precision and recall, b < 1 emphasizes precision and\n beta > 1 favors recall.\n \"\"\"\n assert beta > 0, 'beta needs to be greater than 0'\n beta2 = beta ** 2\n rec = recall(log_preds, targs, thresh)\n prec = precision(log_preds, targs, thresh)\n return (1 + beta2) * prec * rec / (beta2 * prec + rec + epsilon)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef fbeta_np(preds, targs, beta, thresh=0.5, epsilon=1e-8):\n assert beta > 0, 'beta needs to be greater than 0'\n beta2 = beta ** 2\n rec = recall_np(preds, targs, thresh)\n prec = precision_np(preds, targs, thresh)\n return (1 + 
beta2) * prec * rec / (beta2 * prec + rec + epsilon)", "response": "compute Fbeta for a single node"} {"instruction": "Write a Python 3 function for\ndistributing training of Imagenet.", "response": "def main( gpu:Param(\"GPU to run on\", str)=None ):\n \"\"\"Distributed training of Imagenet. Fastest speed is if you run with: python -m fastai.launch\"\"\"\n path = Path('/mnt/fe2_disk/')\n tot_epochs,size,bs,lr = 60,224,256,3e-1\n dirname = 'imagenet'\n\n gpu = setup_distrib(gpu)\n if gpu is None: bs *= torch.cuda.device_count()\n n_gpus = num_distrib() or 1\n workers = min(12, num_cpus()//n_gpus)\n data = get_data(path/dirname, size, bs, workers)\n b_its = len(data.train_dl)//n_gpus\n\n # Using bs 256 on single GPU as baseline, scale the LR linearly\n tot_bs = bs*n_gpus\n bs_rat = tot_bs/256\n lr *= bs_rat\n\n ph1 = (TrainingPhase(tot_epochs*0.10*b_its)\n .schedule_hp('lr', (lr/10,lr), anneal=annealing_cos))\n ph2 = (TrainingPhase(tot_epochs*0.90*b_its)\n .schedule_hp('lr', (lr,lr/1e5), anneal=annealing_cos))\n opt_func = partial(optim.Adam, eps=0.1, betas=(0.9,0.99))\n learn = Learner(data, models.xresnet50(), metrics=[accuracy,top_k_accuracy], wd=1e-3,\n opt_func=opt_func, bn_wd=False, true_wd=True,\n loss_func = LabelSmoothingCrossEntropy()).mixup(alpha=0.2)\n\n learn.callback_fns += [\n partial(GeneralScheduler, phases=(ph1,ph2)),\n partial(SaveModelCallback, every='epoch', name='model')\n ]\n learn.split(lambda m: (children(m)[-2],))\n if gpu is None: learn.model = nn.DataParallel(learn.model)\n else: learn.to_distributed(gpu)\n learn.to_fp16(dynamic=True)\n\n learn.fit(tot_epochs, 1)\n if rank_distrib(): time.sleep(1)\n learn.save('done')"} {"instruction": "How would you code a function in Python 3 to\nget the metadata associated with arch.", "response": "def cnn_config(arch):\n \"Get the metadata associated with `arch`.\"\n torch.backends.cudnn.benchmark = True\n return model_meta.get(arch, _default_meta)"} {"instruction": "Explain what the following Python 
3 code does\ndef create_body(arch:Callable, pretrained:bool=True, cut:Optional[Union[int, Callable]]=None):\n \"Cut off the body of a typically pretrained `model` at `cut` (int) or cut the model as specified by `cut(model)` (function).\"\n model = arch(pretrained)\n cut = ifnone(cut, cnn_config(arch)['cut'])\n if cut is None:\n ll = list(enumerate(model.children()))\n cut = next(i for i,o in reversed(ll) if has_pool_type(o))\n if isinstance(cut, int): return nn.Sequential(*list(model.children())[:cut])\n elif isinstance(cut, Callable): return cut(model)\n else: raise NamedError(\"cut must be either integer or a function\")", "response": "Cut off the body of a typically pretrained model at cut ( int or cut the model as specified by cut ( function )."} {"instruction": "Make a summary of the following Python 3 code\ndef create_head(nf:int, nc:int, lin_ftrs:Optional[Collection[int]]=None, ps:Floats=0.5,\n concat_pool:bool=True, bn_final:bool=False):\n \"Model head that takes `nf` features, runs through `lin_ftrs`, and about `nc` classes.\"\n lin_ftrs = [nf, 512, nc] if lin_ftrs is None else [nf] + lin_ftrs + [nc]\n ps = listify(ps)\n if len(ps) == 1: ps = [ps[0]/2] * (len(lin_ftrs)-2) + ps\n actns = [nn.ReLU(inplace=True)] * (len(lin_ftrs)-2) + [None]\n pool = AdaptiveConcatPool2d() if concat_pool else nn.AdaptiveAvgPool2d(1)\n layers = [pool, Flatten()]\n for ni,no,p,actn in zip(lin_ftrs[:-1], lin_ftrs[1:], ps, actns):\n layers += bn_drop_lin(ni, no, True, p, actn)\n if bn_final: layers.append(nn.BatchNorm1d(lin_ftrs[-1], momentum=0.01))\n return nn.Sequential(*layers)", "response": "Model head that takes nf features runs through lin_ftrs and about nc classes."} {"instruction": "How would you explain what the following Python 3 function does\ndef create_cnn_model(base_arch:Callable, nc:int, cut:Union[int,Callable]=None, pretrained:bool=True,\n lin_ftrs:Optional[Collection[int]]=None, ps:Floats=0.5, custom_head:Optional[nn.Module]=None,\n 
split_on:Optional[SplitFuncOrIdxList]=None, bn_final:bool=False, concat_pool:bool=True):\n \"Create custom convnet architecture\"\n body = create_body(base_arch, pretrained, cut)\n if custom_head is None:\n nf = num_features_model(nn.Sequential(*body.children())) * (2 if concat_pool else 1)\n head = create_head(nf, nc, lin_ftrs, ps=ps, concat_pool=concat_pool, bn_final=bn_final)\n else: head = custom_head\n return nn.Sequential(body, head)", "response": "Create custom convnet architecture"} {"instruction": "Write a Python 3 script for\nbuilding convnet style learner.", "response": "def cnn_learner(data:DataBunch, base_arch:Callable, cut:Union[int,Callable]=None, pretrained:bool=True,\n lin_ftrs:Optional[Collection[int]]=None, ps:Floats=0.5, custom_head:Optional[nn.Module]=None,\n split_on:Optional[SplitFuncOrIdxList]=None, bn_final:bool=False, init=nn.init.kaiming_normal_,\n concat_pool:bool=True, **kwargs:Any)->Learner:\n \"Build convnet style learner.\"\n meta = cnn_config(base_arch)\n model = create_cnn_model(base_arch, data.c, cut, pretrained, lin_ftrs, ps=ps, custom_head=custom_head,\n split_on=split_on, bn_final=bn_final, concat_pool=concat_pool)\n learn = Learner(data, model, **kwargs)\n learn.split(split_on or meta['split'])\n if pretrained: learn.freeze()\n if init: apply_init(model[1], init)\n return learn"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef unet_learner(data:DataBunch, arch:Callable, pretrained:bool=True, blur_final:bool=True,\n norm_type:Optional[NormType]=NormType, split_on:Optional[SplitFuncOrIdxList]=None, blur:bool=False,\n self_attention:bool=False, y_range:Optional[Tuple[float,float]]=None, last_cross:bool=True,\n bottle:bool=False, cut:Union[int,Callable]=None, **learn_kwargs:Any)->Learner:\n \"Build Unet learner from `data` and `arch`.\"\n meta = cnn_config(arch)\n body = create_body(arch, pretrained, cut)\n model = to_device(models.unet.DynamicUnet(body, n_classes=data.c, blur=blur, 
blur_final=blur_final,\n self_attention=self_attention, y_range=y_range, norm_type=norm_type, last_cross=last_cross,\n bottle=bottle), data.device)\n learn = Learner(data, model, **learn_kwargs)\n learn.split(ifnone(split_on, meta['split']))\n if pretrained: learn.freeze()\n apply_init(model[2], nn.init.kaiming_normal_)\n return learn", "response": "Build Unet learner from data and arch."} {"instruction": "Given the following Python 3 function, write the documentation\ndef _cl_int_from_learner(cls, learn:Learner, ds_type:DatasetType=DatasetType.Valid, tta=False):\n \"Create an instance of `ClassificationInterpretation`. `tta` indicates if we want to use Test Time Augmentation.\"\n preds = learn.TTA(ds_type=ds_type, with_loss=True) if tta else learn.get_preds(ds_type=ds_type, with_loss=True)\n return cls(learn, *preds, ds_type=ds_type)", "response": "Create an instance of ClassificationInterpretation. tta indicates if we want to use Test Time Augmentation."} {"instruction": "Explain what the following Python 3 code does\ndef _cl_int_plot_top_losses(self, k, largest=True, figsize=(12,12), heatmap:bool=True, heatmap_thresh:int=16,\n return_fig:bool=None)->Optional[plt.Figure]:\n \"Show images in `top_losses` along with their prediction, actual, loss, and probability of actual class.\"\n tl_val,tl_idx = self.top_losses(k, largest)\n classes = self.data.classes\n cols = math.ceil(math.sqrt(k))\n rows = math.ceil(k/cols)\n fig,axes = plt.subplots(rows, cols, figsize=figsize)\n fig.suptitle('prediction/actual/loss/probability', weight='bold', size=14)\n for i,idx in enumerate(tl_idx):\n im,cl = self.data.dl(self.ds_type).dataset[idx]\n cl = int(cl)\n im.show(ax=axes.flat[i], title=\n f'{classes[self.pred_class[idx]]}/{classes[cl]} / {self.losses[idx]:.2f} / {self.probs[idx][cl]:.2f}')\n if heatmap:\n xb,_ = self.data.one_item(im, detach=False, denorm=False)\n m = self.learn.model.eval()\n with hook_output(m[0]) as hook_a:\n with hook_output(m[0], grad= True) as hook_g:\n 
preds = m(xb)\n preds[0,cl].backward()\n acts = hook_a.stored[0].cpu()\n if (acts.shape[-1]*acts.shape[-2]) >= heatmap_thresh:\n grad = hook_g.stored[0][0].cpu()\n grad_chan = grad.mean(1).mean(1)\n mult = F.relu(((acts*grad_chan[...,None,None])).sum(0))\n sz = list(im.shape[-2:])\n axes.flat[i].imshow(mult, alpha=0.6, extent=(0,*sz[::-1],0), interpolation='bilinear', cmap='magma') \n if ifnone(return_fig, defaults.return_fig): return fig", "response": "Show images in top_losses along with their prediction actual loss and probability of actual class."} {"instruction": "Implement a function in Python 3 to\nshow images in top_losses along with their prediction actual loss and probability of predicted class in a multilabeled dataset.", "response": "def _cl_int_plot_multi_top_losses(self, samples:int=3, figsize:Tuple[int,int]=(8,8), save_misclassified:bool=False):\n \"Show images in `top_losses` along with their prediction, actual, loss, and probability of predicted class in a multilabeled dataset.\"\n if samples >20:\n print(\"Max 20 samples\")\n return\n losses, idxs = self.top_losses(self.data.c)\n l_dim = len(losses.size())\n if l_dim == 1: losses, idxs = self.top_losses()\n infolist, ordlosses_idxs, mismatches_idxs, mismatches, losses_mismatches, mismatchescontainer = [],[],[],[],[],[]\n truthlabels = np.asarray(self.y_true, dtype=int)\n classes_ids = [k for k in enumerate(self.data.classes)]\n predclass = np.asarray(self.pred_class)\n for i,pred in enumerate(predclass):\n where_truth = np.nonzero((truthlabels[i]>0))[0]\n mismatch = np.all(pred!=where_truth)\n if mismatch:\n mismatches_idxs.append(i)\n if l_dim > 1 : losses_mismatches.append((losses[i][pred], i))\n else: losses_mismatches.append((losses[i], i))\n if l_dim > 1: infotup = (i, pred, where_truth, losses[i][pred], np.round(self.probs[i], decimals=3)[pred], mismatch)\n else: infotup = (i, pred, where_truth, losses[i], np.round(self.probs[i], decimals=3)[pred], mismatch)\n infolist.append(infotup)\n ds = 
self.data.dl(self.ds_type).dataset\n mismatches = ds[mismatches_idxs]\n ordlosses = sorted(losses_mismatches, key = lambda x: x[0], reverse=True)\n for w in ordlosses: ordlosses_idxs.append(w[1])\n mismatches_ordered_byloss = ds[ordlosses_idxs]\n print(f'{str(len(mismatches))} misclassified samples over {str(len(self.data.valid_ds))} samples in the validation set.')\n samples = min(samples, len(mismatches))\n for ima in range(len(mismatches_ordered_byloss)):\n mismatchescontainer.append(mismatches_ordered_byloss[ima][0])\n for sampleN in range(samples):\n actualclasses = ''\n for clas in infolist[ordlosses_idxs[sampleN]][2]:\n actualclasses = f'{actualclasses} -- {str(classes_ids[clas][1])}'\n imag = mismatches_ordered_byloss[sampleN][0]\n imag = show_image(imag, figsize=figsize)\n imag.set_title(f\"\"\"Predicted: {classes_ids[infolist[ordlosses_idxs[sampleN]][1]][1]} \\nActual: {actualclasses}\\nLoss: {infolist[ordlosses_idxs[sampleN]][3]}\\nProbability: {infolist[ordlosses_idxs[sampleN]][4]}\"\"\",\n loc='left')\n plt.show()\n if save_misclassified: return mismatchescontainer"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef from_toplosses(cls, learn, n_imgs=None, **kwargs):\n \"Gets indices with top losses.\"\n train_ds, train_idxs = cls.get_toplosses_idxs(learn, n_imgs, **kwargs)\n return train_ds, train_idxs", "response": "Gets indices with top losses."} {"instruction": "Write a Python 3 function that can\nsort ds_type dataset by top losses and returns dataset and sorted indices.", "response": "def get_toplosses_idxs(cls, learn, n_imgs, **kwargs):\n \"Sorts `ds_type` dataset by top losses and returns dataset and sorted indices.\"\n dl = learn.data.fix_dl\n if not n_imgs: n_imgs = len(dl.dataset)\n _,_,top_losses = learn.get_preds(ds_type=DatasetType.Fix, with_loss=True)\n idxs = torch.topk(top_losses, n_imgs)[1]\n return cls.padded_ds(dl.dataset, **kwargs), idxs"} {"instruction": "Can you write a function in Python 3 where 
it\ngets the indices for the most similar images.", "response": "def from_similars(cls, learn, layer_ls:list=[0, 7, 2], **kwargs):\n \"Gets the indices for the most similar images.\"\n train_ds, train_idxs = cls.get_similars_idxs(learn, layer_ls, **kwargs)\n return train_ds, train_idxs"} {"instruction": "Make a summary of the following Python 3 code\ndef get_similars_idxs(cls, learn, layer_ls, **kwargs):\n \"Gets the indices for the most similar images in `ds_type` dataset\"\n hook = hook_output(learn.model[layer_ls[0]][layer_ls[1]][layer_ls[2]])\n dl = learn.data.fix_dl\n\n ds_actns = cls.get_actns(learn, hook=hook, dl=dl, **kwargs)\n similarities = cls.comb_similarity(ds_actns, ds_actns, **kwargs)\n idxs = cls.sort_idxs(similarities)\n return cls.padded_ds(dl, **kwargs), idxs", "response": "Gets the indices for the most similar images in ds_type dataset"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef get_actns(learn, hook:Hook, dl:DataLoader, pool=AdaptiveConcatPool2d, pool_dim:int=4, **kwargs):\n \"Gets activations at the layer specified by `hook`, applies `pool` of dim `pool_dim` and concatenates\"\n print('Getting activations...')\n\n actns = []\n learn.model.eval()\n with torch.no_grad():\n for (xb,yb) in progress_bar(dl):\n learn.model(xb)\n actns.append((hook.stored).cpu())\n\n if pool:\n pool = pool(pool_dim)\n return pool(torch.cat(actns)).view(len(dl.x),-1)\n else: return torch.cat(actns).view(len(dl.x),-1)", "response": "Gets activations at the layer specified by hook applies pool of dim pool_dim and concatenates"} {"instruction": "How would you explain what the following Python 3 function does\ndef comb_similarity(t1: torch.Tensor, t2: torch.Tensor, **kwargs):\n # https://github.com/pytorch/pytorch/issues/11202\n \"Computes the similarity function between each embedding of `t1` and `t2` matrices.\"\n print('Computing similarities...')\n\n w1 = t1.norm(p=2, dim=1, keepdim=True)\n w2 = w1 if t2 is t1 else 
t2.norm(p=2, dim=1, keepdim=True)\n\n t = torch.mm(t1, t2.t()) / (w1 * w2.t()).clamp(min=1e-8)\n return torch.tril(t, diagonal=-1)", "response": "Computes the similarity function between each embedding of t1 and t2 matrices."} {"instruction": "Write a Python 3 function for\nreturning the n largest indices from a numpy array arr.", "response": "def largest_indices(arr, n):\n \"Returns the `n` largest indices from a numpy array `arr`.\"\n #https://stackoverflow.com/questions/6910641/how-do-i-get-indices-of-n-maximum-values-in-a-numpy-array\n flat = arr.flatten()\n indices = np.argpartition(flat, -n)[-n:]\n indices = indices[np.argsort(-flat[indices])]\n return np.unravel_index(indices, arr.shape)"} {"instruction": "Write a Python 3 script for\nsorting similarities and return the indexes in pairs ordered by highest similarity.", "response": "def sort_idxs(cls, similarities):\n \"Sorts `similarities` and return the indexes in pairs ordered by highest similarity.\"\n idxs = cls.largest_indices(similarities, len(similarities))\n idxs = [(idxs[0][i], idxs[1][i]) for i in range(len(idxs[0]))]\n return [e for l in idxs for e in l]"} {"instruction": "Write a Python 3 script for\nreturning an image widget for specified file name img.", "response": "def make_img_widget(cls, img, layout=Layout(), format='jpg'):\n \"Returns an image widget for specified file name `img`.\"\n return widgets.Image(value=img, format=format, layout=layout)"} {"instruction": "Create a Python 3 function for\nreturning a Button widget with specified handler.", "response": "def make_button_widget(cls, label, file_path=None, handler=None, style=None, layout=Layout(width='auto')):\n \"Return a Button widget with specified `handler`.\"\n btn = widgets.Button(description=label, layout=layout)\n if handler is not None: btn.on_click(handler)\n if style is not None: btn.button_style = style\n btn.file_path = file_path\n btn.flagged_for_delete = False\n return btn"} {"instruction": "Create a Python 3 function 
to\nreturn a Dropdown widget with specified handler.", "response": "def make_dropdown_widget(cls, description='Description', options=['Label 1', 'Label 2'], value='Label 1',\n file_path=None, layout=Layout(), handler=None):\n \"Return a Dropdown widget with specified `handler`.\"\n dd = widgets.Dropdown(description=description, options=options, value=value, layout=layout)\n if file_path is not None: dd.file_path = file_path\n if handler is not None: dd.observe(handler, names=['value'])\n return dd"} {"instruction": "Given the following Python 3 function, write the documentation\ndef make_horizontal_box(cls, children, layout=Layout()):\n \"Make a horizontal box with `children` and `layout`.\"\n return widgets.HBox(children, layout=layout)", "response": "Make a horizontal box with children and layout."} {"instruction": "Write a Python 3 script to\nmake a vertical box with children and layout.", "response": "def make_vertical_box(cls, children, layout=Layout(), duplicates=False):\n \"Make a vertical box with `children` and `layout`.\"\n if not duplicates: return widgets.VBox(children, layout=layout)\n else: return widgets.VBox([children[0], children[2]], layout=layout)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef create_image_list(self, dataset, fns_idxs):\n \"Create a list of images, filenames and labels but first removing files that are not supposed to be displayed.\"\n items = dataset.x.items\n if self._duplicates:\n chunked_idxs = chunks(fns_idxs, 2)\n chunked_idxs = [chunk for chunk in chunked_idxs if Path(items[chunk[0]]).is_file() and Path(items[chunk[1]]).is_file()]\n return [(dataset.x[i]._repr_jpeg_(), items[i], self._labels[dataset.y[i].data]) for chunk in chunked_idxs for i in chunk]\n else:\n return [(dataset.x[i]._repr_jpeg_(), items[i], self._labels[dataset.y[i].data]) for i in fns_idxs if\n Path(items[i]).is_file()]", "response": "Create a list of images filenames and labels but first removing files that are not 
supposed to be displayed."} {"instruction": "Make a summary of the following Python 3 code\ndef relabel(self, change):\n \"Relabel images by moving from parent dir with old label `class_old` to parent dir with new label `class_new`.\"\n class_new,class_old,file_path = change.new,change.old,change.owner.file_path\n fp = Path(file_path)\n parent = fp.parents[1]\n self._csv_dict[fp] = class_new", "response": "Relabel images by moving from parent dir with old label class_old to parent dir with new label class_new."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef next_batch(self, _):\n \"Handler for 'Next Batch' button click. Delete all flagged images and renders next batch.\"\n for img_widget, delete_btn, fp, in self._batch:\n fp = delete_btn.file_path\n if (delete_btn.flagged_for_delete == True):\n self.delete_image(fp)\n self._deleted_fns.append(fp)\n self._all_images = self._all_images[self._batch_size:]\n self.empty_batch()\n self.render()", "response": "Handler for Next Batch button click. 
Delete all flagged images and renders next batch."} {"instruction": "How would you explain what the following Python 3 function does\ndef on_delete(self, btn):\n \"Flag this image as delete or keep.\"\n btn.button_style = \"\" if btn.flagged_for_delete else \"danger\"\n btn.flagged_for_delete = not btn.flagged_for_delete", "response": "Flag this image as delete or keep."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef get_widgets(self, duplicates):\n \"Create and format widget set.\"\n widgets = []\n for (img,fp,human_readable_label) in self._all_images[:self._batch_size]:\n img_widget = self.make_img_widget(img, layout=Layout(height='250px', width='300px'))\n dropdown = self.make_dropdown_widget(description='', options=self._labels, value=human_readable_label,\n file_path=fp, handler=self.relabel, layout=Layout(width='auto'))\n delete_btn = self.make_button_widget('Delete', file_path=fp, handler=self.on_delete)\n widgets.append(self.make_vertical_box([img_widget, dropdown, delete_btn],\n layout=Layout(width='auto', height='300px',\n overflow_x=\"hidden\"), duplicates=duplicates))\n self._batch.append((img_widget, delete_btn, fp))\n return widgets", "response": "Create and format widget set."} {"instruction": "Write a Python 3 function for\nchecking if current batch contains already deleted images.", "response": "def batch_contains_deleted(self):\n \"Check if current batch contains already deleted images.\"\n if not self._duplicates: return False\n imgs = [self._all_images[:self._batch_size][0][1], self._all_images[:self._batch_size][1][1]]\n return any(img in self._deleted_fns for img in imgs)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _line_shift(x:Tensor, mask:bool=False):\n \"Shift the line i of `x` by p-i elements to the left, is `mask` puts 0s on the diagonal.\"\n bs,nh,n,p = x.size()\n x_pad = torch.cat([x.new_zeros(bs,nh,n,1), x], dim=3)\n x_shift = x_pad.view(bs,nh,p + 
1,n)[:,:,1:].view_as(x)\n if mask: x_shift.mul_(torch.tril(x.new_ones(n,p), p-n)[None,None,])\n return x_shift", "response": "Shift the line i of x by p - i elements to the left is mask puts 0s on the diagonal."} {"instruction": "How would you explain what the following Python 3 function does\ndef tfmer_lm_split(model:nn.Module) -> List[nn.Module]:\n \"Split a RNN `model` in groups for differential learning rates.\"\n encoder = model[0]\n n = len(encoder.layers)//3\n groups = [list(encoder.layers[:n]), list(encoder.layers[n:2*n]), list(encoder.layers[2*n:])]\n return groups + [[encoder.encoder, model[1]]]", "response": "Split a RNN model in groups for differential learning rates."} {"instruction": "Write a Python 3 script to\nsplit a RNN model in groups for differential learning rates.", "response": "def tfmer_clas_split(model:nn.Module) -> List[nn.Module]:\n \"Split a RNN `model` in groups for differential learning rates.\"\n encoder = model[0].module\n n = len(encoder.layers)//3\n groups = [[encoder.encoder], list(encoder.layers[:n]), list(encoder.layers[n:2*n]), list(encoder.layers[2*n:])]\n return groups + [[model[1]]]"} {"instruction": "Can you write a function in Python 3 where it\nsplits a RNN model in groups for differential learning rates.", "response": "def tfmerXL_lm_split(model:nn.Module) -> List[nn.Module]:\n \"Split a RNN `model` in groups for differential learning rates.\"\n encoder = model[0]\n n = len(encoder.layers)//3\n groups = [list(encoder.layers[:n]) + [ParameterModule(encoder.u), ParameterModule(encoder.v)]]\n return groups + [list(encoder.layers[n:2*n]), list(encoder.layers[2*n:]), [encoder.encoder, model[1]]]"} {"instruction": "How would you explain what the following Python 3 function does\ndef reset(self):\n \"Reset the internal memory.\"\n self.hidden = [next(self.parameters()).data.new(0) for i in range(self.n_layers+1)]", "response": "Reset the internal memory."} {"instruction": "Can you implement a function in Python 3 that\nmakes a 
report in form of two notebooks.", "response": "def make_report(self, outcome):\n \"\"\"Make report in form of two notebooks.\n\n Use nbdime diff-web to present the difference between reference\n cells and test cells.\n \"\"\"\n failures = self.getreports('failed')\n if not failures:\n return\n for rep in failures:\n # Check if this is a notebook node\n msg = self._getfailureheadline(rep)\n lines = rep.longrepr.splitlines()\n if len(lines) > 1:\n self.section(msg, lines[1])\n self._outrep_summary(rep)\n tmpdir = tempfile.mkdtemp()\n try:\n ref_file = os.path.join(tmpdir, 'reference.ipynb')\n test_file = os.path.join(tmpdir, 'test_result.ipynb')\n with io.open(ref_file, \"w\", encoding=\"utf8\") as f:\n nbformat.write(self.nb_ref, f)\n with io.open(test_file, \"w\", encoding=\"utf8\") as f:\n nbformat.write(self.nb_test, f)\n run_server(\n port=0, # Run on random port\n cwd=tmpdir,\n closable=True,\n on_port=lambda port: browse(\n port, ref_file, test_file, None))\n finally:\n shutil.rmtree(tmpdir)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef batchnorm_to_fp32(module):\n '''\n BatchNorm layers to have parameters in single precision.\n Find all layers and convert them back to float. This can't\n be done with built in .apply as that function will apply\n fn to all modules, parameters, and buffers. 
Thus we wouldn't\n be able to guard the float conversion based on the module type.\n '''\n if isinstance(module, nn.modules.batchnorm._BatchNorm):\n module.float()\n for child in module.children():\n batchnorm_to_fp32(child)\n return module", "response": "Convert a BatchNorm module to float."} {"instruction": "Explain what the following Python 3 code does\ndef copy_model_to_fp32(m, optim):\n fp32_params = [m_param.clone().type(torch.cuda.FloatTensor).detach() for m_param in trainable_params_(m)]\n optim_groups = [group['params'] for group in optim.param_groups]\n iter_fp32_params = iter(fp32_params)\n for group_params in optim_groups:\n for i in range(len(group_params)):\n if not group_params[i].requires_grad: continue # only update trainable_params_\n fp32_param = next(iter_fp32_params)\n assert(fp32_param.shape == group_params[i].shape)\n fp32_param.requires_grad = group_params[i].requires_grad\n group_params[i] = fp32_param\n return fp32_params", "response": "Creates a fp32 copy of model parameters and sets optimizer parameters\n "} {"instruction": "Create a Python 3 function for\nstarting coverage reporting in kernel.", "response": "def setup_coverage(config, kernel, floc, output_loc=None):\n \"\"\"Start coverage reporting in kernel.\n\n Currently supported kernel languages are:\n - Python\n \"\"\"\n\n language = kernel.language\n if language.startswith('python'):\n # Get the pytest-cov coverage object\n cov = get_cov(config)\n if cov:\n # If present, copy the data file location used by pytest-cov\n data_file = os.path.abspath(cov.config.data_file)\n else:\n # Fall back on output_loc and current dir if not\n data_file = os.path.abspath(os.path.join(output_loc or os.getcwd(), '.coverage'))\n\n # Get options from pytest-cov's command line arguments:\n source = config.option.cov_source\n config_file = config.option.cov_config\n if isinstance(config_file, str) and os.path.isfile(config_file):\n config_file = os.path.abspath(config_file)\n\n # Copy the suffix of 
plugin if available\n suffix = _make_suffix(cov)\n if suffix is True:\n # Cannot merge data with autogen suffix, so turn off warning\n # for missing data in pytest-cov collector\n cov._warn_no_data = False\n\n # Build setup command and execute in kernel:\n cmd = _python_setup % (data_file, source, config_file, suffix)\n msg_id = kernel.kc.execute(cmd, stop_on_error=False)\n kernel.await_idle(msg_id, 60) # A minute should be plenty to enable coverage\n else:\n config.warn(\n 'C1',\n 'Coverage currently not supported for language \"%s\".' % language,\n floc)\n return"} {"instruction": "How would you code a function in Python 3 to\nfinish coverage reporting in kernel.", "response": "def teardown_coverage(config, kernel, output_loc=None):\n \"\"\"Finish coverage reporting in kernel.\n\n The coverage should previously have been started with\n setup_coverage.\n \"\"\"\n language = kernel.language\n if language.startswith('python'):\n # Teardown code does not require any input, simply execute:\n msg_id = kernel.kc.execute(_python_teardown)\n kernel.await_idle(msg_id, 60) # A minute should be plenty to write out coverage\n\n # Ensure we merge our data into parent data of pytest-cov, if possible\n cov = get_cov(config)\n _merge_nbval_coverage_data(cov)\n\n else:\n # Warnings should be given on setup, or there might be no teardown\n # for a specific language, so do nothing here\n pass"} {"instruction": "How would you implement a function in Python 3 that\nreturns the coverage object of pytest - cov.", "response": "def get_cov(config):\n \"\"\"Returns the coverage object of pytest-cov.\"\"\"\n\n # Check with hasplugin to avoid getplugin exception in older pytest.\n if config.pluginmanager.hasplugin('_cov'):\n plugin = config.pluginmanager.getplugin('_cov')\n if plugin.cov_controller:\n return plugin.cov_controller.cov\n return None"} {"instruction": "How would you explain what the following Python 3 function does\ndef _make_suffix(cov):\n # Check if coverage object has 
data_suffix:\n if cov and cov.data_suffix is not None:\n # If True, the suffix will be autogenerated by coverage.py.\n # The suffixed data files will be automatically combined later.\n if cov.data_suffix is True:\n return True\n # Has a suffix, but we add our own extension\n return cov.data_suffix + '.nbval'\n return 'nbval'", "response": "Create a suffix for nbval data files depending on pytest - cov config."} {"instruction": "Can you implement a function in Python 3 that\nmerges nbval coverage data into pytest - cov data.", "response": "def _merge_nbval_coverage_data(cov):\n \"\"\"Merge nbval coverage data into pytest-cov data.\"\"\"\n if not cov:\n return\n\n suffix = _make_suffix(cov)\n if suffix is True:\n # Note: If suffix is true, we are running in parallel, so several\n # files will be generated. This will cause some warnings about \"no coverage\"\n # but is otherwise OK. Do nothing.\n return\n\n # Get the filename of the nbval coverage:\n filename = cov.data_files.filename + '.' 
+ suffix\n\n # Read coverage generated by nbval in this run:\n nbval_data = coverage.CoverageData(debug=cov.debug)\n try:\n nbval_data.read_file(os.path.abspath(filename))\n except coverage.CoverageException:\n return\n\n # Set up aliases (following internal coverage.py code here)\n aliases = None\n if cov.config.paths:\n aliases = coverage.files.PathAliases()\n for paths in cov.config.paths.values():\n result = paths[0]\n for pattern in paths[1:]:\n aliases.add(pattern, result)\n\n # Merge nbval data into pytest-cov data:\n cov.data.update(nbval_data, aliases=aliases)\n # Delete our nbval coverage data\n coverage.misc.file_be_gone(filename)"} {"instruction": "Make a summary of the following Python 3 code\ndef chunks(l:Collection, n:int)->Iterable:\n \"Yield successive `n`-sized chunks from `l`.\"\n for i in range(0, len(l), n): yield l[i:i+n]", "response": "Yield successive n - sized chunks from l."} {"instruction": "How would you explain what the following Python 3 function does\ndef to_int(b:Any)->Union[int,List[int]]:\n \"Convert `b` to an int or list of ints (if `is_listy`); raises exception if not convertible\"\n if is_listy(b): return [to_int(x) for x in b]\n else: return int(b)", "response": "Convert b to an int or list of ints ( if is_listy ; raises exception if not convertible"} {"instruction": "How would you code a function in Python 3 to\nreturn True if a is one - dimensional", "response": "def is1d(a:Collection)->bool:\n \"Return `True` if `a` is one-dimensional\"\n return len(a.shape) == 1 if hasattr(a, 'shape') else True"} {"instruction": "How would you explain what the following Python 3 function does\ndef uniqueify(x:Series, sort:bool=False)->List:\n \"Return sorted unique values of `x`.\"\n res = list(OrderedDict.fromkeys(x).keys())\n if sort: res.sort()\n return res", "response": "Return sorted unique values of x."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef find_classes(folder:Path)->FilePathList:\n \"List 
of label subdirectories in imagenet-style `folder`.\"\n classes = [d for d in folder.iterdir()\n if d.is_dir() and not d.name.startswith('.')]\n assert(len(classes)>0)\n return sorted(classes, key=lambda d: d.name)", "response": "List of label subdirectories in imagenet - style folder."} {"instruction": "How would you implement a function in Python 3 that\ngives arrs is [ a b... ] and mask index - return [ ( a [ mask ] b [ mask ]", "response": "def arrays_split(mask:NPArrayMask, *arrs:NPArrayableList)->SplitArrayList:\n \"Given `arrs` is [a,b,...] and `mask`index - return[(a[mask],a[~mask]),(b[mask],b[~mask]),...].\"\n assert all([len(arr)==len(arrs[0]) for arr in arrs]), 'All arrays should have same length'\n mask = array(mask)\n return list(zip(*[(a[mask],a[~mask]) for a in map(np.array, arrs)]))"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef random_split(valid_pct:float, *arrs:NPArrayableList)->SplitArrayList:\n \"Randomly split `arrs` with `valid_pct` ratio. good for creating validation set.\"\n assert (valid_pct>=0 and valid_pct<=1), 'Validation set percentage should be between 0 and 1'\n is_train = np.random.uniform(size=(len(arrs[0]),)) > valid_pct\n return arrays_split(is_train, *arrs)", "response": "Randomly split arrs with valid_pct ratio. 
good for creating validation set."} {"instruction": "Can you create a Python 3 function that\nmakes p listy and the same length as q.", "response": "def listify(p:OptListOrItem=None, q:OptListOrItem=None):\n \"Make `p` listy and the same length as `q`.\"\n if p is None: p=[]\n elif isinstance(p, str): p = [p]\n elif not isinstance(p, Iterable): p = [p]\n #Rank 0 tensors in PyTorch are Iterable but don't have a length.\n else:\n try: a = len(p)\n except: p = [p]\n n = q if type(q)==int else len(p) if q is None else len(q)\n if len(p)==1: p = p * n\n assert len(p)==n, f'List len mismatch ({len(p)} vs {n})'\n return list(p)"} {"instruction": "Write a Python 3 function for\nchanging name from camel to snake style.", "response": "def camel2snake(name:str)->str:\n \"Change `name` from camel to snake style.\"\n s1 = re.sub(_camel_re1, r'\\1_\\2', name)\n return re.sub(_camel_re2, r'\\1_\\2', s1).lower()"} {"instruction": "How would you code a function in Python 3 to\nbuild log - stepped array from start to stop in n steps.", "response": "def even_mults(start:float, stop:float, n:int)->np.ndarray:\n \"Build log-stepped array from `start` to `stop` in `n` steps.\"\n mult = stop/start\n step = mult**(1/(n-1))\n return np.array([start*(step**i) for i in range(n)])"} {"instruction": "Write a Python 3 script to\nextract the keys in names from the kwargs.", "response": "def extract_kwargs(names:Collection[str], kwargs:KWArgs):\n \"Extract the keys in `names` from the `kwargs`.\"\n new_kwargs = {}\n for arg_name in names:\n if arg_name in kwargs:\n arg_val = kwargs.pop(arg_name)\n new_kwargs[arg_name] = arg_val\n return new_kwargs, kwargs"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef partition(a:Collection, sz:int)->List[Collection]:\n \"Split iterables `a` in equal parts of size `sz`\"\n return [a[i:i+sz] for i in range(0, len(a), sz)]", "response": "Split iterables a in equal parts of size sz"} {"instruction": "Write a Python 3 
script to\nsplit data in a equally among n_cpus cores", "response": "def partition_by_cores(a:Collection, n_cpus:int)->List[Collection]:\n \"Split data in `a` equally among `n_cpus` cores\"\n return partition(a, len(a)//n_cpus + 1)"} {"instruction": "Explain what the following Python 3 code does\ndef series2cat(df:DataFrame, *col_names):\n \"Categorifies the columns `col_names` in `df`.\"\n for c in listify(col_names): df[c] = df[c].astype('category').cat.as_ordered()", "response": "Categorifies the columns col_names in df."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef download_url(url:str, dest:str, overwrite:bool=False, pbar:ProgressBar=None,\n show_progress=True, chunk_size=1024*1024, timeout=4, retries=5)->None:\n \"Download `url` to `dest` unless it exists and not `overwrite`.\"\n if os.path.exists(dest) and not overwrite: return\n\n s = requests.Session()\n s.mount('http://',requests.adapters.HTTPAdapter(max_retries=retries))\n u = s.get(url, stream=True, timeout=timeout)\n try: file_size = int(u.headers[\"Content-Length\"])\n except: show_progress = False\n\n with open(dest, 'wb') as f:\n nbytes = 0\n if show_progress: pbar = progress_bar(range(file_size), auto_update=False, leave=False, parent=pbar)\n try:\n for chunk in u.iter_content(chunk_size=chunk_size):\n nbytes += len(chunk)\n if show_progress: pbar.update(nbytes)\n f.write(chunk)\n except requests.exceptions.ConnectionError as e:\n fname = url.split('/')[-1]\n from fastai.datasets import Config\n data_dir = Config().data_path()\n timeout_txt =(f'\\n Download of {url} has failed after {retries} retries\\n'\n f' Fix the download manually:\\n'\n f'$ mkdir -p {data_dir}\\n'\n f'$ cd {data_dir}\\n'\n f'$ wget -c {url}\\n'\n f'$ tar -zxvf {fname}\\n\\n'\n f'And re-run your code once the download is successful\\n')\n print(timeout_txt)\n import sys;sys.exit(1)", "response": "Download url to dest unless it exists and not overwrite."} {"instruction": "Write a Python 3 
script for\nreturning Path ( path ) + fname. path defaults to current dir.", "response": "def join_path(fname:PathOrStr, path:PathOrStr='.')->Path:\n \"Return `Path(path)/Path(fname)`, `path` defaults to current dir.\"\n return Path(path)/Path(fname)"} {"instruction": "Explain what the following Python 3 code does\ndef join_paths(fnames:FilePathList, path:PathOrStr='.')->Collection[Path]:\n \"Join `path` to every file name in `fnames`.\"\n path = Path(path)\n return [join_path(o,path) for o in fnames]", "response": "Join path to every file name in fnames."} {"instruction": "Implement a function in Python 3 to\nreturn ndarray of str of lines of text from path.", "response": "def loadtxt_str(path:PathOrStr)->np.ndarray:\n \"Return `ndarray` of `str` of lines of text from `path`.\"\n with open(path, 'r') as f: lines = f.readlines()\n return np.array([l.strip() for l in lines])"} {"instruction": "Explain what the following Python 3 code does\ndef save_texts(fname:PathOrStr, texts:Collection[str]):\n \"Save in `fname` the content of `texts`.\"\n with open(fname, 'w') as f:\n for t in texts: f.write(f'{t}\\n')", "response": "Save in fname the content of texts."} {"instruction": "Can you create a Python 3 function that\nreturns the column indexes of names in df.", "response": "def df_names_to_idx(names:IntsOrStrs, df:DataFrame):\n \"Return the column indexes of `names` in `df`.\"\n if not is_listy(names): names = [names]\n if isinstance(names[0], int): return names\n return [df.columns.get_loc(c) for c in names]"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef one_hot(x:Collection[int], c:int):\n \"One-hot encode `x` with `c` classes.\"\n res = np.zeros((c,), np.float32)\n res[listify(x)] = 1.\n return res", "response": "One - hot encode x with c classes."} {"instruction": "Create a Python 3 function for\nreturning the slice of a corresponding to idxs.", "response": "def index_row(a:Union[Collection,pd.DataFrame,pd.Series], 
idxs:Collection[int])->Any:\n \"Return the slice of `a` corresponding to `idxs`.\"\n if a is None: return a\n if isinstance(a,(pd.DataFrame,pd.Series)):\n res = a.iloc[idxs]\n if isinstance(res,(pd.DataFrame,pd.Series)): return res.copy()\n return res\n return a[idxs]"} {"instruction": "Make a summary of the following Python 3 code\ndef func_args(func)->bool:\n \"Return the arguments of `func`.\"\n code = func.__code__\n return code.co_varnames[:code.co_argcount]", "response": "Return the arguments of func."} {"instruction": "Implement a function in Python 3 to\nsplit kwargs between those expected by func and the others.", "response": "def split_kwargs_by_func(kwargs, func):\n \"Split `kwargs` between those expected by `func` and the others.\"\n args = func_args(func)\n func_kwargs = {a:kwargs.pop(a) for a in args if a in kwargs}\n return func_kwargs, kwargs"} {"instruction": "Explain what the following Python 3 code does\ndef array(a, dtype:type=None, **kwargs)->np.ndarray:\n \"Same as `np.array` but also handles generators. `kwargs` are passed to `np.array` with `dtype`.\"\n if not isinstance(a, collections.Sized) and not getattr(a,'__array_interface__',False):\n a = list(a)\n if np.int_==np.int32 and dtype is None and is_listy(a) and len(a) and isinstance(a[0],int):\n dtype=np.int64\n return np.array(a, dtype=dtype, **kwargs)", "response": "Same as np. array but also handles generators. kwargs are passed to np. 
array with dtype."} {"instruction": "Implement a Python 3 function for\nputting the texts in items in an HTML table widths are the widths of the columns in %.", "response": "def text2html_table(items:Collection[Collection[str]])->str:\n \"Put the texts in `items` in an HTML table, `widths` are the widths of the columns in %.\"\n html_code = f\"\"\"\"\"\"\n html_code += f\"\"\" \\n \\n\"\"\"\n for i in items[0]: html_code += f\" \"\n html_code += f\" \\n \\n \"\n html_code += \" \"\n for line in items[1:]:\n html_code += \" \"\n for i in line: html_code += f\" \"\n html_code += \" \"\n html_code += \" \\n
{_treat_html(i)}
{_treat_html(i)}
\"\n return html_code"} {"instruction": "Create a Python 3 function for\ncalling func on every element of arr in parallel using max_workers.", "response": "def parallel(func, arr:Collection, max_workers:int=None):\n \"Call `func` on every element of `arr` in parallel using `max_workers`.\"\n max_workers = ifnone(max_workers, defaults.cpus)\n if max_workers<2: results = [func(o,i) for i,o in progress_bar(enumerate(arr), total=len(arr))]\n else:\n with ProcessPoolExecutor(max_workers=max_workers) as ex:\n futures = [ex.submit(func,o,i) for i,o in enumerate(arr)]\n results = []\n for f in progress_bar(concurrent.futures.as_completed(futures), total=len(arr)): results.append(f.result())\n if any([o is not None for o in results]): return results"} {"instruction": "Explain what the following Python 3 code does\ndef subplots(rows:int, cols:int, imgsize:int=4, figsize:Optional[Tuple[int,int]]=None, title=None, **kwargs):\n \"Like `plt.subplots` but with consistent axs shape, `kwargs` passed to `fig.suptitle` with `title`\"\n figsize = ifnone(figsize, (imgsize*cols, imgsize*rows))\n fig, axs = plt.subplots(rows,cols,figsize=figsize)\n if rows==cols==1: axs = [[axs]] # subplots(1,1) returns Axes, not [Axes]\n elif (rows==1 and cols!=1) or (cols==1 and rows!=1): axs = [axs]\n if title is not None: fig.suptitle(title, **kwargs)\n return array(axs)", "response": "Like plt. subplots but with consistent axs shape kwargs passed to fig. 
suptitle with title"} {"instruction": "Can you write a function in Python 3 where it\nreturns the representation of the first n_max elements in items.", "response": "def show_some(items:Collection, n_max:int=5, sep:str=','):\n \"Return the representation of the first `n_max` elements in `items`.\"\n if items is None or len(items) == 0: return ''\n res = sep.join([f'{o}' for o in items[:n_max]])\n if len(items) > n_max: res += '...'\n return res"} {"instruction": "Can you implement a function in Python 3 that\ncreates and return a tmp filename optionally at a specific path. os. remove when done with it.", "response": "def get_tmp_file(dir=None):\n \"Create and return a tmp filename, optionally at a specific path. `os.remove` when done with it.\"\n with tempfile.NamedTemporaryFile(delete=False, dir=dir) as f: return f.name"} {"instruction": "How would you implement a function in Python 3 that\ncreates a seuence Conv2d -> BatchNorm2d -> LeakyReLu layer.", "response": "def conv_bn_lrelu(ni:int, nf:int, ks:int=3, stride:int=1)->nn.Sequential:\n \"Create a seuence Conv2d->BatchNorm2d->LeakyReLu layer.\"\n return nn.Sequential(\n nn.Conv2d(ni, nf, kernel_size=ks, bias=False, stride=stride, padding=ks//2),\n nn.BatchNorm2d(nf),\n nn.LeakyReLU(negative_slope=0.1, inplace=True))"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef make_group_layer(self, ch_in:int, num_blocks:int, stride:int=1):\n \"starts with conv layer - `ch_in` channels in - then has `num_blocks` `ResLayer`\"\n return [conv_bn_lrelu(ch_in, ch_in*2,stride=stride)\n ] + [(ResLayer(ch_in*2)) for i in range(num_blocks)]", "response": "starts with conv layer - ch_in channels in - then has num_blocks ResLayer"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef collab_learner(data, n_factors:int=None, use_nn:bool=False, emb_szs:Dict[str,int]=None, layers:Collection[int]=None, \n ps:Collection[float]=None, emb_drop:float=0., y_range:OptRange=None, 
use_bn:bool=True, \n bn_final:bool=False, **learn_kwargs)->Learner:\n \"Create a Learner for collaborative filtering on `data`.\"\n emb_szs = data.get_emb_szs(ifnone(emb_szs, {}))\n u,m = data.train_ds.x.classes.values()\n if use_nn: model = EmbeddingNN(emb_szs=emb_szs, layers=layers, ps=ps, emb_drop=emb_drop, y_range=y_range, \n use_bn=use_bn, bn_final=bn_final, **learn_kwargs)\n else: model = EmbeddingDotBias(n_factors, len(u), len(m), y_range=y_range)\n return CollabLearner(data, model, **learn_kwargs)", "response": "Create a Learner for collaborative filtering on data."} {"instruction": "Write a Python 3 script for\ncreating a DataBunch suitable for collaborative filtering from ratings.", "response": "def from_df(cls, ratings:DataFrame, valid_pct:float=0.2, user_name:Optional[str]=None, item_name:Optional[str]=None,\n rating_name:Optional[str]=None, test:DataFrame=None, seed:int=None, path:PathOrStr='.', bs:int=64, \n val_bs:int=None, num_workers:int=defaults.cpus, dl_tfms:Optional[Collection[Callable]]=None, \n device:torch.device=None, collate_fn:Callable=data_collate, no_check:bool=False) -> 'CollabDataBunch':\n \"Create a `DataBunch` suitable for collaborative filtering from `ratings`.\"\n user_name = ifnone(user_name, ratings.columns[0])\n item_name = ifnone(item_name, ratings.columns[1])\n rating_name = ifnone(rating_name,ratings.columns[2])\n cat_names = [user_name,item_name]\n src = (CollabList.from_df(ratings, cat_names=cat_names, procs=Categorify)\n .split_by_rand_pct(valid_pct=valid_pct, seed=seed).label_from_df(cols=rating_name))\n if test is not None: src.add_test(CollabList.from_df(test, cat_names=cat_names))\n return src.databunch(path=path, bs=bs, val_bs=val_bs, num_workers=num_workers, device=device, \n collate_fn=collate_fn, no_check=no_check)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef get_idx(self, arr:Collection, is_item:bool=True):\n \"Fetch item or user (based on `is_item`) for all in `arr`. 
(Set model to `cpu` and no grad.)\"\n m = self.model.eval().cpu()\n requires_grad(m,False)\n u_class,i_class = self.data.train_ds.x.classes.values()\n classes = i_class if is_item else u_class\n c2i = {v:k for k,v in enumerate(classes)}\n try: return tensor([c2i[o] for o in arr])\n except Exception as e: \n print(f\"\"\"You're trying to access {'an item' if is_item else 'a user'} that isn't in the training data.\n If it was in your original data, it may have been split such that it's only in the validation set now.\"\"\")", "response": "Fetch item or user for all in arr. (Set model to cpu and no grad."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef bias(self, arr:Collection, is_item:bool=True):\n \"Bias for item or user (based on `is_item`) for all in `arr`. (Set model to `cpu` and no grad.)\"\n idx = self.get_idx(arr, is_item)\n m = self.model\n layer = m.i_bias if is_item else m.u_bias\n return layer(idx).squeeze()", "response": "Bias for item or user for all in arr. ( Set model to cpu and no grad."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef weight(self, arr:Collection, is_item:bool=True):\n \"Bias for item or user (based on `is_item`) for all in `arr`. (Set model to `cpu` and no grad.)\"\n idx = self.get_idx(arr, is_item)\n m = self.model\n layer = m.i_weight if is_item else m.u_weight\n return layer(idx)", "response": "Bias for item or user for all in arr. 
( Set model to cpu and no grad."} {"instruction": "How would you explain what the following Python 3 function does\ndef draw_tree(t, df, size=10, ratio=0.6, precision=0):\n s=export_graphviz(t, out_file=None, feature_names=df.columns, filled=True,\n special_characters=True, rotate=True, precision=precision)\n IPython.display.display(graphviz.Source(re.sub('Tree {',\n f'Tree {{ size={size}; ratio={ratio}', s)))", "response": "Draw a representation of a random forest in IPython."} {"instruction": "Implement a Python 3 function for\nreturning a random sample of n rows from df without replacement.", "response": "def get_sample(df,n):\n \"\"\" Gets a random sample of n rows from df, without replacement.\n Parameters:\n -----------\n df: A pandas data frame, that you wish to sample from.\n n: The number of rows you wish to sample.\n Returns:\n --------\n return value: A random sample of n rows of df.\n Examples:\n ---------\n >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})\n >>> df\n col1 col2\n 0 1 a\n 1 2 b\n 2 3 a\n >>> get_sample(df, 2)\n col1 col2\n 1 2 b\n 2 3 a\n \"\"\"\n idxs = sorted(np.random.permutation(len(df))[:n])\n return df.iloc[idxs].copy()"} {"instruction": "How would you code a function in Python 3 to\ntrain the panda s dataframe to a column of categorical values.", "response": "def train_cats(df):\n \"\"\"Change any columns of strings in a panda's dataframe to a column of\n categorical values. This applies the changes inplace.\n Parameters:\n -----------\n df: A pandas dataframe. 
Any columns of strings will be changed to\n categorical values.\n Examples:\n ---------\n >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})\n >>> df\n col1 col2\n 0 1 a\n 1 2 b\n 2 3 a\n note the type of col2 is string\n >>> train_cats(df)\n >>> df\n col1 col2\n 0 1 a\n 1 2 b\n 2 3 a\n now the type of col2 is category\n \"\"\"\n for n,c in df.items():\n if is_string_dtype(c): df[n] = c.astype('category').cat.as_ordered()"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef apply_cats(df, trn):\n for n,c in df.items():\n if (n in trn.columns) and (trn[n].dtype.name=='category'):\n df[n] = c.astype('category').cat.as_ordered()\n df[n].cat.set_categories(trn[n].cat.categories, ordered=True, inplace=True)", "response": "Applies any columns of strings in df into categorical variables using trn as\nApps."} {"instruction": "Implement a function in Python 3 to\nfill missing data in a column of df with the median, and add a {name}_na column which specifies if the data was missing. Parameters: ----------- df: The data frame that will be changed. col: The column of data to fix by filling in missing data. name: The name of the new filled column in df. na_dict: A dictionary of values to create na's of and the value to insert. If name is not a key of na_dict the median will fill any missing data. Also if name is not a key of na_dict and there is no missing data in col, then no {name}_na column is not created. 
Examples: --------- >>> df = pd.DataFrame({'col1' : [1, np.NaN, 3], 'col2' : [5, 2, 2]}) >>> df col1 col2 0 1 5 1 nan 2 2 3 2 >>> fix_missing(df, df['col1'], 'col1', {}) >>> df col1 col2 col1_na 0 1 5 False 1 2 2 True 2 3 2 False >>> df = pd.DataFrame({'col1' : [1, np.NaN, 3], 'col2' : [5, 2, 2]}) >>> df col1 col2 0 1 5 1 nan 2 2 3 2 >>> fix_missing(df, df['col2'], 'col2', {}) >>> df col1 col2 0 1 5 1 nan 2 2 3 2 >>> df = pd.DataFrame({'col1' : [1, np.NaN, 3], 'col2' : [5, 2, 2]}) >>> df col1 col2 0 1 5 1 nan 2 2 3 2 >>> fix_missing(df, df['col1'], 'col1', {'col1' : 500}) >>> df col1 col2 col1_na 0 1 5 False 1 500 2 True 2 3 2 False", "response": "def fix_missing(df, col, name, na_dict):\n \"\"\" Fill missing data in a column of df with the median, and add a {name}_na column\n which specifies if the data was missing.\n Parameters:\n -----------\n df: The data frame that will be changed.\n col: The column of data to fix by filling in missing data.\n name: The name of the new filled column in df.\n na_dict: A dictionary of values to create na's of and the value to insert. If\n name is not a key of na_dict the median will fill any missing data. 
Also\n if name is not a key of na_dict and there is no missing data in col, then\n no {name}_na column is not created.\n Examples:\n ---------\n >>> df = pd.DataFrame({'col1' : [1, np.NaN, 3], 'col2' : [5, 2, 2]})\n >>> df\n col1 col2\n 0 1 5\n 1 nan 2\n 2 3 2\n >>> fix_missing(df, df['col1'], 'col1', {})\n >>> df\n col1 col2 col1_na\n 0 1 5 False\n 1 2 2 True\n 2 3 2 False\n >>> df = pd.DataFrame({'col1' : [1, np.NaN, 3], 'col2' : [5, 2, 2]})\n >>> df\n col1 col2\n 0 1 5\n 1 nan 2\n 2 3 2\n >>> fix_missing(df, df['col2'], 'col2', {})\n >>> df\n col1 col2\n 0 1 5\n 1 nan 2\n 2 3 2\n >>> df = pd.DataFrame({'col1' : [1, np.NaN, 3], 'col2' : [5, 2, 2]})\n >>> df\n col1 col2\n 0 1 5\n 1 nan 2\n 2 3 2\n >>> fix_missing(df, df['col1'], 'col1', {'col1' : 500})\n >>> df\n col1 col2 col1_na\n 0 1 5 False\n 1 500 2 True\n 2 3 2 False\n \"\"\"\n if is_numeric_dtype(col):\n if pd.isnull(col).sum() or (name in na_dict):\n df[name+'_na'] = pd.isnull(col)\n filler = na_dict[name] if name in na_dict else col.median()\n df[name] = col.fillna(filler)\n na_dict[name] = filler\n return na_dict"} {"instruction": "Given the following Python 3 function, write the documentation\ndef numericalize(df, col, name, max_n_cat):\n if not is_numeric_dtype(col) and ( max_n_cat is None or len(col.cat.categories)>max_n_cat):\n df[name] = pd.Categorical(col).codes+1", "response": "Changes the column col from a categorical type to its integer codes."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef proc_df(df, y_fld=None, skip_flds=None, ignore_flds=None, do_scale=False, na_dict=None,\n preproc_fn=None, max_n_cat=None, subset=None, mapper=None):\n \"\"\" proc_df takes a data frame df and splits off the response variable, and\n changes the df into an entirely numeric dataframe. 
For each column of df \n which is not in skip_flds nor in ignore_flds, na values are replaced by the\n median value of the column.\n Parameters:\n -----------\n df: The data frame you wish to process.\n y_fld: The name of the response variable\n skip_flds: A list of fields that dropped from df.\n ignore_flds: A list of fields that are ignored during processing.\n do_scale: Standardizes each column in df. Takes Boolean Values(True,False)\n na_dict: a dictionary of na columns to add. Na columns are also added if there\n are any missing values.\n preproc_fn: A function that gets applied to df.\n max_n_cat: The maximum number of categories to break into dummy values, instead\n of integer codes.\n subset: Takes a random subset of size subset from df.\n mapper: If do_scale is set as True, the mapper variable\n calculates the values used for scaling of variables during training time (mean and standard deviation).\n Returns:\n --------\n [x, y, nas, mapper(optional)]:\n x: x is the transformed version of df. 
x will not have the response variable\n and is entirely numeric.\n y: y is the response variable\n nas: returns a dictionary of which nas it created, and the associated median.\n mapper: A DataFrameMapper which stores the mean and standard deviation of the corresponding continuous\n variables which is then used for scaling of during test-time.\n Examples:\n ---------\n >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})\n >>> df\n col1 col2\n 0 1 a\n 1 2 b\n 2 3 a\n note the type of col2 is string\n >>> train_cats(df)\n >>> df\n col1 col2\n 0 1 a\n 1 2 b\n 2 3 a\n now the type of col2 is category { a : 1, b : 2}\n >>> x, y, nas = proc_df(df, 'col1')\n >>> x\n col2\n 0 1\n 1 2\n 2 1\n >>> data = DataFrame(pet=[\"cat\", \"dog\", \"dog\", \"fish\", \"cat\", \"dog\", \"cat\", \"fish\"],\n children=[4., 6, 3, 3, 2, 3, 5, 4],\n salary=[90, 24, 44, 27, 32, 59, 36, 27])\n >>> mapper = DataFrameMapper([(:pet, LabelBinarizer()),\n ([:children], StandardScaler())])\n >>>round(fit_transform!(mapper, copy(data)), 2)\n 8x4 Array{Float64,2}:\n 1.0 0.0 0.0 0.21\n 0.0 1.0 0.0 1.88\n 0.0 1.0 0.0 -0.63\n 0.0 0.0 1.0 -0.63\n 1.0 0.0 0.0 -1.46\n 0.0 1.0 0.0 -0.63\n 1.0 0.0 0.0 1.04\n 0.0 0.0 1.0 0.21\n \"\"\"\n if not ignore_flds: ignore_flds=[]\n if not skip_flds: skip_flds=[]\n if subset: df = get_sample(df,subset)\n else: df = df.copy()\n ignored_flds = df.loc[:, ignore_flds]\n df.drop(ignore_flds, axis=1, inplace=True)\n if preproc_fn: preproc_fn(df)\n if y_fld is None: y = None\n else:\n if not is_numeric_dtype(df[y_fld]): df[y_fld] = pd.Categorical(df[y_fld]).codes\n y = df[y_fld].values\n skip_flds += [y_fld]\n df.drop(skip_flds, axis=1, inplace=True)\n\n if na_dict is None: na_dict = {}\n else: na_dict = na_dict.copy()\n na_dict_initial = na_dict.copy()\n for n,c in df.items(): na_dict = fix_missing(df, c, n, na_dict)\n if len(na_dict_initial.keys()) > 0:\n df.drop([a + '_na' for a in list(set(na_dict.keys()) - set(na_dict_initial.keys()))], axis=1, 
inplace=True)\n if do_scale: mapper = scale_vars(df, mapper)\n for n,c in df.items(): numericalize(df, c, n, max_n_cat)\n df = pd.get_dummies(df, dummy_na=True)\n df = pd.concat([ignored_flds, df], axis=1)\n res = [df, y, na_dict]\n if do_scale: res = res + [mapper]\n return res", "response": "This function processes the response variable and returns a DataFrame of the response variable and the corresponding median value."} {"instruction": "Make a summary of the following Python 3 code\ndef set_rf_samples(n):\n forest._generate_sample_indices = (lambda rs, n_samples:\n forest.check_random_state(rs).randint(0, n_samples, n))", "response": "Changes Scikit learn s random forests to give each tree a random sample of n random rows."} {"instruction": "Create a Python 3 function to\nundo the changes produced by set_rf_samples.", "response": "def reset_rf_samples():\n \"\"\" Undoes the changes produced by set_rf_samples.\n \"\"\"\n forest._generate_sample_indices = (lambda rs, n_samples:\n forest.check_random_state(rs).randint(0, n_samples, n_samples))"} {"instruction": "Write a Python 3 script for\nreturning globally assigned variables.", "response": "def get_global_vars(mod):\n \"Return globally assigned variables.\"\n # https://stackoverflow.com/questions/8820276/docstring-for-variable/31764368#31764368\n import ast,re\n with open(mod.__file__, 'r') as f: fstr = f.read()\n flines = fstr.splitlines()\n d = {}\n for node in ast.walk(ast.parse(fstr)):\n if isinstance(node,ast.Assign) and hasattr(node.targets[0], 'id'):\n key,lineno = node.targets[0].id,node.targets[0].lineno\n codestr = flines[lineno]\n match = re.match(f\"^({key})\\s*=\\s*.*\", codestr)\n if match and match.group(1) != '__all__': # only top level assignment\n d[key] = f'`{codestr}` {get_source_link(mod, lineno)}'\n return d"} {"instruction": "Write a Python 3 function for\nexecuting notebook fname with metadata for preprocessing.", "response": "def execute_nb(fname, metadata=None, save=True, 
show_doc_only=False):\n \"Execute notebook `fname` with `metadata` for preprocessing.\"\n # Any module used in the notebook that isn't inside must be in the same directory as this script\n with open(fname) as f: nb = nbformat.read(f, as_version=4)\n ep_class = ExecuteShowDocPreprocessor if show_doc_only else ExecutePreprocessor\n ep = ep_class(timeout=600, kernel_name='python3')\n metadata = metadata or {}\n ep.preprocess(nb, metadata)\n if save:\n with open(fname, 'wt') as f: nbformat.write(nb, f)\n NotebookNotary().sign(nb)"} {"instruction": "Implement a function in Python 3 to\ncreate the documentation notebook for module mod_name in path dest_path", "response": "def create_module_page(mod, dest_path, force=False):\n \"Create the documentation notebook for module `mod_name` in path `dest_path`\"\n nb = get_empty_notebook()\n mod_name = mod.__name__\n strip_name = strip_fastai(mod_name)\n init_cell = [get_md_cell(f'## Title for {strip_name} (use plain english, not module name!)'), get_md_cell('Type an introduction of the package here.')]\n cells = [get_code_cell(f'from fastai.gen_doc.nbdoc import *\\nfrom {mod_name} import * ', True)]\n\n gvar_map = get_global_vars(mod)\n if gvar_map: cells.append(get_md_cell('### Global Variable Definitions:'))\n for name in get_exports(mod):\n if name in gvar_map: cells.append(get_md_cell(gvar_map[name]))\n\n for ft_name in get_ft_names(mod, include_inner=True):\n if not hasattr(mod, ft_name):\n warnings.warn(f\"Module {strip_name} doesn't have a function named {ft_name}.\")\n continue\n cells += _symbol_skeleton(ft_name)\n elt = getattr(mod, ft_name)\n nb['cells'] = init_cell + cells + [get_md_cell(UNDOC_HEADER)]\n\n doc_path = get_doc_path(mod, dest_path)\n write_nb(nb, doc_path, 'w' if force else 'x')\n execute_nb(doc_path)\n return doc_path"} {"instruction": "How would you code a function in Python 3 to\nsearch a given path_dir and return all the modules contained inside except those in exclude", "response": "def 
get_module_names(path_dir, exclude=None):\n if exclude is None: exclude = _default_exclude\n \"Search a given `path_dir` and return all the modules contained inside except those in `exclude`\"\n files = sorted(path_dir.glob('*'), key=lambda x: (x.is_dir(), x.name), reverse=True) # directories first\n res = [f'{path_dir.name}']\n for f in files:\n if f.is_dir() and f.name in exclude: continue # exclude directories\n if any([f.name.endswith(ex) for ex in exclude]): continue # exclude extensions\n\n if f.suffix == '.py': res.append(f'{path_dir.name}.{f.stem}')\n elif f.is_dir(): res += [f'{path_dir.name}.{name}' for name in get_module_names(f)]\n return res"} {"instruction": "Can you implement a function in Python 3 that\nreads a notebook in fname and return its corresponding json", "response": "def read_nb(fname):\n \"Read a notebook in `fname` and return its corresponding json\"\n with open(fname,'r') as f: return nbformat.reads(f.read(), as_version=4)"} {"instruction": "How would you implement a function in Python 3 that\nbuilds a dictionary containing the position of the cells.", "response": "def read_nb_content(cells, mod_name):\n \"Build a dictionary containing the position of the `cells`.\"\n doc_fns = {}\n for i, cell in enumerate(cells):\n if cell['cell_type'] == 'code':\n for match in SHOW_DOC_RE.findall(cell['source']):\n doc_fns[match] = i\n return doc_fns"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef link_markdown_cells(cells, modules):\n \"Create documentation links for all cells in markdown with backticks.\"\n for i, cell in enumerate(cells):\n if cell['cell_type'] == 'markdown':\n cell['source'] = link_docstring(modules, cell['source'])", "response": "Create documentation links for all cells in markdown with backticks."} {"instruction": "Write a Python 3 script to\nreturn the position to insert a given function doc in a notebook.", "response": "def get_insert_idx(pos_dict, name):\n \"Return the position to insert a 
given function doc in a notebook.\"\n keys,i = list(pos_dict.keys()),0\n while i < len(keys) and str.lower(keys[i]) < str.lower(name): i+=1\n if i == len(keys): return -1\n else: return pos_dict[keys[i]]"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef update_pos(pos_dict, start_key, nbr=2):\n \"Update the `pos_dict` by moving all positions after `start_key` by `nbr`.\"\n for key,idx in pos_dict.items():\n if str.lower(key) >= str.lower(start_key): pos_dict[key] += nbr\n return pos_dict", "response": "Update the pos_dict by moving all positions after start_key by nbr."} {"instruction": "Make a summary of the following Python 3 code\ndef insert_cells(cells, pos_dict, ft_name, append=False):\n \"Insert the function doc `cells` at their correct position and updates `pos_dict`.\"\n idx = get_insert_idx(pos_dict, ft_name)\n if append or idx == -1: cells += [get_doc_cell(ft_name), get_empty_cell()]\n else:\n cells.insert(idx, get_doc_cell(ft_name))\n cells.insert(idx+1, get_empty_cell())\n pos_dict = update_pos(pos_dict, ft_name, 2)\n return cells, pos_dict", "response": "Insert the function doc cells at their correct position and updates pos_dict."} {"instruction": "Implement a Python 3 function for\ncreating jekyll metadata for given notebook path.", "response": "def update_nb_metadata(nb_path=None, title=None, summary=None, keywords='fastai', overwrite=True, **kwargs):\n \"Creates jekyll metadata for given notebook path.\"\n nb = read_nb(nb_path)\n data = {'title': title, 'summary': summary, 'keywords': keywords, **kwargs}\n data = {k:v for (k,v) in data.items() if v is not None} # remove none values\n if not data: return\n nb['metadata']['jekyll'] = data\n write_nb(nb, nb_path)\n NotebookNotary().sign(nb)"} {"instruction": "Write a Python 3 function for\nfinding all submodules of notebook - sorted by submodules > top level modules > manual imports. 
This gives notebook imports priority", "response": "def get_imported_modules(cells, nb_module_name=''):\n \"Finds all submodules of notebook - sorted by submodules > top level modules > manual imports. This gives notebook imports priority\"\n module_names = get_top_level_modules()\n nb_imports = [match.group(1) for cell in cells for match in IMPORT_RE.finditer(cell['source']) if cell['cell_type'] == 'code']\n parts = nb_module_name.split('.')\n parent_modules = ['.'.join(parts[:(x+1)]) for x in range_of(parts)] # Imports parent modules - a.b.c = [a, a.b, a.b.c]\n all_modules = module_names + nb_imports + parent_modules\n mods = [import_mod(m, ignore_errors=True) for m in all_modules]\n return [m for m in mods if m is not None]"} {"instruction": "Given the following Python 3 function, write the documentation\ndef update_module_page(mod, dest_path='.'):\n \"Update the documentation notebook of a given module.\"\n doc_path = get_doc_path(mod, dest_path)\n strip_name = strip_fastai(mod.__name__)\n nb = read_nb(doc_path)\n cells = nb['cells']\n\n link_markdown_cells(cells, get_imported_modules(cells, mod.__name__))\n\n type_dict = read_nb_types(cells)\n gvar_map = get_global_vars(mod)\n for name in get_exports(mod):\n if name not in gvar_map: continue\n code = gvar_map[name]\n if name in type_dict: cells[type_dict[name]] = get_md_cell(code)\n else: cells.append(get_md_cell(code))\n\n pos_dict = read_nb_content(cells, strip_name)\n ft_names = get_ft_names(mod, include_inner=True)\n new_fts = list(set(ft_names) - set(pos_dict.keys()))\n if new_fts: print(f'Found new fuctions for {mod}. 
Please document:\\n{new_fts}')\n existing, undoc_cells, new_cells = parse_sections(cells)\n for ft_name in new_fts: new_cells.extend([get_doc_cell(ft_name), get_empty_cell()])\n if len(new_cells) > 1: nb['cells'] = existing + undoc_cells + new_cells\n\n write_nb(nb, doc_path)\n return doc_path", "response": "Update the documentation notebook of a given module."} {"instruction": "Given the following Python 3 function, write the documentation\ndef update_notebooks(source_path, dest_path=None, update_html=True, document_new_fns=False,\n update_nb_links=True, html_path=None, force=False):\n \"`source_path` can be a directory or a file. Assume all modules reside in the fastai directory.\"\n from .convert2html import convert_nb\n source_path = Path(source_path)\n\n if source_path.is_file():\n dest_path = source_path.parent if dest_path is None else Path(dest_path)\n html_path = dest_path/'..'/'docs' if html_path is None else Path(html_path)\n doc_path = source_path\n assert source_path.suffix == '.ipynb', 'Must update from notebook or module'\n if document_new_fns:\n mod = import_mod(get_module_from_notebook(source_path))\n if not mod: print('Could not find module for path:', source_path)\n elif mod.__file__.endswith('__init__.py'): pass\n else: update_module_page(mod, dest_path)\n generate_missing_metadata(doc_path)\n if update_nb_links:\n print(f'Updating notebook {doc_path}. 
Please wait...')\n link_nb(doc_path)\n execute_nb(doc_path, {'metadata': {'path': doc_path.parent}}, show_doc_only=True)\n if update_html:\n check_nbconvert_version()\n html_fn = html_path/doc_path.with_suffix('.html').name\n if not force and html_fn.is_file():\n in_mod = os.path.getmtime(doc_path)\n out_mod = os.path.getmtime(html_fn)\n if in_mod < out_mod: return\n convert_nb(doc_path, html_path)\n\n elif (source_path.name.startswith('fastai.')):\n # Do module update\n assert dest_path is not None, 'To update a module, you must specify a destination folder for where notebook resides'\n mod = import_mod(source_path.name)\n if not mod: return print('Could not find module for:', source_path)\n doc_path = Path(dest_path)/(strip_fastai(mod.__name__)+'.ipynb')\n if not doc_path.exists():\n print('Notebook does not exist. Creating:', doc_path)\n create_module_page(mod, dest_path)\n update_notebooks(doc_path, dest_path=dest_path, update_html=update_html, document_new_fns=document_new_fns,\n update_nb_links=update_nb_links, html_path=html_path)\n elif source_path.is_dir():\n for f in sorted(Path(source_path).glob('*.ipynb')):\n update_notebooks(f, dest_path=dest_path, update_html=update_html, document_new_fns=document_new_fns,\n update_nb_links=update_nb_links, html_path=html_path)\n else: print('Could not resolve source file:', source_path)", "response": "source_path can be a directory or a file. 
Assume all modules reside in the fastai directory."} {"instruction": "Implement a function in Python 3 to\nreturn a dropout mask of the same type as x size sz with probability p to cancel an element.", "response": "def dropout_mask(x:Tensor, sz:Collection[int], p:float):\n \"Return a dropout mask of the same type as `x`, size `sz`, with probability `p` to cancel an element.\"\n return x.new(*sz).bernoulli_(1-p).div_(1-p)"} {"instruction": "How would you implement a function in Python 3 that\nsplits a RNN model in groups for differential learning rates.", "response": "def awd_lstm_lm_split(model:nn.Module) -> List[nn.Module]:\n \"Split a RNN `model` in groups for differential learning rates.\"\n groups = [[rnn, dp] for rnn, dp in zip(model[0].rnns, model[0].hidden_dps)]\n return groups + [[model[0].encoder, model[0].encoder_dp, model[1]]]"} {"instruction": "Write a Python 3 script for\nconverting a value x from 0 to 1 to an RGBA tuple according to cmap times transparency alpha_mult.", "response": "def value2rgba(x:float, cmap:Callable=cm.RdYlGn, alpha_mult:float=1.0)->Tuple:\n \"Convert a value `x` from 0 to 1 (inclusive) to an RGBA tuple according to `cmap` times transparency `alpha_mult`.\"\n c = cmap(x)\n rgb = (np.array(c[:-1]) * 255).astype(int)\n a = c[-1] * alpha_mult\n return tuple(rgb.tolist() + [a])"} {"instruction": "How would you implement a function in Python 3 that\napplies dropout to the raw weights.", "response": "def _setweights(self):\n \"Apply dropout to the raw weights.\"\n for layer in self.layer_names:\n raw_w = getattr(self, f'{layer}_raw')\n self.module._parameters[layer] = F.dropout(raw_w, p=self.weight_p, training=self.training)"} {"instruction": "Write a Python 3 function for\nreturning one hidden state.", "response": "def _one_hidden(self, l:int)->Tensor:\n \"Return one hidden state.\"\n nh = (self.n_hid if l != self.n_layers - 1 else self.emb_sz) // self.n_dir\n return one_param(self).new(1, self.bs, nh).zero_()"} {"instruction": "Can 
you generate the documentation for the following Python 3 function\ndef reset(self):\n \"Reset the hidden states.\"\n [r.reset() for r in self.rnns if hasattr(r, 'reset')]\n if self.qrnn: self.hidden = [self._one_hidden(l) for l in range(self.n_layers)]\n else: self.hidden = [(self._one_hidden(l), self._one_hidden(l)) for l in range(self.n_layers)]", "response": "Reset the hidden states."} {"instruction": "Create a Python 3 function to\ncalculate the intrinsic attention of the input w. r. t to an output class_id or the classification given by the model.", "response": "def intrinsic_attention(self, text:str, class_id:int=None):\n \"\"\"Calculate the intrinsic attention of the input w.r.t to an output `class_id`, or the classification given by the model if `None`.\n For reference, see the Sequential Jacobian session at https://www.cs.toronto.edu/~graves/preprint.pdf\n \"\"\"\n self.model.train()\n _eval_dropouts(self.model)\n self.model.zero_grad()\n self.model.reset()\n ids = self.data.one_item(text)[0]\n emb = self.model[0].module.encoder(ids).detach().requires_grad_(True)\n lstm_output = self.model[0].module(emb, from_embeddings=True)\n self.model.eval()\n cl = self.model[1](lstm_output + (torch.zeros_like(ids).byte(),))[0].softmax(dim=-1)\n if class_id is None: class_id = cl.argmax()\n cl[0][class_id].backward()\n attn = emb.grad.squeeze().abs().sum(dim=-1)\n attn /= attn.max()\n tokens = self.data.single_ds.reconstruct(ids[0])\n return tokens, attn"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef show_top_losses(self, k:int, max_len:int=70)->None:\n from IPython.display import display, HTML\n items = []\n tl_val,tl_idx = self.top_losses()\n for i,idx in enumerate(tl_idx):\n if k <= 0: break\n k -= 1\n tx,cl = self.data.dl(self.ds_type).dataset[idx]\n cl = cl.data\n classes = self.data.classes\n txt = ' '.join(tx.text.split(' ')[:max_len]) if max_len is not None else tx.text\n tmp = [txt, 
f'{classes[self.pred_class[idx]]}', f'{classes[cl]}', f'{self.losses[idx]:.2f}',\n f'{self.probs[idx][cl]:.2f}']\n items.append(tmp)\n items = np.array(items)\n names = ['Text', 'Prediction', 'Actual', 'Loss', 'Probability']\n df = pd.DataFrame({n:items[:,i] for i,n in enumerate(names)}, columns=names)\n with pd.option_context('display.max_colwidth', -1):\n display(HTML(df.to_html(index=False)))", "response": "Create a tabulation showing the first k texts in top_losses along with their prediction actual loss and probability of the actual class."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef on_train_begin(self, epoch:int, **kwargs:Any)->None:\n \"Initialize the schedulers for training.\"\n res = {'epoch':self.start_epoch} if self.start_epoch is not None else None\n self.start_epoch = ifnone(self.start_epoch, epoch)\n self.scheds = [p.scheds for p in self.phases]\n self.opt = self.learn.opt\n for k,v in self.scheds[0].items(): \n v.restart()\n self.opt.set_stat(k, v.start)\n self.idx_s = 0\n return res", "response": "Initialize the schedulers for training."} {"instruction": "Create a Python 3 function to\ntake a step in lr mom sched start next stepper when the current one is complete.", "response": "def on_batch_end(self, train, **kwargs:Any)->None:\n \"Take a step in lr,mom sched, start next stepper when the current one is complete.\"\n if train:\n if self.idx_s >= len(self.scheds): return {'stop_training': True, 'stop_epoch': True}\n sched = self.scheds[self.idx_s]\n for k,v in sched.items(): self.opt.set_stat(k, v.step())\n if list(sched.values())[0].is_done: self.idx_s += 1"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef tensor(x:Any, *rest)->Tensor:\n \"Like `torch.as_tensor`, but handle lists too, and can pass multiple vector elements directly.\"\n if len(rest): x = (x,)+rest\n # XXX: Pytorch bug in dataloader using num_workers>0; TODO: create repro and report\n if 
is_listy(x) and len(x)==0: return tensor(0)\n res = torch.tensor(x) if is_listy(x) else as_tensor(x)\n if res.dtype is torch.int32:\n warn('Tensor is int32: upgrading to int64; for better performance use int64 input')\n return res.long()\n return res", "response": "Like torch. as_tensor but handle lists too and can pass multiple vector elements directly."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef to_detach(b:Tensors, cpu:bool=True):\n \"Recursively detach lists of tensors in `b `; put them on the CPU if `cpu=True`.\"\n if is_listy(b): return [to_detach(o, cpu) for o in b]\n if not isinstance(b,Tensor): return b\n b = b.detach()\n return b.cpu() if cpu else b", "response": "Recursively detach lists of tensors in b ; put them on the CPU if cpu = True."} {"instruction": "Given the following Python 3 function, write the documentation\ndef to_cpu(b:ItemsList):\n \"Recursively map lists of tensors in `b ` to the cpu.\"\n if is_listy(b): return [to_cpu(o) for o in b]\n return b.cpu() if isinstance(b,Tensor) else b", "response": "Recursively map lists of tensors in b to the cpu."} {"instruction": "How would you code a function in Python 3 to\nconvert batch items to tensor data.", "response": "def data_collate(batch:ItemsList)->Tensor:\n \"Convert `batch` items to tensor data.\"\n return torch.utils.data.dataloader.default_collate(to_data(batch))"} {"instruction": "Can you write a function in Python 3 where it\nreturns list of trainable params in m.", "response": "def trainable_params(m:nn.Module)->ParamList:\n \"Return list of trainable params in `m`.\"\n res = filter(lambda p: p.requires_grad, m.parameters())\n return res"} {"instruction": "How would you explain what the following Python 3 function does\ndef children_and_parameters(m:nn.Module):\n \"Return the children of `m` and its direct parameters not registered in modules.\"\n children = list(m.children())\n children_p = sum([[id(p) for p in c.parameters()] for c in 
m.children()],[])\n for p in m.parameters():\n if id(p) not in children_p: children.append(ParameterModule(p))\n return children", "response": "Return the children of m and its direct parameters not registered in modules."} {"instruction": "Implement a Python 3 function for\nsplitting model according to the indexes in idxs.", "response": "def split_model_idx(model:nn.Module, idxs:Collection[int])->ModuleList:\n \"Split `model` according to the indexes in `idxs`.\"\n layers = flatten_model(model)\n if idxs[0] != 0: idxs = [0] + idxs\n if idxs[-1] != len(layers): idxs.append(len(layers))\n return [nn.Sequential(*layers[i:j]) for i,j in zip(idxs[:-1],idxs[1:])]"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef split_model(model:nn.Module=None, splits:Collection[Union[nn.Module,ModuleList]]=None):\n \"Split `model` according to the layers in `splits`.\"\n splits = listify(splits)\n if isinstance(splits[0], nn.Module):\n layers = flatten_model(model)\n idxs = [layers.index(first_layer(s)) for s in splits]\n return split_model_idx(model, idxs)\n return [nn.Sequential(*s) for s in splits]", "response": "Split model according to the layers in splits."} {"instruction": "Write a Python 3 script to\nseparate the parameters in layer_groups between no_wd_types and bias ( bias_types from the rest.", "response": "def split_no_wd_params(layer_groups:Collection[nn.Module])->List[List[nn.Parameter]]:\n \"Separate the parameters in `layer_groups` between `no_wd_types` and bias (`bias_types`) from the rest.\"\n split_params = []\n for l in layer_groups:\n l1,l2 = [],[]\n for c in l.children():\n if isinstance(c, no_wd_types): l2 += list(trainable_params(c))\n elif isinstance(c, bias_types):\n bias = c.bias if hasattr(c, 'bias') else None\n l1 += [p for p in trainable_params(c) if not (p is bias)]\n if bias is not None: l2.append(bias)\n else: l1 += list(trainable_params(c))\n #Since we scan the children separately, we might get duplicates (tied 
weights). We need to preserve the order\n #for the optimizer load of state_dict\n l1,l2 = uniqueify(l1),uniqueify(l2)\n split_params += [l1, l2]\n return split_params"} {"instruction": "How would you implement a function in Python 3 that\nsets bn layers in eval mode for all recursive children of m.", "response": "def set_bn_eval(m:nn.Module)->None:\n \"Set bn layers in eval mode for all recursive children of `m`.\"\n for l in m.children():\n if isinstance(l, bn_types) and not next(l.parameters()).requires_grad:\n l.eval()\n set_bn_eval(l)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef init_default(m:nn.Module, func:LayerFunc=nn.init.kaiming_normal_)->None:\n \"Initialize `m` weights with `func` and set `bias` to 0.\"\n if func:\n if hasattr(m, 'weight'): func(m.weight)\n if hasattr(m, 'bias') and hasattr(m.bias, 'data'): m.bias.data.fill_(0.)\n return m", "response": "Initialize m weights with func and set bias to 0."} {"instruction": "Given the following Python 3 function, write the documentation\ndef cond_init(m:nn.Module, init_func:LayerFunc):\n \"Initialize the non-batchnorm layers of `m` with `init_func`.\"\n if (not isinstance(m, bn_types)) and requires_grad(m): init_default(m, init_func)", "response": "Initialize the non - batchnorm layers of m with init_func.\n "} {"instruction": "Can you tell what is the following Python 3 function doing\ndef apply_init(m, init_func:LayerFunc):\n \"Initialize all non-batchnorm layers of `m` with `init_func`.\"\n apply_leaf(m, partial(cond_init, init_func=init_func))", "response": "Initialize all non - batchnorm layers of m with init_func."} {"instruction": "Can you write a function in Python 3 where it\nreturns the shape of the first weight layer in m.", "response": "def in_channels(m:nn.Module) -> List[int]:\n \"Return the shape of the first weight layer in `m`.\"\n for l in flatten_model(m):\n if hasattr(l, 'weight'): return l.weight.shape[1]\n raise Exception('No weight layer')"} 
{"instruction": "Given the following Python 3 function, write the documentation\ndef model_type(dtype):\n \"Return the torch type corresponding to `dtype`.\"\n return (torch.float32 if np.issubdtype(dtype, np.floating) else\n torch.int64 if np.issubdtype(dtype, np.integer)\n else None)", "response": "Return the torch type corresponding to dtype."} {"instruction": "Make a summary of the following Python 3 code\ndef np2model_tensor(a):\n \"Tranform numpy array `a` to a tensor of the same type.\"\n dtype = model_type(a.dtype)\n res = as_tensor(a)\n if not dtype: return res\n return res.type(dtype)", "response": "Tranform numpy array a to a tensor of the same type."} {"instruction": "Implement a function in Python 3 to\ncompute PCA of x with k dimensions.", "response": "def _pca(x, k=2):\n \"Compute PCA of `x` with `k` dimensions.\"\n x = x-torch.mean(x,0)\n U,S,V = torch.svd(x.t())\n return torch.mm(x,U[:,:k])"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef grab_idx(x,i,batch_first:bool=True):\n \"Grab the `i`-th batch in `x`, `batch_first` stating the batch dimension.\"\n if batch_first: return ([o[i].cpu() for o in x] if is_listy(x) else x[i].cpu())\n else: return ([o[:,i].cpu() for o in x] if is_listy(x) else x[:,i].cpu())", "response": "Grab the i - th batch in x batch_first stating the batch dimension."} {"instruction": "Write a Python 3 function that can\ndraw 1 or shape = size random floats from uniform dist : min = low max = high.", "response": "def uniform(low:Number, high:Number=None, size:Optional[List[int]]=None)->FloatOrTensor:\n \"Draw 1 or shape=`size` random floats from uniform dist: min=`low`, max=`high`.\"\n if high is None: high=low\n return random.uniform(low,high) if size is None else torch.FloatTensor(*listify(size)).uniform_(low,high)"} {"instruction": "Create a Python 3 function for\ndrawing 1 or shape = size random floats from uniform dist.", "response": "def log_uniform(low, high, 
size:Optional[List[int]]=None)->FloatOrTensor:\n \"Draw 1 or shape=`size` random floats from uniform dist: min=log(`low`), max=log(`high`).\"\n res = uniform(log(low), log(high), size)\n return exp(res) if size is None else res.exp_()"} {"instruction": "Write a Python 3 script to\ndraw 1 or shape = size random booleans occuring with probability p.", "response": "def rand_bool(p:float, size:Optional[List[int]]=None)->BoolOrTensor:\n \"Draw 1 or shape=`size` random booleans (`True` occuring with probability `p`).\"\n return uniform(0,1,size)IntOrTensor:\n \"Generate int or tensor `size` of ints between `low` and `high` (included).\"\n return random.randint(low,high) if size is None else torch.randint(low,high+1,size)"} {"instruction": "How would you explain what the following Python 3 function does\ndef try_int(o:Any)->Any:\n \"Try to convert `o` to int, default to `o` if not possible.\"\n # NB: single-item rank-1 array/tensor can be converted to int, but we don't want to do this\n if isinstance(o, (np.ndarray,Tensor)): return o if o.ndim else int(o)\n if isinstance(o, collections.Sized) or getattr(o,'__array_interface__',False): return o\n try: return int(o)\n except: return o", "response": "Try to convert o to int default to o if not possible."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef get_model(model:nn.Module):\n \"Return the model maybe wrapped inside `model`.\"\n return model.module if isinstance(model, (DistributedDataParallel, nn.DataParallel)) else model", "response": "Return the model maybe wrapped inside model."} {"instruction": "Can you write a function in Python 3 where it\nchecks that out and targ have the same number of elements and flatten them.", "response": "def flatten_check(out:Tensor, targ:Tensor) -> Tensor:\n \"Check that `out` and `targ` have the same number of elements and flatten them.\"\n out,targ = out.contiguous().view(-1),targ.contiguous().view(-1)\n assert len(out) == len(targ), 
f\"Expected output and target to have the same number of elements but got {len(out)} and {len(targ)}.\"\n return out,targ"} {"instruction": "Write a Python 3 script to\ncreate new OrderedDict that does not contain module.", "response": "def remove_module_load(state_dict):\n \"\"\"create new OrderedDict that does not contain `module.`\"\"\"\n new_state_dict = OrderedDict()\n for k, v in state_dict.items(): new_state_dict[k[7:]] = v\n return new_state_dict"} {"instruction": "Explain what the following Python 3 code does\ndef add_metrics(last_metrics:Collection[Rank0Tensor], mets:Union[Rank0Tensor, Collection[Rank0Tensor]]):\n \"Return a dictionary for updating `last_metrics` with `mets`.\"\n last_metrics,mets = listify(last_metrics),listify(mets)\n return {'last_metrics': last_metrics + mets}", "response": "Return a dictionary for updating last_metrics with mets."} {"instruction": "Given the following Python 3 function, write the documentation\ndef map(self, fn, *iterables, timeout=None, chunksize=1, prefetch=None):\n if timeout is not None: end_time = timeout + time.time()\n if prefetch is None: prefetch = self._max_workers\n if prefetch < 0: raise ValueError(\"prefetch count may not be negative\")\n argsiter = zip(*iterables)\n fs = collections.deque(self.submit(fn, *args) for args in itertools.islice(argsiter, self._max_workers+prefetch))\n # Yield must be hidden in closure so that the futures are submitted before the first iterator value is required.\n def result_iterator():\n nonlocal argsiter\n try:\n while fs:\n res = fs[0].result() if timeout is None else fs[0].result(end_time-time.time())\n # Got a result, future needn't be cancelled\n del fs[0]\n # Dispatch next task before yielding to keep pipeline full\n if argsiter:\n try:\n args = next(argsiter)\n except StopIteration:\n argsiter = None\n else:\n fs.append(self.submit(fn, *args))\n yield res\n finally:\n for future in fs: future.cancel()\n return result_iterator()", "response": "Map a function over a 
set of items in the pipeline."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef gen_ascii_docs(src='fastai'):\n os.chdir(Path(__file__).absolute().parent)\n with working_directory('..'):\n path = Path(src)\n if path.is_dir():\n file_paths = list(path.glob('**/*.py'))\n else:\n file_paths = [path]\n\n pat = re.compile('^(?!__init__).*.py\\Z')\n for file_path in file_paths:\n if pat.match(file_path.name):\n file_path.parent.mkdir(parents=True, exist_ok=True)\n with working_directory('..'):\n tmpl_str = parse_module(file_path)\n\n (file_path.parent/(file_path.name.rsplit('.',1)[0] + '.adoc.tmpl')).write_text(tmpl_str)\n (file_path.parent/(file_path.name.rsplit('.',1)[0] + '.adoc')).write_text(re.sub(r\"{{(.*?)}}\", parse_tmpl, tmpl_str, flags=re.DOTALL))\n if path.is_dir():\n subprocess.call(['asciidoctor', str(path) + '/**/*.adoc'])\n else:\n subprocess.call(['asciidoctor', str(path).rsplit('.',1)[0] + '.adoc'])", "response": "Generate HTML documentation for fastai library in HTML"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _get_new_batch(self, ds_type:DatasetType)->Collection[Tensor]:\n \"Retrieves new batch of DatasetType, and detaches it.\"\n return self.learn.data.one_batch(ds_type=ds_type, detach=True, denorm=False, cpu=False)", "response": "Retrieves new batch of DatasetType and detaches it."} {"instruction": "Write a Python 3 function that can\nwrite gradient statistics to Tensorboard.", "response": "def _write_model_stats(self, iteration:int)->None:\n \"Writes gradient statistics to Tensorboard.\"\n self.stats_writer.write(model=self.learn.model, iteration=iteration, tbwriter=self.tbwriter)"} {"instruction": "Can you create a Python 3 function that\nwrites training loss to Tensorboard.", "response": "def _write_training_loss(self, iteration:int, last_loss:Tensor)->None:\n \"Writes training loss to Tensorboard.\"\n scalar_value = to_np(last_loss)\n tag = self.metrics_root + 
'train_loss'\n self.tbwriter.add_scalar(tag=tag, scalar_value=scalar_value, global_step=iteration)"} {"instruction": "Write a Python 3 script to\nwrite model weight histograms to Tensorboard.", "response": "def _write_weight_histograms(self, iteration:int)->None:\n \"Writes model weight histograms to Tensorboard.\"\n self.hist_writer.write(model=self.learn.model, iteration=iteration, tbwriter=self.tbwriter)"} {"instruction": "Create a Python 3 function for\nwriting single scalar value to Tensorboard.", "response": "def _write_scalar(self, name:str, scalar_value, iteration:int)->None:\n \"Writes single scalar value to Tensorboard.\"\n tag = self.metrics_root + name\n self.tbwriter.add_scalar(tag=tag, scalar_value=scalar_value, global_step=iteration)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _write_metrics(self, iteration:int, last_metrics:MetricsList, start_idx:int=2)->None:\n \"Writes training metrics to Tensorboard.\"\n recorder = self.learn.recorder\n for i, name in enumerate(recorder.names[start_idx:]):\n if last_metrics is None or len(last_metrics) < i+1: return\n scalar_value = last_metrics[i]\n self._write_scalar(name=name, scalar_value=scalar_value, iteration=iteration)", "response": "Writes training metrics to Tensorboard."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef on_batch_end(self, last_loss:Tensor, iteration:int, **kwargs)->None:\n \"Callback function that writes batch end appropriate data to Tensorboard.\"\n if iteration == 0: return\n self._update_batches_if_needed()\n if iteration % self.loss_iters == 0: self._write_training_loss(iteration=iteration, last_loss=last_loss)\n if iteration % self.hist_iters == 0: self._write_weight_histograms(iteration=iteration)", "response": "Callback function that writes batch end appropriate data to Tensorboard."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef on_backward_end(self, 
iteration:int, **kwargs)->None:\n \"Callback function that writes backward end appropriate data to Tensorboard.\"\n if iteration == 0: return\n self._update_batches_if_needed()\n if iteration % self.stats_iters == 0: self._write_model_stats(iteration=iteration)", "response": "Callback function that writes backward end appropriate data to Tensorboard."} {"instruction": "Implement a Python 3 function for\nwriting model weight histograms to Tensorboard.", "response": "def _write_weight_histograms(self, iteration:int)->None:\n \"Writes model weight histograms to Tensorboard.\"\n generator, critic = self.learn.gan_trainer.generator, self.learn.gan_trainer.critic\n self.hist_writer.write(model=generator, iteration=iteration, tbwriter=self.tbwriter, name='generator')\n self.hist_writer.write(model=critic, iteration=iteration, tbwriter=self.tbwriter, name='critic')"} {"instruction": "Create a Python 3 function to\nwrite gradient statistics for generator to Tensorboard.", "response": "def _write_gen_model_stats(self, iteration:int)->None:\n \"Writes gradient statistics for generator to Tensorboard.\"\n generator = self.learn.gan_trainer.generator\n self.stats_writer.write(model=generator, iteration=iteration, tbwriter=self.tbwriter, name='gen_model_stats')\n self.gen_stats_updated = True"} {"instruction": "Can you implement a function in Python 3 that\nwrites gradient statistics for critic to Tensorboard.", "response": "def _write_critic_model_stats(self, iteration:int)->None:\n \"Writes gradient statistics for critic to Tensorboard.\"\n critic = self.learn.gan_trainer.critic\n self.stats_writer.write(model=critic, iteration=iteration, tbwriter=self.tbwriter, name='crit_model_stats')\n self.crit_stats_updated = True"} {"instruction": "Can you create a Python 3 function that\nwrites gradient statistics to Tensorboard.", "response": "def _write_model_stats(self, iteration:int)->None:\n \"Writes gradient statistics to Tensorboard.\"\n # We don't want to write stats when model 
is not iterated on and hence has zeroed out gradients\n gen_mode = self.learn.gan_trainer.gen_mode\n if gen_mode and not self.gen_stats_updated: self._write_gen_model_stats(iteration=iteration)\n if not gen_mode and not self.crit_stats_updated: self._write_critic_model_stats(iteration=iteration)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _write_training_loss(self, iteration:int, last_loss:Tensor)->None:\n \"Writes training loss to Tensorboard.\"\n recorder = self.learn.gan_trainer.recorder\n if len(recorder.losses) == 0: return\n scalar_value = to_np((recorder.losses[-1:])[0])\n tag = self.metrics_root + 'train_loss'\n self.tbwriter.add_scalar(tag=tag, scalar_value=scalar_value, global_step=iteration)", "response": "Writes training loss to Tensorboard."} {"instruction": "Make a summary of the following Python 3 code\ndef _write_images(self, iteration:int)->None:\n \"Writes model generated, original and real images to Tensorboard.\"\n trainer = self.learn.gan_trainer\n #TODO: Switching gen_mode temporarily seems a bit hacky here. Certainly not a good side-effect. 
Is there a better way?\n gen_mode = trainer.gen_mode\n try:\n trainer.switch(gen_mode=True)\n self.img_gen_vis.write(learn=self.learn, trn_batch=self.trn_batch, val_batch=self.val_batch, \n iteration=iteration, tbwriter=self.tbwriter)\n finally: trainer.switch(gen_mode=gen_mode)", "response": "Writes model generated original and real images to Tensorboard."} {"instruction": "Make a summary of the following Python 3 code\ndef on_batch_end(self, iteration:int, **kwargs)->None:\n \"Callback function that writes batch end appropriate data to Tensorboard.\"\n super().on_batch_end(iteration=iteration, **kwargs)\n if iteration == 0: return\n if iteration % self.visual_iters == 0: self._write_images(iteration=iteration)", "response": "Callback function that writes batch end appropriate data to Tensorboard."} {"instruction": "Implement a Python 3 function for\nwriting model generated original and real images to Tensorboard", "response": "def _write_images(self, iteration:int)->None:\n \"Writes model generated, original and real images to Tensorboard\"\n self.img_gen_vis.write(learn=self.learn, trn_batch=self.trn_batch, val_batch=self.val_batch, iteration=iteration, \n tbwriter=self.tbwriter)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef request_write(self, request: TBWriteRequest)->None:\n \"Queues up an asynchronous write request to Tensorboard.\"\n if self.stop_request.isSet(): return\n self.queue.put(request)", "response": "Queues up an asynchronous write request to Tensorboard."} {"instruction": "Create a Python 3 function to\nprocess queued up write requests asynchronously to Tensorboard.", "response": "def _queue_processor(self)->None:\n \"Processes queued up write requests asynchronously to Tensorboard.\"\n while not self.stop_request.isSet():\n while not self.queue.empty():\n if self.stop_request.isSet(): return\n request = self.queue.get()\n request.write()\n sleep(0.2)"} {"instruction": "Given the following Python 3 
function, write the documentation\ndef get_list_from_model(learn:Learner, ds_type:DatasetType, batch:Tuple)->[]:\n \"Factory method to convert a batch of model images to a list of ModelImageSet.\"\n image_sets = []\n x,y = batch[0],batch[1]\n preds = learn.pred_batch(ds_type=ds_type, batch=(x,y), reconstruct=True) \n for orig_px, real_px, gen in zip(x,y,preds):\n orig, real = Image(px=orig_px), Image(px=real_px)\n image_set = ModelImageSet(orig=orig, real=real, gen=gen)\n image_sets.append(image_set)\n return image_sets", "response": "Factory method to convert a batch of model images to a list of ModelImageSet."} {"instruction": "Write a Python 3 script to\nwrite single model histogram to Tensorboard.", "response": "def _write_histogram(self, param_name:str, values)->None:\n \"Writes single model histogram to Tensorboard.\"\n tag = self.name + '/weights/' + param_name\n self.tbwriter.add_histogram(tag=tag, values=values, global_step=self.iteration)"} {"instruction": "Write a Python 3 function for\nwriting model histograms to Tensorboard.", "response": "def write(self)->None:\n \"Writes model histograms to Tensorboard.\"\n for param_name, values in self.params: self._write_histogram(param_name=param_name, values=values)"} {"instruction": "Explain what the following Python 3 code does\ndef write(self, model:nn.Module, iteration:int, tbwriter:SummaryWriter, name:str='model')->None:\n \"Writes model histograms to Tensorboard.\"\n request = HistogramTBRequest(model=model, iteration=iteration, tbwriter=tbwriter, name=name)\n asyncTBWriter.request_write(request)", "response": "Writes model histograms to Tensorboard."} {"instruction": "Write a Python 3 function that can\nwrite a single scalar value for a gradient statistic to Tensorboard.", "response": "def _add_gradient_scalar(self, name:str, scalar_value)->None:\n \"Writes a single scalar value for a gradient statistic to Tensorboard.\"\n tag = self.name + '/gradients/' + name\n self.tbwriter.add_scalar(tag=tag, 
scalar_value=scalar_value, global_step=self.iteration)"} {"instruction": "Write a Python 3 script for\nwriting the average norm of the gradients to Tensorboard.", "response": "def _write_avg_norm(self, norms:[])->None:\n \"Writes the average norm of the gradients to Tensorboard.\"\n avg_norm = sum(norms)/len(self.gradients)\n self._add_gradient_scalar('avg_norm', scalar_value=avg_norm)"} {"instruction": "Explain what the following Python 3 code does\ndef _write_median_norm(self, norms:[])->None:\n \"Writes the median norm of the gradients to Tensorboard.\"\n median_norm = statistics.median(norms)\n self._add_gradient_scalar('median_norm', scalar_value=median_norm)", "response": "Writes the median norm of the gradients to Tensorboard."} {"instruction": "Write a Python 3 script to\nwrite the maximum norm of the gradients to Tensorboard.", "response": "def _write_max_norm(self, norms:[])->None:\n \"Writes the maximum norm of the gradients to Tensorboard.\"\n max_norm = max(norms)\n self._add_gradient_scalar('max_norm', scalar_value=max_norm)"} {"instruction": "Write a Python 3 function that can\nwrite the minimum norm of the gradients to Tensorboard.", "response": "def _write_min_norm(self, norms:[])->None:\n \"Writes the minimum norm of the gradients to Tensorboard.\"\n min_norm = min(norms)\n self._add_gradient_scalar('min_norm', scalar_value=min_norm)"} {"instruction": "Write a Python 3 script to\nwrite the number of zeroes in the gradients to Tensorboard.", "response": "def _write_num_zeros(self)->None:\n \"Writes the number of zeroes in the gradients to Tensorboard.\"\n gradient_nps = [to_np(x.data) for x in self.gradients]\n num_zeros = sum((np.asarray(x) == 0.0).sum() for x in gradient_nps)\n self._add_gradient_scalar('num_zeros', scalar_value=num_zeros)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _write_avg_gradient(self)->None:\n \"Writes the average of the gradients to Tensorboard.\"\n avg_gradient = 
sum(x.data.mean() for x in self.gradients)/len(self.gradients)\n self._add_gradient_scalar('avg_gradient', scalar_value=avg_gradient)", "response": "Writes the average of the gradients to Tensorboard."} {"instruction": "Implement a Python 3 function for\nwriting the median of the gradients to Tensorboard.", "response": "def _write_median_gradient(self)->None:\n \"Writes the median of the gradients to Tensorboard.\"\n median_gradient = statistics.median(x.data.median() for x in self.gradients)\n self._add_gradient_scalar('median_gradient', scalar_value=median_gradient)"} {"instruction": "Create a Python 3 function for\nwriting the maximum of the gradients to Tensorboard.", "response": "def _write_max_gradient(self)->None:\n \"Writes the maximum of the gradients to Tensorboard.\"\n max_gradient = max(x.data.max() for x in self.gradients)\n self._add_gradient_scalar('max_gradient', scalar_value=max_gradient)"} {"instruction": "How would you explain what the following Python 3 function does\ndef _write_min_gradient(self)->None:\n \"Writes the minimum of the gradients to Tensorboard.\"\n min_gradient = min(x.data.min() for x in self.gradients)\n self._add_gradient_scalar('min_gradient', scalar_value=min_gradient)", "response": "Writes the minimum of the gradients to Tensorboard."} {"instruction": "Make a summary of the following Python 3 code\ndef write(self)->None:\n \"Writes model gradient statistics to Tensorboard.\"\n if len(self.gradients) == 0: return\n norms = [x.data.norm() for x in self.gradients]\n self._write_avg_norm(norms=norms)\n self._write_median_norm(norms=norms)\n self._write_max_norm(norms=norms)\n self._write_min_norm(norms=norms)\n self._write_num_zeros()\n self._write_avg_gradient()\n self._write_median_gradient()\n self._write_max_gradient()\n self._write_min_gradient()", "response": "Writes model gradient statistics to Tensorboard."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef _write_images(self, name:str, 
images:[Tensor])->None:\n \"Writes list of images as tensors to Tensorboard.\"\n tag = self.ds_type.name + ' ' + name\n self.tbwriter.add_image(tag=tag, img_tensor=vutils.make_grid(images, normalize=True), global_step=self.iteration)", "response": "Writes list of images as tensors to Tensorboard."} {"instruction": "Make a summary of the following Python 3 code\ndef _get_image_tensors(self)->([Tensor], [Tensor], [Tensor]):\n \"Gets list of image tensors from lists of Image objects, as a tuple of original, generated and real(target) images.\"\n orig_images, gen_images, real_images = [], [], []\n for image_set in self.image_sets:\n orig_images.append(image_set.orig.px)\n gen_images.append(image_set.gen.px)\n real_images.append(image_set.real.px) \n return orig_images, gen_images, real_images", "response": "Gets list of image tensors from lists of Image objects as a tuple of original generated and real ( target ) images."} {"instruction": "Write a Python 3 script for\nwriting original generated and real ( target ) images to Tensorboard.", "response": "def write(self)->None:\n \"Writes original, generated and real(target) images to Tensorboard.\"\n orig_images, gen_images, real_images = self._get_image_tensors()\n self._write_images(name='orig images', images=orig_images)\n self._write_images(name='gen images', images=gen_images)\n self._write_images(name='real images', images=real_images)"} {"instruction": "How would you implement a function in Python 3 that\nwrites training and validation batch images to Tensorboard.", "response": "def write(self, learn:Learner, trn_batch:Tuple, val_batch:Tuple, iteration:int, tbwriter:SummaryWriter)->None:\n \"Writes training and validation batch images to Tensorboard.\"\n self._write_for_dstype(learn=learn, batch=val_batch, iteration=iteration, tbwriter=tbwriter, ds_type=DatasetType.Valid)\n self._write_for_dstype(learn=learn, batch=trn_batch, iteration=iteration, tbwriter=tbwriter, ds_type=DatasetType.Train)"} {"instruction": 
"Implement a function in Python 3 to\nwrite batch images of specified DatasetType to Tensorboard.", "response": "def _write_for_dstype(self, learn:Learner, batch:Tuple, iteration:int, tbwriter:SummaryWriter, ds_type:DatasetType)->None:\n \"Writes batch images of specified DatasetType to Tensorboard.\"\n request = ImageTBRequest(learn=learn, batch=batch, iteration=iteration, tbwriter=tbwriter, ds_type=ds_type)\n asyncTBWriter.request_write(request)"} {"instruction": "How would you implement a function in Python 3 that\nwrites single model graph to Tensorboard.", "response": "def write(self)->None:\n \"Writes single model graph to Tensorboard.\"\n self.tbwriter.add_graph(model=self.model, input_to_model=self.input_to_model)"} {"instruction": "Write a Python 3 function for\nwriting model graph to Tensorboard.", "response": "def write(self, model:nn.Module, tbwriter:SummaryWriter, input_to_model:torch.Tensor)->None:\n \"Writes model graph to Tensorboard.\"\n request = GraphTBRequest(model=model, tbwriter=tbwriter, input_to_model=input_to_model)\n asyncTBWriter.request_write(request)"} {"instruction": "Write a Python 3 script to\nperform batch norm inference on the given training data.", "response": "def fix_batchnorm(swa_model, train_dl):\n \"\"\"\n During training, batch norm layers keep track of a running mean and\n variance of the previous layer's activations. Because the parameters\n of the SWA model are computed as the average of other models' parameters,\n the SWA model never sees the training data itself, and therefore has no\n opportunity to compute the correct batch norm statistics. 
Before performing \n inference with the SWA model, we perform a single pass over the training data\n to calculate an accurate running mean and variance for each batch norm layer.\n \"\"\"\n bn_modules = []\n swa_model.apply(lambda module: collect_bn_modules(module, bn_modules))\n \n if not bn_modules: return\n\n swa_model.train()\n\n for module in bn_modules:\n module.running_mean = torch.zeros_like(module.running_mean)\n module.running_var = torch.ones_like(module.running_var)\n \n momenta = [m.momentum for m in bn_modules]\n\n inputs_seen = 0\n\n for (*x,y) in iter(train_dl): \n xs = V(x)\n batch_size = xs[0].size(0)\n\n momentum = batch_size / (inputs_seen + batch_size)\n for module in bn_modules:\n module.momentum = momentum\n \n res = swa_model(*xs) \n \n inputs_seen += batch_size\n \n for module, momentum in zip(bn_modules, momenta):\n module.momentum = momentum"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef repackage_var(h):\n if IS_TORCH_04: return h.detach() if type(h) == torch.Tensor else tuple(repackage_var(v) for v in h)\n else: return Variable(h.data) if type(h) == Variable else tuple(repackage_var(v) for v in h)", "response": "Wraps h in new Variables to detach them from their history."} {"instruction": "How would you code a function in Python 3 to\nreturn a SequentialRNN model for the given language.", "response": "def get_language_model(n_tok, emb_sz, n_hid, n_layers, pad_token,\n dropout=0.4, dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5, tie_weights=True, qrnn=False, bias=False):\n \"\"\"Returns a SequentialRNN model.\n\n A RNN_Encoder layer is instantiated using the parameters provided.\n\n This is followed by the creation of a LinearDecoder layer.\n\n Also by default (i.e. 
tie_weights = True), the embedding matrix used in the RNN_Encoder\n is used to instantiate the weights for the LinearDecoder layer.\n\n The SequentialRNN layer is the native torch's Sequential wrapper that puts the RNN_Encoder and\n LinearDecoder layers sequentially in the model.\n\n Args:\n n_tok (int): number of unique vocabulary words (or tokens) in the source dataset\n emb_sz (int): the embedding size to use to encode each token\n n_hid (int): number of hidden activation per LSTM layer\n n_layers (int): number of LSTM layers to use in the architecture\n pad_token (int): the int value used for padding text.\n dropouth (float): dropout to apply to the activations going from one LSTM layer to another\n dropouti (float): dropout to apply to the input layer.\n dropoute (float): dropout to apply to the embedding layer.\n wdrop (float): dropout used for a LSTM's internal (or hidden) recurrent weights.\n tie_weights (bool): decide if the weights of the embedding matrix in the RNN encoder should be tied to the\n weights of the LinearDecoder layer.\n qrnn (bool): decide if the model is composed of LSTMS (False) or QRNNs (True).\n bias (bool): decide if the decoder should have a bias layer or not.\n Returns:\n A SequentialRNN model\n \"\"\"\n rnn_enc = RNN_Encoder(n_tok, emb_sz, n_hid=n_hid, n_layers=n_layers, pad_token=pad_token,\n dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop, qrnn=qrnn)\n enc = rnn_enc.encoder if tie_weights else None\n return SequentialRNN(rnn_enc, LinearDecoder(n_tok, emb_sz, dropout, tie_encoder=enc, bias=bias))"} {"instruction": "How would you code a function in Python 3 to\ninvoke during the forward propagation of the RNN_Encoder module. 
Args: input (Tensor): input of shape (sentence length x batch_size) Returns: raw_outputs (tuple(list (Tensor), list(Tensor)): list of tensors evaluated from each RNN layer without using dropouth, list of tensors evaluated from each RNN layer using dropouth,", "response": "def forward(self, input):\n \"\"\" Invoked during the forward propagation of the RNN_Encoder module.\n Args:\n input (Tensor): input of shape (sentence length x batch_size)\n\n Returns:\n raw_outputs (tuple(list (Tensor), list(Tensor)): list of tensors evaluated from each RNN layer without using\n dropouth, list of tensors evaluated from each RNN layer using dropouth,\n \"\"\"\n sl,bs = input.size()\n if bs!=self.bs:\n self.bs=bs\n self.reset()\n with set_grad_enabled(self.training):\n emb = self.encoder_with_dropout(input, dropout=self.dropoute if self.training else 0)\n emb = self.dropouti(emb)\n raw_output = emb\n new_hidden,raw_outputs,outputs = [],[],[]\n for l, (rnn,drop) in enumerate(zip(self.rnns, self.dropouths)):\n current_input = raw_output\n with warnings.catch_warnings():\n warnings.simplefilter(\"ignore\")\n raw_output, new_h = rnn(raw_output, self.hidden[l])\n new_hidden.append(new_h)\n raw_outputs.append(raw_output)\n if l != self.n_layers - 1: raw_output = drop(raw_output)\n outputs.append(raw_output)\n\n self.hidden = repackage_var(new_hidden)\n return raw_outputs, outputs"} {"instruction": "Can you implement a function in Python 3 that\nreplaces repetitions at the character level in t.", "response": "def replace_rep(t:str) -> str:\n \"Replace repetitions at the character level in `t`.\"\n def _replace_rep(m:Collection[str]) -> str:\n c,cc = m.groups()\n return f' {TK_REP} {len(cc)+1} {c} '\n re_rep = re.compile(r'(\\S)(\\1{3,})')\n return re_rep.sub(_replace_rep, t)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef replace_wrep(t:str) -> str:\n \"Replace word repetitions in `t`.\"\n def _replace_wrep(m:Collection[str]) -> str:\n c,cc = 
m.groups()\n return f' {TK_WREP} {len(cc.split())+1} {c} '\n re_wrep = re.compile(r'(\\b\\w+\\W+)(\\1{3,})')\n return re_wrep.sub(_replace_wrep, t)", "response": "Replace word repetitions in t."} {"instruction": "Make a summary of the following Python 3 code\ndef fix_html(x:str) -> str:\n \"List of replacements from html strings in `x`.\"\n re1 = re.compile(r' +')\n x = x.replace('#39;', \"'\").replace('amp;', '&').replace('#146;', \"'\").replace(\n 'nbsp;', ' ').replace('#36;', '$').replace('\\\\n', \"\\n\").replace('quot;', \"'\").replace(\n '
', \"\\n\").replace('\\\\\"', '\"').replace('',UNK).replace(' @.@ ','.').replace(\n ' @-@ ','-').replace(' @,@ ',',').replace('\\\\', ' \\\\ ')\n return re1.sub(' ', html.unescape(x))", "response": "List of replacements from html strings in x."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef replace_all_caps(x:Collection[str]) -> Collection[str]:\n \"Replace tokens in ALL CAPS in `x` by their lower version and add `TK_UP` before.\"\n res = []\n for t in x:\n if t.isupper() and len(t) > 1: res.append(TK_UP); res.append(t.lower())\n else: res.append(t)\n return res", "response": "Replace tokens in ALL CAPS in x by their lower version and add TK_UP before."} {"instruction": "Create a Python 3 function to\nreplace all Capitalized tokens in x by their lower version and add TK_MAJ before.", "response": "def deal_caps(x:Collection[str]) -> Collection[str]:\n \"Replace all Capitalized tokens in `x` by their lower version and add `TK_MAJ` before.\"\n res = []\n for t in x:\n if t == '': continue\n if t[0].isupper() and len(t) > 1 and t[1:].islower(): res.append(TK_MAJ)\n res.append(t.lower())\n return res"} {"instruction": "Can you create a Python 3 function that\nprocesses one text t with tokenizer tok.", "response": "def process_text(self, t:str, tok:BaseTokenizer) -> List[str]:\n \"Process one text `t` with tokenizer `tok`.\"\n for rule in self.pre_rules: t = rule(t)\n toks = tok.tokenizer(t)\n for rule in self.post_rules: toks = rule(toks)\n return toks"} {"instruction": "How would you explain what the following Python 3 function does\ndef _process_all_1(self, texts:Collection[str]) -> List[List[str]]:\n \"Process a list of `texts` in one process.\"\n tok = self.tok_func(self.lang)\n if self.special_cases: tok.add_special_cases(self.special_cases)\n return [self.process_text(str(t), tok) for t in texts]", "response": "Process a list of texts in one process."} {"instruction": "Can you generate a brief explanation for the 
following Python 3 code\ndef process_all(self, texts:Collection[str]) -> List[List[str]]:\n \"Process a list of `texts`.\"\n if self.n_cpus <= 1: return self._process_all_1(texts)\n with ProcessPoolExecutor(self.n_cpus) as e:\n return sum(e.map(self._process_all_1, partition_by_cores(texts, self.n_cpus)), [])", "response": "Process a list of texts."} {"instruction": "How would you explain what the following Python 3 function does\ndef numericalize(self, t:Collection[str]) -> List[int]:\n \"Convert a list of tokens `t` to their ids.\"\n return [self.stoi[w] for w in t]", "response": "Convert a list of tokens t to their ids.\n "} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef textify(self, nums:Collection[int], sep=' ') -> List[str]:\n \"Convert a list of `nums` to their tokens.\"\n return sep.join([self.itos[i] for i in nums]) if sep is not None else [self.itos[i] for i in nums]", "response": "Convert a list of nums to their tokens.\n "} {"instruction": "Make a summary of the following Python 3 code\ndef create(cls, tokens:Tokens, max_vocab:int, min_freq:int) -> 'Vocab':\n \"Create a vocabulary from a set of `tokens`.\"\n freq = Counter(p for o in tokens for p in o)\n itos = [o for o,c in freq.most_common(max_vocab) if c >= min_freq]\n for o in reversed(defaults.text_spec_tok):\n if o in itos: itos.remove(o)\n itos.insert(0, o)\n return cls(itos)", "response": "Create a vocabulary from a set of tokens."} {"instruction": "Can you write a function in Python 3 where it\nloads the Vocab contained in path", "response": "def load(cls, path):\n \"Load the `Vocab` contained in `path`\"\n itos = pickle.load(open(path, 'rb'))\n return cls(itos)"} {"instruction": "Create a Python 3 function for\nplotting loss function as function of iterations.", "response": "def plot_loss(self, n_skip=10, n_skip_end=5):\n '''\n plots loss function as function of iterations. \n When used in Jupyternotebook, plot will be displayed in notebook. 
Else, plot will be displayed in console and both plot and loss are saved in save_path. \n '''\n if not in_ipynb(): plt.switch_backend('agg')\n plt.plot(self.iterations[n_skip:-n_skip_end], self.losses[n_skip:-n_skip_end])\n if not in_ipynb():\n plt.savefig(os.path.join(self.save_path, 'loss_plot.png'))\n np.save(os.path.join(self.save_path, 'losses.npy'), self.losses[10:])"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef plot_lr(self):\n '''Plots learning rate in jupyter notebook or console, depending on the enviroment of the learner.'''\n if not in_ipynb():\n plt.switch_backend('agg')\n if self.record_mom:\n fig, axs = plt.subplots(1,2,figsize=(12,4))\n for i in range(0,2): axs[i].set_xlabel('iterations')\n axs[0].set_ylabel('learning rate')\n axs[1].set_ylabel('momentum')\n axs[0].plot(self.iterations,self.lrs)\n axs[1].plot(self.iterations,self.momentums) \n else:\n plt.xlabel(\"iterations\")\n plt.ylabel(\"learning rate\")\n plt.plot(self.iterations, self.lrs)\n if not in_ipynb():\n plt.savefig(os.path.join(self.save_path, 'lr_plot.png'))", "response": "Plots learning rate in jupyter notebook or console depending on the enviroment of the learner."} {"instruction": "Explain what the following Python 3 code does\ndef plot(self, n_skip=10, n_skip_end=5):\n '''\n Plots the loss function with respect to learning rate, in log scale. 
\n '''\n plt.ylabel(\"validation loss\")\n plt.xlabel(\"learning rate (log scale)\")\n plt.plot(self.lrs[n_skip:-(n_skip_end+1)], self.losses[n_skip:-(n_skip_end+1)])\n plt.xscale('log')", "response": "Plots the loss function with respect to learning rate in log scale."} {"instruction": "How would you explain what the following Python 3 function does\ndef plot_lr(self, show_text=True, show_moms=True):\n phase_limits = [0]\n for nb_batch, phase in zip(self.nb_batches, self.phases):\n phase_limits.append(phase_limits[-1] + nb_batch * phase.epochs)\n if not in_ipynb():\n plt.switch_backend('agg')\n np_plts = 2 if show_moms else 1\n fig, axs = plt.subplots(1,np_plts,figsize=(6*np_plts,4))\n if not show_moms: axs = [axs]\n for i in range(np_plts): axs[i].set_xlabel('iterations')\n axs[0].set_ylabel('learning rate')\n axs[0].plot(self.iterations,self.lrs)\n if show_moms:\n axs[1].set_ylabel('momentum')\n axs[1].plot(self.iterations,self.momentums)\n if show_text: \n for i, phase in enumerate(self.phases):\n text = phase.opt_fn.__name__\n if phase.wds is not None: text+='\\nwds='+str(phase.wds)\n if phase.beta is not None: text+='\\nbeta='+str(phase.beta)\n for k in range(np_plts):\n if i < len(self.phases)-1:\n draw_line(axs[k], phase_limits[i+1])\n draw_text(axs[k], (phase_limits[i]+phase_limits[i+1])/2, text) \n if not in_ipynb():\n plt.savefig(os.path.join(self.save_path, 'lr_plot.png'))", "response": "Plots the learning rate of the current state of the user."} {"instruction": "Can you write a function in Python 3 where it\ndistributes training of Imagenette.", "response": "def main(\n gpu:Param(\"GPU to run on\", str)=None,\n woof: Param(\"Use imagewoof (otherwise imagenette)\", int)=0,\n lr: Param(\"Learning rate\", float)=1e-3,\n size: Param(\"Size (px: 128,192,224)\", int)=128,\n alpha: Param(\"Alpha\", float)=0.99,\n mom: Param(\"Momentum\", float)=0.9,\n eps: Param(\"epsilon\", float)=1e-6,\n epochs: Param(\"Number of epochs\", int)=5,\n bs: Param(\"Batch 
size\", int)=256,\n mixup: Param(\"Mixup\", float)=0.,\n opt: Param(\"Optimizer (adam,rms,sgd)\", str)='adam',\n arch: Param(\"Architecture (xresnet34, xresnet50, presnet34, presnet50)\", str)='xresnet50',\n dump: Param(\"Print model; don't train\", int)=0,\n ):\n \"Distributed training of Imagenette.\"\n\n gpu = setup_distrib(gpu)\n if gpu is None: bs *= torch.cuda.device_count()\n if opt=='adam' : opt_func = partial(optim.Adam, betas=(mom,alpha), eps=eps)\n elif opt=='rms' : opt_func = partial(optim.RMSprop, alpha=alpha, eps=eps)\n elif opt=='sgd' : opt_func = partial(optim.SGD, momentum=mom)\n\n data = get_data(size, woof, bs)\n bs_rat = bs/256\n if gpu is not None: bs_rat *= num_distrib()\n if not gpu: print(f'lr: {lr}; eff_lr: {lr*bs_rat}; size: {size}; alpha: {alpha}; mom: {mom}; eps: {eps}')\n lr *= bs_rat\n\n m = globals()[arch]\n learn = (Learner(data, m(c_out=10), wd=1e-2, opt_func=opt_func,\n metrics=[accuracy,top_k_accuracy],\n bn_wd=False, true_wd=True,\n loss_func = LabelSmoothingCrossEntropy())\n )\n if dump: print(learn.model); exit()\n if mixup: learn = learn.mixup(alpha=mixup)\n learn = learn.to_fp16(dynamic=True)\n if gpu is None: learn.to_parallel()\n elif num_distrib()>1: learn.to_distributed(gpu) # Requires `-m fastai.launch`\n\n learn.fit_one_cycle(epochs, lr, div_factor=10, pct_start=0.3)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef on_batch_end(self, last_loss, epoch, num_batch, **kwargs:Any)->None:\n \"Test if `last_loss` is NaN and interrupts training.\"\n if self.stop: return True #to skip validation after stopping during training\n if torch.isnan(last_loss):\n print (f'Epoch/Batch ({epoch}/{num_batch}): Invalid loss, terminating training.')\n return {'stop_epoch': True, 'stop_training': True, 'skip_validate': True}", "response": "Test if last_loss is NaN and interrupts training."} {"instruction": "How would you implement a function in Python 3 that\ninitializes the best value.", "response": "def 
on_train_begin(self, **kwargs:Any)->None:\n \"Initializes the best value.\"\n self.best = float('inf') if self.operator == np.less else -float('inf')"} {"instruction": "Given the following Python 3 function, write the documentation\ndef get_monitor_value(self):\n \"Pick the monitored value.\"\n if self.monitor=='trn_loss' and len(self.learn.recorder.losses) == 0: return None\n elif len(self.learn.recorder.val_losses) == 0: return None\n values = {'train_loss':self.learn.recorder.losses[-1].cpu().numpy(),\n 'valid_loss':self.learn.recorder.val_losses[-1]}\n if values['valid_loss'] is None: return\n if self.learn.recorder.metrics:\n for m, n in zip(self.learn.recorder.metrics[-1],self.learn.recorder.names[3:-1]):\n values[n] = m\n if values.get(self.monitor) is None:\n warn(f'{self.__class__} conditioned on metric `{self.monitor}` which is not available. Available metrics are: {\", \".join(map(str, self.learn.recorder.names[1:-1]))}')\n return values.get(self.monitor)", "response": "Pick the monitored value."} {"instruction": "Given the following Python 3 function, write the documentation\ndef on_epoch_end(self, epoch:int, **kwargs:Any)->None:\n \"Compare the value monitored to its best score and maybe save the model.\"\n if self.every==\"epoch\": self.learn.save(f'{self.name}_{epoch}')\n else: #every=\"improvement\"\n current = self.get_monitor_value()\n if current is not None and self.operator(current, self.best):\n print(f'Better model found at epoch {epoch} with {self.monitor} value: {current}.')\n self.best = current\n self.learn.save(f'{self.name}')", "response": "Compare the value monitored to its best score and maybe save the model."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef on_train_end(self, **kwargs):\n \"Load the best model.\"\n if self.every==\"improvement\" and (self.learn.path/f'{self.learn.model_dir}/{self.name}.pth').is_file():\n self.learn.load(f'{self.name}', purge=False)", "response": "Load the 
best model."} {"instruction": "Can you write a function in Python 3 where it\ncompares the value monitored to its best and maybe reduce lr.", "response": "def on_epoch_end(self, epoch, **kwargs:Any)->None:\n \"Compare the value monitored to its best and maybe reduce lr.\"\n current = self.get_monitor_value()\n if current is None: return\n if self.operator(current - self.min_delta, self.best): self.best,self.wait = current,0\n else:\n self.wait += 1\n if self.wait > self.patience:\n self.opt.lr *= self.factor\n self.wait = 0\n print(f'Epoch {epoch}: reducing lr to {self.opt.lr}')"} {"instruction": "Write a Python 3 script for\nconverting a notebook fname to html file in dest_path.", "response": "def convert_nb(fname, dest_path='.'):\n \"Convert a notebook `fname` to html file in `dest_path`.\"\n from .gen_notebooks import remove_undoc_cells, remove_code_cell_jupyter_widget_state_elem\n nb = read_nb(fname)\n nb['cells'] = remove_undoc_cells(nb['cells'])\n nb['cells'] = remove_code_cell_jupyter_widget_state_elem(nb['cells'])\n fname = Path(fname).absolute()\n dest_name = fname.with_suffix('.html').name\n meta = nb['metadata']\n meta_jekyll = meta['jekyll'] if 'jekyll' in meta else {'title': fname.with_suffix('').name}\n meta_jekyll['nb_path'] = f'{fname.parent.name}/{fname.name}'\n with open(f'{dest_path}/{dest_name}','w') as f:\n f.write(exporter.from_notebook_node(nb, resources=meta_jekyll)[0])"} {"instruction": "Explain what the following Python 3 code does\ndef convert_all(folder, dest_path='.', force_all=False):\n \"Convert modified notebooks in `folder` to html pages in `dest_path`.\"\n path = Path(folder)\n\n changed_cnt = 0\n for fname in path.glob(\"*.ipynb\"):\n # only rebuild modified files\n fname_out = Path(dest_path)/fname.with_suffix('.html').name\n if not force_all and fname_out.exists():\n in_mod = os.path.getmtime(fname)\n out_mod = os.path.getmtime(fname_out)\n if in_mod < out_mod: continue\n\n print(f\"converting: {fname} => {fname_out}\")\n 
changed_cnt += 1\n convert_nb(fname, dest_path=dest_path)\n if not changed_cnt: print(\"No notebooks were modified\")", "response": "Convert modified notebooks in folder to html pages in dest_path."} {"instruction": "How would you code a function in Python 3 to\nfunction that collect samples and adds padding. Flips token order if needed", "response": "def pad_collate(samples:BatchSamples, pad_idx:int=1, pad_first:bool=True, backwards:bool=False) -> Tuple[LongTensor, LongTensor]:\n \"Function that collect samples and adds padding. Flips token order if needed\"\n samples = to_data(samples)\n max_len = max([len(s[0]) for s in samples])\n res = torch.zeros(len(samples), max_len).long() + pad_idx\n if backwards: pad_first = not pad_first\n for i,s in enumerate(samples):\n if pad_first: res[i,-len(s[0]):] = LongTensor(s[0])\n else: res[i,:len(s[0]):] = LongTensor(s[0])\n if backwards: res = res.flip(1)\n return res, tensor(np.array([s[1] for s in samples]))"} {"instruction": "Write a Python 3 script to\nread the text in fn.", "response": "def open_text(fn:PathOrStr, enc='utf-8'):\n \"Read the text in `fn`.\"\n with open(fn,'r', encoding = enc) as f: return ''.join(f.readlines())"} {"instruction": "Write a Python 3 function for\ncreating the ragged array that will be filled when we ask for items.", "response": "def allocate_buffers(self):\n \"Create the ragged array that will be filled when we ask for items.\"\n if self.ite_len is None: len(self)\n self.idx = LanguageModelPreLoader.CircularIndex(len(self.dataset.x.items), not self.backwards)\n self.batch = np.zeros((self.bs, self.bptt+1), dtype=np.int64)\n self.batch_x, self.batch_y = self.batch[:,0:self.bptt], self.batch[:,1:self.bptt+1]\n #ro: index of the text we're at inside our datasets for the various batches\n self.ro = np.zeros(self.bs, dtype=np.int64)\n #ri: index of the token we're at inside our current text for the various batches\n self.ri = np.zeros(self.bs, dtype=np.int)"} {"instruction": "Can you implement 
a function in Python 3 that\nfills the row with tokens from the ragged array. -- OBS -- overlap!= 1 has not been implemented", "response": "def fill_row(self, forward, items, idx, row, ro, ri, overlap,lengths):\n \"Fill the row with tokens from the ragged array. --OBS-- overlap != 1 has not been implemented\"\n ibuf = n = 0\n ro -= 1\n while ibuf < row.size:\n ro += 1\n ix = idx[ro]\n rag = items[ix]\n if forward:\n ri = 0 if ibuf else ri\n n = min(lengths[ix] - ri, row.size - ibuf)\n row[ibuf:ibuf+n] = rag[ri:ri+n]\n else:\n ri = lengths[ix] if ibuf else ri\n n = min(ri, row.size - ibuf)\n row[ibuf:ibuf+n] = rag[ri-n:ri][::-1]\n ibuf += n\n return ro, ri + ((n-overlap) if forward else -(n-overlap))"} {"instruction": "Explain what the following Python 3 code does\ndef from_ids(cls, path:PathOrStr, vocab:Vocab, train_ids:Collection[Collection[int]], valid_ids:Collection[Collection[int]],\n test_ids:Collection[Collection[int]]=None, train_lbls:Collection[Union[int,float]]=None,\n valid_lbls:Collection[Union[int,float]]=None, classes:Collection[Any]=None,\n processor:PreProcessor=None, **kwargs) -> DataBunch:\n \"Create a `TextDataBunch` from ids, labels and a `vocab`. `kwargs` are passed to the dataloader creation.\"\n src = ItemLists(path, TextList(train_ids, vocab, path=path, processor=[]),\n TextList(valid_ids, vocab, path=path, processor=[]))\n src = src.label_for_lm() if cls==TextLMDataBunch else src.label_from_lists(train_lbls, valid_lbls, classes=classes, processor=[])\n if not is1d(train_lbls): src.train.y.one_hot,src.valid.y.one_hot = True,True\n if test_ids is not None: src.add_test(TextList(test_ids, vocab, path=path), label=train_lbls[0])\n src.valid.x.processor = ifnone(processor, [TokenizeProcessor(), NumericalizeProcessor(vocab=vocab)])\n return src.databunch(**kwargs)", "response": "Create a TextDataBunch from ids labels and a vocab. 
kwargs are passed to the dataloader creation."} {"instruction": "Write a Python 3 script for\nloading a TextDataBunch from path / cache_name. kwargs are passed to the dataloader creation.", "response": "def load(cls, path:PathOrStr, cache_name:PathOrStr='tmp', processor:PreProcessor=None, **kwargs):\n \"Load a `TextDataBunch` from `path/cache_name`. `kwargs` are passed to the dataloader creation.\"\n warn(\"\"\"This method is deprecated and only kept to load data serialized in v1.0.43 or earlier.\n Use `load_data` for data saved with v1.0.44 or later.\"\"\", DeprecationWarning)\n cache_path = Path(path)/cache_name\n vocab = Vocab(pickle.load(open(cache_path/'itos.pkl','rb')))\n train_ids,train_lbls = np.load(cache_path/f'train_ids.npy'), np.load(cache_path/f'train_lbl.npy')\n valid_ids,valid_lbls = np.load(cache_path/f'valid_ids.npy'), np.load(cache_path/f'valid_lbl.npy')\n test_ids = np.load(cache_path/f'test_ids.npy') if os.path.isfile(cache_path/f'test_ids.npy') else None\n classes = loadtxt_str(cache_path/'classes.txt') if os.path.isfile(cache_path/'classes.txt') else None\n return cls.from_ids(path, vocab, train_ids, valid_ids, test_ids, train_lbls, valid_lbls, classes, processor, **kwargs)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef from_tokens(cls, path:PathOrStr, trn_tok:Collection[Collection[str]], trn_lbls:Collection[Union[int,float]],\n val_tok:Collection[Collection[str]], val_lbls:Collection[Union[int,float]], vocab:Vocab=None,\n tst_tok:Collection[Collection[str]]=None, classes:Collection[Any]=None, max_vocab:int=60000, min_freq:int=3,\n **kwargs) -> DataBunch:\n \"Create a `TextDataBunch` from tokens and labels. 
`kwargs` are passed to the dataloader creation.\"\n processor = NumericalizeProcessor(vocab=vocab, max_vocab=max_vocab, min_freq=min_freq)\n src = ItemLists(path, TextList(trn_tok, path=path, processor=processor),\n TextList(val_tok, path=path, processor=processor))\n src = src.label_for_lm() if cls==TextLMDataBunch else src.label_from_lists(trn_lbls, val_lbls, classes=classes)\n if tst_tok is not None: src.add_test(TextList(tst_tok, path=path))\n return src.databunch(**kwargs)", "response": "Create a TextDataBunch from tokens and labels. kwargs are passed to the dataloader creation."} {"instruction": "Write a Python 3 script for\ncreating a TextDataBunch from DataFrames. kwargs are passed to the dataloader creation.", "response": "def from_df(cls, path:PathOrStr, train_df:DataFrame, valid_df:DataFrame, test_df:Optional[DataFrame]=None,\n tokenizer:Tokenizer=None, vocab:Vocab=None, classes:Collection[str]=None, text_cols:IntsOrStrs=1,\n label_cols:IntsOrStrs=0, label_delim:str=None, chunksize:int=10000, max_vocab:int=60000,\n min_freq:int=2, mark_fields:bool=False, include_bos:bool=True, include_eos:bool=False, **kwargs) -> DataBunch:\n \"Create a `TextDataBunch` from DataFrames. 
`kwargs` are passed to the dataloader creation.\"\n processor = _get_processor(tokenizer=tokenizer, vocab=vocab, chunksize=chunksize, max_vocab=max_vocab,\n min_freq=min_freq, mark_fields=mark_fields, \n include_bos=include_bos, include_eos=include_eos)\n if classes is None and is_listy(label_cols) and len(label_cols) > 1: classes = label_cols\n src = ItemLists(path, TextList.from_df(train_df, path, cols=text_cols, processor=processor),\n TextList.from_df(valid_df, path, cols=text_cols, processor=processor))\n if cls==TextLMDataBunch: src = src.label_for_lm()\n else: \n if label_delim is not None: src = src.label_from_df(cols=label_cols, classes=classes, label_delim=label_delim)\n else: src = src.label_from_df(cols=label_cols, classes=classes)\n if test_df is not None: src.add_test(TextList.from_df(test_df, path, cols=text_cols))\n return src.databunch(**kwargs)"} {"instruction": "How would you code a function in Python 3 to\ncreate a TextDataBunch from texts in csv files. kwargs are passed to the dataloader creation.", "response": "def from_csv(cls, path:PathOrStr, csv_name, valid_pct:float=0.2, test:Optional[str]=None,\n tokenizer:Tokenizer=None, vocab:Vocab=None, classes:Collection[str]=None, delimiter:str=None, header='infer',\n text_cols:IntsOrStrs=1, label_cols:IntsOrStrs=0, label_delim:str=None,\n chunksize:int=10000, max_vocab:int=60000, min_freq:int=2, \n mark_fields:bool=False, include_bos:bool=True, include_eos:bool=False, **kwargs) -> DataBunch:\n \"Create a `TextDataBunch` from texts in csv files. 
`kwargs` are passed to the dataloader creation.\"\n df = pd.read_csv(Path(path)/csv_name, header=header, delimiter=delimiter)\n df = df.iloc[np.random.permutation(len(df))]\n cut = int(valid_pct * len(df)) + 1\n train_df, valid_df = df[cut:], df[:cut]\n test_df = None if test is None else pd.read_csv(Path(path)/test, header=header, delimiter=delimiter)\n return cls.from_df(path, train_df, valid_df, test_df, tokenizer=tokenizer, vocab=vocab, classes=classes, text_cols=text_cols,\n label_cols=label_cols, label_delim=label_delim, chunksize=chunksize, max_vocab=max_vocab,\n min_freq=min_freq, mark_fields=mark_fields, \n include_bos=include_bos, include_eos=include_eos, **kwargs)"} {"instruction": "Create a Python 3 function for\ncreating a TextDataBunch from text files in folders.", "response": "def from_folder(cls, path:PathOrStr, train:str='train', valid:str='valid', test:Optional[str]=None,\n classes:Collection[Any]=None, tokenizer:Tokenizer=None, vocab:Vocab=None, chunksize:int=10000, max_vocab:int=60000,\n min_freq:int=2, mark_fields:bool=False, include_bos:bool=True, include_eos:bool=False, **kwargs):\n \"Create a `TextDataBunch` from text files in folders.\"\n path = Path(path).absolute()\n processor = [OpenFileProcessor()] + _get_processor(tokenizer=tokenizer, vocab=vocab, chunksize=chunksize, max_vocab=max_vocab,\n min_freq=min_freq, mark_fields=mark_fields, include_bos=include_bos, include_eos=include_eos)\n src = (TextList.from_folder(path, processor=processor)\n .split_by_folder(train=train, valid=valid))\n src = src.label_for_lm() if cls==TextLMDataBunch else src.label_from_folder(classes=classes)\n if test is not None: src.add_test_folder(path/test)\n return src.databunch(**kwargs)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', no_check:bool=False, bs=64, val_bs:int=None,\n num_workers:int=0, device:torch.device=None, 
collate_fn:Callable=data_collate,\n dl_tfms:Optional[Collection[Callable]]=None, bptt:int=70, backwards:bool=False, **dl_kwargs) -> DataBunch:\n \"Create a `TextDataBunch` in `path` from the `datasets` for language modelling. Passes `**dl_kwargs` on to `DataLoader()`\"\n datasets = cls._init_ds(train_ds, valid_ds, test_ds)\n val_bs = ifnone(val_bs, bs)\n datasets = [LanguageModelPreLoader(ds, shuffle=(i==0), bs=(bs if i==0 else val_bs), bptt=bptt, backwards=backwards)\n for i,ds in enumerate(datasets)]\n val_bs = bs\n dls = [DataLoader(d, b, shuffle=False, **dl_kwargs) for d,b in zip(datasets, (bs,val_bs,val_bs,val_bs)) if d is not None]\n return cls(*dls, path=path, device=device, dl_tfms=dl_tfms, collate_fn=collate_fn, no_check=no_check)", "response": "Create a TextDataBunch in path from the datasets for language modelling. Passes dl_kwargs on to DataLoader ()"} {"instruction": "Write a Python 3 function for\nfunctioning that transform the datasets in a DataBunch for classification. Passes dl_kwargs on to DataLoader ()", "response": "def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs:int=32, val_bs:int=None, pad_idx=1,\n pad_first=True, device:torch.device=None, no_check:bool=False, backwards:bool=False, **dl_kwargs) -> DataBunch:\n \"Function that transform the `datasets` in a `DataBunch` for classification. 
Passes `**dl_kwargs` on to `DataLoader()`\"\n datasets = cls._init_ds(train_ds, valid_ds, test_ds)\n val_bs = ifnone(val_bs, bs)\n collate_fn = partial(pad_collate, pad_idx=pad_idx, pad_first=pad_first, backwards=backwards)\n train_sampler = SortishSampler(datasets[0].x, key=lambda t: len(datasets[0][t][0].data), bs=bs)\n train_dl = DataLoader(datasets[0], batch_size=bs, sampler=train_sampler, drop_last=True, **dl_kwargs)\n dataloaders = [train_dl]\n for ds in datasets[1:]:\n lengths = [len(t) for t in ds.x.items]\n sampler = SortSampler(ds.x, key=lengths.__getitem__)\n dataloaders.append(DataLoader(ds, batch_size=val_bs, sampler=sampler, **dl_kwargs))\n return cls(*dataloaders, path=path, device=device, collate_fn=collate_fn, no_check=no_check)"} {"instruction": "How would you explain what the following Python 3 function does\ndef from_folder(cls, path:PathOrStr='.', extensions:Collection[str]=text_extensions, vocab:Vocab=None,\n processor:PreProcessor=None, **kwargs)->'TextList':\n \"Get the list of files in `path` that have a text suffix. `recurse` determines if we search subfolders.\"\n processor = ifnone(processor, [OpenFileProcessor(), TokenizeProcessor(), NumericalizeProcessor(vocab=vocab)])\n return super().from_folder(path=path, extensions=extensions, processor=processor, **kwargs)", "response": "Get the list of files in path that have a text suffix. recurse determines if we search subfolders."} {"instruction": "Can you implement a function in Python 3 that\nshows the xs ys ( inputs ) and max_len is the maximum number of tokens displayed.", "response": "def show_xys(self, xs, ys, max_len:int=70)->None:\n \"Show the `xs` (inputs) and `ys` (targets). 
`max_len` is the maximum number of tokens displayed.\"\n from IPython.display import display, HTML\n names = ['idx','text'] if self._is_lm else ['text','target']\n items = []\n for i, (x,y) in enumerate(zip(xs,ys)):\n txt_x = ' '.join(x.text.split(' ')[:max_len]) if max_len is not None else x.text\n items.append([i, txt_x] if self._is_lm else [txt_x, y])\n items = np.array(items)\n df = pd.DataFrame({n:items[:,i] for i,n in enumerate(names)}, columns=names)\n with pd.option_context('display.max_colwidth', -1):\n display(HTML(df.to_html(index=False)))"} {"instruction": "Given the following Python 3 function, write the documentation\ndef inceptionv4(pretrained=True):\n model = InceptionV4()\n if pretrained:\n model.load_state_dict(model_zoo.load_url(model_urls['imagenet']))\n return model", "response": "r A function to create an InceptionV4 model architecture from the\n Inception - v4 Inception - ResNet... _ paper."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef predict_array(self, arr):\n precompute = self.precompute\n self.precompute = False\n pred = super().predict_array(arr)\n self.precompute = precompute\n return pred", "response": "This method is used to predict the values from the neural network for the specified object."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef initialize_options(self):\n self.dep_groups = ''\n self.dep_quote = False\n self.dep_conda = False", "response": "Set default values for options."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef get_sfs_idxs(sfs, last=True):\n if last:\n feature_szs = [sfs_feats.features.size()[-1] for sfs_feats in sfs]\n sfs_idxs = list(np.where(np.array(feature_szs[:-1]) != np.array(feature_szs[1:]))[0])\n if feature_szs[0] != feature_szs[1]: sfs_idxs = [0] + sfs_idxs\n else: sfs_idxs = list(range(len(sfs)))\n return sfs_idxs", "response": "Returns the saved feature indexes that will be 
concatenated by hook function"} {"instruction": "Implement a Python 3 function for\nreturning a Hook that stores activations of module in self. stored", "response": "def hook_output (module:nn.Module, detach:bool=True, grad:bool=False)->Hook:\n \"Return a `Hook` that stores activations of `module` in `self.stored`\"\n return Hook(module, _hook_inner, detach=detach, is_forward=not grad)"} {"instruction": "Make a summary of the following Python 3 code\ndef hook_outputs(modules:Collection[nn.Module], detach:bool=True, grad:bool=False)->Hooks:\n \"Return `Hooks` that store activations of all `modules` in `self.stored`\"\n return Hooks(modules, _hook_inner, detach=detach, is_forward=not grad)", "response": "Return Hooks that store activations of all modules in self. stored"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef dummy_batch(m: nn.Module, size:tuple=(64,64))->Tensor:\n \"Create a dummy batch to go through `m` with `size`.\"\n ch_in = in_channels(m)\n return one_param(m).new(1, ch_in, *size).requires_grad_(False).uniform_(-1.,1.)", "response": "Create a dummy batch to go through m with size."} {"instruction": "Given the following Python 3 function, write the documentation\ndef dummy_eval(m:nn.Module, size:tuple=(64,64)):\n \"Pass a `dummy_batch` in evaluation mode in `m` with `size`.\"\n return m.eval()(dummy_batch(m, size))", "response": "Pass a dummy_batch in evaluation mode in m with size."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef model_sizes(m:nn.Module, size:tuple=(64,64))->Tuple[Sizes,Tensor,Hooks]:\n \"Pass a dummy input through the model `m` to get the various sizes of activations.\"\n with hook_outputs(m) as hooks:\n x = dummy_eval(m, size)\n return [o.stored.shape for o in hooks]", "response": "Pass a dummy input through the model m to get the various sizes of activations."} {"instruction": "Write a Python 3 script to\nreturn the number of output features for model.", 
"response": "def num_features_model(m:nn.Module)->int:\n \"Return the number of output features for `model`.\"\n sz = 64\n while True:\n try: return model_sizes(m, size=(sz,sz))[-1][1]\n except Exception as e:\n sz *= 2\n if sz > 2048: raise"} {"instruction": "Given the following Python 3 function, write the documentation\ndef params_size(m: Union[nn.Module,Learner], size: tuple = (3, 64, 64))->Tuple[Sizes, Tensor, Hooks]:\n \"Pass a dummy input through the model to get the various sizes. Returns (res,x,hooks) if `full`\"\n if isinstance(m, Learner):\n if m.data.is_empty:\n raise Exception(\"This is an empty `Learner` and `Learner.summary` requires some data to pass through the model.\")\n ds_type = DatasetType.Train if m.data.train_dl else (DatasetType.Valid if m.data.valid_dl else DatasetType.Test)\n x = m.data.one_batch(ds_type=ds_type, detach=False, denorm=False)[0]\n x = [o[:1] for o in x] if is_listy(x) else x[:1]\n m = m.model\n elif isinstance(m, nn.Module): x = next(m.parameters()).new(1, *size)\n else: raise TypeError('You should either pass in a Learner or nn.Module')\n with hook_outputs(flatten_model(m)) as hook_o:\n with hook_params(flatten_model(m))as hook_p:\n x = m.eval()(*x) if is_listy(x) else m.eval()(x)\n output_size = [((o.stored.shape[1:]) if o.stored is not None else None) for o in hook_o]\n params = [(o.stored if o.stored is not None else (None,None)) for o in hook_p]\n params, trainables = map(list,zip(*params))\n return output_size, params, trainables", "response": "Pass a dummy input through the model to get the various sizes. 
Returns ( res x hooks ) if full"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef model_summary(m:Learner, n:int=70):\n \"Print a summary of `m` using a output text width of `n` chars\"\n info = layers_info(m)\n header = [\"Layer (type)\", \"Output Shape\", \"Param #\", \"Trainable\"]\n res = \"=\" * n + \"\\n\"\n res += f\"{header[0]:<20} {header[1]:<20} {header[2]:<10} {header[3]:<10}\\n\"\n res += \"=\" * n + \"\\n\"\n total_params = 0\n total_trainable_params = 0\n for layer, size, params, trainable in info:\n if size is None: continue\n total_params += int(params)\n total_trainable_params += int(params) * trainable\n size, trainable = str(list(size)), str(trainable)\n res += f\"{layer:<20} {size:<20} {int(params):<10,} {trainable:<10}\\n\"\n res += \"_\" * n + \"\\n\"\n res += f\"\\nTotal params: {total_params:,}\\n\"\n res += f\"Total trainable params: {total_trainable_params:,}\\n\"\n res += f\"Total non-trainable params: {total_params - total_trainable_params:,}\\n\"\n return PrettyString(res)", "response": "Print a summary of m using a output text width of n chars"} {"instruction": "Can you write a function in Python 3 where it\napplies hook_func to module input output.", "response": "def hook_fn(self, module:nn.Module, input:Tensors, output:Tensors):\n \"Applies `hook_func` to `module`, `input`, `output`.\"\n if self.detach:\n input = (o.detach() for o in input ) if is_listy(input ) else input.detach()\n output = (o.detach() for o in output) if is_listy(output) else output.detach()\n self.stored = self.hook_func(module, input, output)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef remove(self):\n \"Remove the hook from the model.\"\n if not self.removed:\n self.hook.remove()\n self.removed=True", "response": "Remove the hook from the model."} {"instruction": "Create a Python 3 function to\nregister the Hooks on self. 
modules.", "response": "def on_train_begin(self, **kwargs):\n \"Register the `Hooks` on `self.modules`.\"\n if not self.modules:\n self.modules = [m for m in flatten_model(self.learn.model)\n if hasattr(m, 'weight')]\n self.hooks = Hooks(self.modules, self.hook)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef hook(self, m:nn.Module, i:Tensors, o:Tensors)->Tuple[Rank0Tensor,Rank0Tensor]:\n \"Take the mean and std of `o`.\"\n return o.mean().item(),o.std().item()", "response": "Take the mean and std of o."} {"instruction": "Write a Python 3 script to\ntake the stored results and puts it in self. stats", "response": "def on_batch_end(self, train, **kwargs):\n \"Take the stored results and puts it in `self.stats`\"\n if train: self.stats.append(self.hooks.stored)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef plots_from_files(imspaths, figsize=(10,5), rows=1, titles=None, maintitle=None):\n f = plt.figure(figsize=figsize)\n if maintitle is not None: plt.suptitle(maintitle, fontsize=16)\n for i in range(len(imspaths)):\n sp = f.add_subplot(rows, ceildiv(len(imspaths), rows), i+1)\n sp.axis('Off')\n if titles is not None: sp.set_title(titles[i], fontsize=16)\n img = plt.imread(imspaths[i])\n plt.imshow(img)", "response": "Plots images given image files."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef plot_val_with_title(self, idxs, y):\n # if there are any samples to be displayed\n if len(idxs) > 0:\n imgs = np.stack([self.ds[x][0] for x in idxs])\n title_probs = [self.probs[x,y] for x in idxs]\n\n return plots(self.ds.denorm(imgs), rows=1, titles=title_probs)\n # if idxs is empty return false\n else:\n return False;", "response": "Displays the images and their probabilities of belonging to a certain class\n and returns a boolean array with the result of the function"} {"instruction": "Given the following Python 3 function, write the 
documentation\ndef most_by_mask(self, mask, y, mult):\n idxs = np.where(mask)[0]\n cnt = min(4, len(idxs))\n return idxs[np.argsort(mult * self.probs[idxs,y])[:cnt]]", "response": "Extracts the first 4 most correct or incorrect indexes from the ordered list of probabilities that are in the specified class."} {"instruction": "How would you code a function in Python 3 to\nextract the first 4 most uncertain samples from the ordered list of probabilities for the class y.", "response": "def most_uncertain_by_mask(self, mask, y):\n \"\"\" Extracts the first 4 most uncertain indexes from the ordered list of probabilities\n\n Arguments:\n mask (numpy.ndarray): the mask of probabilities specific to the selected class; a boolean array with shape (num_of_samples,) which contains True where class==selected_class, and False everywhere else\n y (int): the selected class\n\n Returns:\n idxs (ndarray): An array of indexes of length 4\n \"\"\"\n idxs = np.where(mask)[0]\n # the most uncertain samples will have abs(probs-1/num_classes) close to 0;\n return idxs[np.argsort(np.abs(self.probs[idxs,y]-(1/self.num_classes)))[:4]]"} {"instruction": "How would you code a function in Python 3 to\nreturn the class that is most correct and which correspond to the selected class y.", "response": "def most_by_correct(self, y, is_correct):\n \"\"\" Extracts the predicted classes which correspond to the selected class (y) and to the specific case (prediction is correct - is_true=True, prediction is wrong - is_true=False)\n\n Arguments:\n y (int): the selected class\n is_correct (boolean): a boolean flag (True, False) which specify the what to look for. 
Ex: True - most correct samples, False - most incorrect samples\n\n Returns:\n idxs (numpy.ndarray): An array of indexes (numpy.ndarray)\n \"\"\"\n # mult=-1 when the is_correct flag is true -> when we want to display the most correct classes we will make a descending sorting (argsort) because we want that the biggest probabilities to be displayed first.\n # When is_correct is false, we want to display the most incorrect classes, so we want an ascending sorting since our interest is in the smallest probabilities.\n mult = -1 if is_correct==True else 1\n return self.most_by_mask(((self.preds == self.ds.y)==is_correct)\n & (self.ds.y == y), y, mult)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef plot_by_correct(self, y, is_correct):\n return self.plot_val_with_title(self.most_by_correct(y, is_correct), y)", "response": "Plots the most correct and wrong images for the selected class."} {"instruction": "Given the following Python 3 function, write the documentation\ndef most_by_uncertain(self, y):\n return self.most_uncertain_by_mask((self.ds.y == y), y)", "response": "Returns the predicted classes which correspond to the selected class and have probabilities nearest to 1."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef main(\n gpus:Param(\"The GPUs to use for distributed training\", str)='all',\n script:Param(\"Script to run\", str, opt=False)='',\n args:Param(\"Args to pass to script\", nargs='...', opt=False)=''\n):\n \"PyTorch distributed training launch helper that spawns multiple distributed processes\"\n # Loosely based on torch.distributed.launch\n current_env = os.environ.copy()\n gpus = list(range(torch.cuda.device_count())) if gpus=='all' else list(gpus)\n current_env[\"WORLD_SIZE\"] = str(len(gpus))\n current_env[\"MASTER_ADDR\"] = '127.0.0.1'\n current_env[\"MASTER_PORT\"] = '29500'\n\n processes = []\n for i,gpu in enumerate(gpus):\n current_env[\"RANK\"] = str(i)\n cmd = 
[sys.executable, \"-u\", script, f\"--gpu={gpu}\"] + args\n process = subprocess.Popen(cmd, env=current_env)\n processes.append(process)\n\n for process in processes: process.wait()", "response": "PyTorch distributed training launch helper that spawns multiple distributed processes"} {"instruction": "Create a Python 3 function for\nadding the metrics names to the Recorder.", "response": "def on_train_begin(self, **kwargs):\n \"Add the metrics names to the `Recorder`.\"\n self.names = ifnone(self.learn.loss_func.metric_names, [])\n if not self.names: warn('LossMetrics requested but no loss_func.metric_names provided')\n self.learn.recorder.add_metric_names(self.names)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef on_epoch_begin(self, **kwargs):\n \"Initialize the metrics for this epoch.\"\n self.metrics = {name:0. for name in self.names}\n self.nums = 0", "response": "Initialize the metrics for this epoch."} {"instruction": "Explain what the following Python 3 code does\ndef on_batch_end(self, last_target, train, **kwargs):\n \"Update the metrics if not `train`\"\n if train: return\n bs = last_target.size(0)\n for name in self.names:\n self.metrics[name] += bs * self.learn.loss_func.metrics[name].detach().cpu()\n self.nums += bs", "response": "Update the metrics if not train"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef on_epoch_end(self, last_metrics, **kwargs):\n \"Finish the computation and sends the result to the Recorder.\"\n if not self.nums: return\n metrics = [self.metrics[name]/self.nums for name in self.names]\n return {'last_metrics': last_metrics+metrics}", "response": "Finish the computation and sends the result to the Recorder."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef on_train_begin(self, **kwargs):\n \"Create the various optimizers.\"\n self.G_A,self.G_B = self.learn.model.G_A,self.learn.model.G_B\n self.D_A,self.D_B = 
self.learn.model.D_A,self.learn.model.D_B\n self.crit = self.learn.loss_func.crit\n self.opt_G = self.learn.opt.new([nn.Sequential(*flatten_model(self.G_A), *flatten_model(self.G_B))])\n self.opt_D_A = self.learn.opt.new([nn.Sequential(*flatten_model(self.D_A))])\n self.opt_D_B = self.learn.opt.new([nn.Sequential(*flatten_model(self.D_B))])\n self.learn.opt.opt = self.opt_G.opt\n self._set_trainable()\n self.names = ['idt_loss', 'gen_loss', 'cyc_loss', 'da_loss', 'db_loss']\n self.learn.recorder.no_val=True\n self.learn.recorder.add_metric_names(self.names)\n self.smootheners = {n:SmoothenValue(0.98) for n in self.names}", "response": "Create the various optimizers."} {"instruction": "Make a summary of the following Python 3 code\ndef on_batch_end(self, last_input, last_output, **kwargs):\n \"Steps through the generators then each of the critics.\"\n self.G_A.zero_grad(); self.G_B.zero_grad()\n fake_A, fake_B = last_output[0].detach(), last_output[1].detach()\n real_A, real_B = last_input\n self._set_trainable(D_A=True)\n self.D_A.zero_grad()\n loss_D_A = 0.5 * (self.crit(self.D_A(real_A), True) + self.crit(self.D_A(fake_A), False))\n loss_D_A.backward()\n self.opt_D_A.step()\n self._set_trainable(D_B=True)\n self.D_B.zero_grad()\n loss_D_B = 0.5 * (self.crit(self.D_B(real_B), True) + self.crit(self.D_B(fake_B), False))\n loss_D_B.backward()\n self.opt_D_B.step()\n self._set_trainable()\n metrics = self.learn.loss_func.metrics + [loss_D_A, loss_D_B]\n for n,m in zip(self.names,metrics): self.smootheners[n].add_value(m)", "response": "Steps through the generators then each of the critics."} {"instruction": "Make a summary of the following Python 3 code\ndef on_epoch_end(self, last_metrics, **kwargs):\n \"Put the various losses in the recorder.\"\n return add_metrics(last_metrics, [s.smooth for k,s in self.smootheners.items()])", "response": "Put the various losses in the recorder."} {"instruction": "Can you write a function in Python 3 where it\nprepares file with 
metric names.", "response": "def on_train_begin(self, **kwargs: Any) -> None:\n \"Prepare file with metric names.\"\n self.path.parent.mkdir(parents=True, exist_ok=True) \n self.file = self.path.open('a') if self.append else self.path.open('w')\n self.file.write(','.join(self.learn.recorder.names[:(None if self.add_time else -1)]) + '\\n')"} {"instruction": "Can you create a Python 3 function that\nadds a line with epoch number smooth_loss and last_metrics.", "response": "def on_epoch_end(self, epoch: int, smooth_loss: Tensor, last_metrics: MetricsList, **kwargs: Any) -> bool:\n \"Add a line with `epoch` number, `smooth_loss` and `last_metrics`.\"\n last_metrics = ifnone(last_metrics, [])\n stats = [str(stat) if isinstance(stat, int) else '#na#' if stat is None else f'{stat:.6f}'\n for name, stat in zip(self.learn.recorder.names, [epoch, smooth_loss] + last_metrics)]\n if self.add_time: stats.append(format_time(time() - self.start_epoch))\n str_stats = ','.join(stats)\n self.file.write(str_stats + '\\n')"} {"instruction": "How would you explain what the following Python 3 function does\ndef get_master(layer_groups:ModuleList, flat_master:bool=False) -> Tuple[List[List[Tensor]], List[List[Tensor]]]:\n \"Return two lists, one for the model parameters in FP16 and one for the master parameters in FP32.\"\n split_params = split_no_wd_params(layer_groups)\n model_params = [[param for param in pg if param.requires_grad] for pg in split_params]\n if flat_master:\n master_params = []\n for lg in model_params:\n if len(lg) !=0 :\n mp = parameters_to_vector([param.data.float() for param in lg])\n mp = torch.nn.Parameter(mp, requires_grad=True)\n if mp.grad is None: mp.grad = mp.new(*mp.size())\n master_params.append([mp])\n else: master_params.append([])\n return model_params, master_params\n else:\n master_params = [[param.clone().float().detach() for param in lg] for lg in model_params]\n for mp in master_params:\n for param in mp: param.requires_grad = True\n return 
model_params, master_params", "response": "Return two lists one for the model parameters in FP16 and one for the master parameters in FP32."} {"instruction": "Given the following Python 3 function, write the documentation\ndef model_g2master_g(model_params:Sequence[Tensor], master_params:Sequence[Tensor], flat_master:bool=False)->None:\n \"Copy the `model_params` gradients to `master_params` for the optimizer step.\"\n if flat_master:\n for model_group,master_group in zip(model_params,master_params):\n if len(master_group) != 0:\n if master_group[0].grad is None: master_group[0].grad = master_group[0].data.new(*master_group[0].data.size())\n master_group[0].grad.data.copy_(parameters_to_vector([p.grad.data.float() for p in model_group]))\n else:\n for model_group,master_group in zip(model_params,master_params):\n for model, master in zip(model_group, master_group):\n if model.grad is not None:\n if master.grad is None: master.grad = master.data.new(*master.data.size())\n master.grad.data.copy_(model.grad.data)\n else: master.grad = None", "response": "Copy the model_params gradients to master_params for the optimizer step."} {"instruction": "Can you write a function in Python 3 where it\ncopies master_params to model_params.", "response": "def master2model(model_params:Sequence[Tensor], master_params:Sequence[Tensor], flat_master:bool=False)->None:\n \"Copy `master_params` to `model_params`.\"\n if flat_master:\n for model_group,master_group in zip(model_params,master_params):\n if len(model_group) != 0:\n for model, master in zip(model_group, _unflatten_dense_tensors(master_group[0].data, model_group)):\n model.data.copy_(master)\n else:\n for model_group,master_group in zip(model_params,master_params):\n for model, master in zip(model_group, master_group): model.data.copy_(master.data)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef on_train_begin(self, **kwargs:Any)->None:\n \"Prepare the master model.\"\n #Get a 
copy of the model params in FP32\n self.model_params, self.master_params = get_master(self.learn.layer_groups, self.flat_master)\n #Changes the optimizer so that the optimization step is done in FP32.\n new_opt = self.learn.opt.new_with_params(self.master_params)\n if self.opt is not None:\n self.opt.lr,self.opt.wd = self.learn.opt.lr,self.learn.opt.wd\n new_opt.load_state_dict(self.opt)\n self.learn.opt.opt = new_opt.opt\n self.noskip = 0", "response": "Prepare the master model."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef on_backward_begin(self, last_loss:Rank0Tensor, **kwargs:Any) -> Rank0Tensor:\n \"Scale gradients up by `self.loss_scale` to prevent underflow.\"\n #To avoid gradient underflow, we scale the gradients\n ret_loss = last_loss * self.loss_scale\n return {'last_loss': ret_loss}", "response": "Scale gradients up by self. loss_scale to prevent underflow."} {"instruction": "Make a summary of the following Python 3 code\ndef on_backward_end(self, **kwargs:Any)->None:\n \"Convert the gradients back to FP32 and divide them by the scale.\"\n if self.dynamic and grad_overflow(self.model_params) and self.loss_scale > 1:\n self.loss_scale /= 2\n self.noskip = 0\n #The step will be skipped since we don't update the master grads so they are all None or zero\n else:\n model_g2master_g(self.model_params, self.master_params, self.flat_master)\n for group in self.master_params:\n for param in group:\n if param.grad is not None: param.grad.div_(self.loss_scale)\n if self.clip is not None:\n for group in self.master_params: nn.utils.clip_grad_norm_(group, self.clip)\n if not self.dynamic: return\n self.noskip += 1\n if self.noskip >= self.max_noskip and self.loss_scale < self.max_scale:\n self.loss_scale *= 2\n self.noskip = 0", "response": "Convert the gradients back to FP32 and divide them by the scale."} {"instruction": "How would you code a function in Python 3 to\nupdate the params from master to model and zero grad.", 
"response": "def on_step_end(self, **kwargs:Any)->None:\n \"Update the params from master to model and zero grad.\"\n #Zeros the gradients of the model since the optimizer is disconnected.\n self.learn.model.zero_grad()\n #Update the params from master to model.\n master2model(self.model_params, self.master_params, self.flat_master)"} {"instruction": "Write a Python 3 script for\nscaling the image so that the smallest axis is of size targ.", "response": "def scale_min(im, targ, interpolation=cv2.INTER_AREA):\n \"\"\" Scale the image so that the smallest axis is of size targ.\n\n Arguments:\n im (array): image\n targ (int): target size\n \"\"\"\n r,c,*_ = im.shape\n ratio = targ/min(r,c)\n sz = (scale_to(c, ratio, targ), scale_to(r, ratio, targ))\n return cv2.resize(im, sz, interpolation=interpolation)"} {"instruction": "Explain what the following Python 3 code does\ndef zoom_cv(x,z):\n if z==0: return x\n r,c,*_ = x.shape\n M = cv2.getRotationMatrix2D((c/2,r/2),0,z+1.)\n return cv2.warpAffine(x,M,(c,r))", "response": "Zoom the center of image x by a factor of z + 1 while retaining the original image size and proportion."} {"instruction": "Make a summary of the following Python 3 code\ndef stretch_cv(x,sr,sc,interpolation=cv2.INTER_AREA):\n if sr==0 and sc==0: return x\n r,c,*_ = x.shape\n x = cv2.resize(x, None, fx=sr+1, fy=sc+1, interpolation=interpolation)\n nr,nc,*_ = x.shape\n cr = (nr-r)//2; cc = (nc-c)//2\n return x[cr:r+cr, cc:c+cc]", "response": "Stretch image x vertically by sr + 1 and vertically by sc + 1."} {"instruction": "Make a summary of the following Python 3 code\ndef dihedral(x, dih):\n x = np.rot90(x, dih%4)\n return x if dih<4 else np.fliplr(x)", "response": "Perform any of 8 permutations of 90 - degrees rotations and flips for image x."} {"instruction": "Write a Python 3 script for\nadjusting image balance and contrast", "response": "def lighting(im, b, c):\n \"\"\" Adjust image balance and contrast \"\"\"\n if b==0 and c==1: return im\n mu = 
np.average(im)\n return np.clip((im-mu)*c+mu+b,0.,1.).astype(np.float32)"} {"instruction": "Write a Python 3 script for\nreturning a squared resized image.", "response": "def no_crop(im, min_sz=None, interpolation=cv2.INTER_AREA):\n \"\"\" Return a squared resized image \"\"\"\n r,c,*_ = im.shape\n if min_sz is None: min_sz = min(r,c)\n return cv2.resize(im, (min_sz, min_sz), interpolation=interpolation)"} {"instruction": "Can you implement a function in Python 3 that\nreturns a center crop of an image", "response": "def center_crop(im, min_sz=None):\n \"\"\" Return a center crop of an image \"\"\"\n r,c,*_ = im.shape\n if min_sz is None: min_sz = min(r,c)\n start_r = math.ceil((r-min_sz)/2)\n start_c = math.ceil((c-min_sz)/2)\n return crop(im, start_r, start_c, min_sz)"} {"instruction": "Implement a function in Python 3 to\ncut out n_holes number of square holes of size length in image at random locations.", "response": "def cutout(im, n_holes, length):\n \"\"\" Cut out n_holes number of square holes of size length in image at random locations. Holes may overlap. 
\"\"\"\n r,c,*_ = im.shape\n mask = np.ones((r, c), np.int32)\n for n in range(n_holes):\n y = np.random.randint(0, r)\n x = np.random.randint(0, c)\n\n y1 = int(np.clip(y - length / 2, 0, r))\n y2 = int(np.clip(y + length / 2, 0, r))\n x1 = int(np.clip(x - length / 2, 0, c))\n x2 = int(np.clip(x + length / 2, 0, c))\n mask[y1: y2, x1: x2] = 0.\n \n mask = mask[:,:,None]\n im = im * mask\n return im"} {"instruction": "Implement a function in Python 3 to\ncalculate the dimension of an image during scaling with aspect ratio", "response": "def scale_to(x, ratio, targ): \n '''Calculate dimension of an image during scaling with aspect ratio'''\n return max(math.floor(x*ratio), targ)"} {"instruction": "Implement a function in Python 3 to\ncrop image into a square of size sz", "response": "def crop(im, r, c, sz): \n '''\n crop image into a square of size sz, \n '''\n return im[r:r+sz, c:c+sz]"} {"instruction": "How would you explain what the following Python 3 function does\ndef to_bb(YY, y=\"deprecated\"):\n cols,rows = np.nonzero(YY)\n if len(cols)==0: return np.zeros(4, dtype=np.float32)\n top_row = np.min(rows)\n left_col = np.min(cols)\n bottom_row = np.max(rows)\n right_col = np.max(cols)\n return np.array([left_col, top_row, right_col, bottom_row], dtype=np.float32)", "response": "Convert mask YY to a bounding box"} {"instruction": "How would you explain what the following Python 3 function does\ndef coords2px(y, x):\n rows = np.rint([y[0], y[0], y[2], y[2]]).astype(int)\n cols = np.rint([y[1], y[3], y[1], y[3]]).astype(int)\n r,c,*_ = x.shape\n Y = np.zeros((r, c))\n Y[rows, cols] = 1\n return Y", "response": "Transforming coordinates to pixels."} {"instruction": "How would you explain what the following Python 3 function does\ndef compose(im, y, fns):\n for fn in fns:\n #pdb.set_trace()\n im, y =fn(im, y)\n return im if y is None else (im, y)", "response": "Apply a collection of transformation functions to images"} {"instruction": "Can you generate a brief 
explanation for the following Python 3 code\ndef image_gen(normalizer, denorm, sz, tfms=None, max_zoom=None, pad=0, crop_type=None,\n tfm_y=None, sz_y=None, pad_mode=cv2.BORDER_REFLECT, scale=None):\n \"\"\"\n Generate a standard set of transformations\n\n Arguments\n ---------\n normalizer :\n image normalizing function\n denorm :\n image denormalizing function\n sz :\n size, sz_y = sz if not specified.\n tfms :\n iterable collection of transformation functions\n max_zoom : float,\n maximum zoom\n pad : int,\n padding on top, left, right and bottom\n crop_type :\n crop type\n tfm_y :\n y axis specific transformations\n sz_y :\n y size, height\n pad_mode :\n cv2 padding style: repeat, reflect, etc.\n\n Returns\n -------\n type : ``Transforms``\n transformer for specified image operations.\n\n See Also\n --------\n Transforms: the transformer object returned by this function\n \"\"\"\n if tfm_y is None: tfm_y=TfmType.NO\n if tfms is None: tfms=[]\n elif not isinstance(tfms, collections.Iterable): tfms=[tfms]\n if sz_y is None: sz_y = sz\n if scale is None:\n scale = [RandomScale(sz, max_zoom, tfm_y=tfm_y, sz_y=sz_y) if max_zoom is not None\n else Scale(sz, tfm_y, sz_y=sz_y)]\n elif not is_listy(scale): scale = [scale]\n if pad: scale.append(AddPadding(pad, mode=pad_mode))\n if crop_type!=CropType.GOOGLENET: tfms=scale+tfms\n return Transforms(sz, tfms, normalizer, denorm, crop_type,\n tfm_y=tfm_y, sz_y=sz_y)", "response": "Generate a standard set of transformations for an image."} {"instruction": "How would you implement a function in Python 3 that\ngives the statistics of the training image sets returns separate training and validation transforms.", "response": "def tfms_from_stats(stats, sz, aug_tfms=None, max_zoom=None, pad=0, crop_type=CropType.RANDOM,\n tfm_y=None, sz_y=None, pad_mode=cv2.BORDER_REFLECT, norm_y=True, scale=None):\n \"\"\" Given the statistics of the training image sets, returns separate training and validation transform functions\n \"\"\"\n if 
aug_tfms is None: aug_tfms=[]\n tfm_norm = Normalize(*stats, tfm_y=tfm_y if norm_y else TfmType.NO) if stats is not None else None\n tfm_denorm = Denormalize(*stats) if stats is not None else None\n val_crop = CropType.CENTER if crop_type in (CropType.RANDOM,CropType.GOOGLENET) else crop_type\n val_tfm = image_gen(tfm_norm, tfm_denorm, sz, pad=pad, crop_type=val_crop,\n tfm_y=tfm_y, sz_y=sz_y, scale=scale)\n trn_tfm = image_gen(tfm_norm, tfm_denorm, sz, pad=pad, crop_type=crop_type,\n tfm_y=tfm_y, sz_y=sz_y, tfms=aug_tfms, max_zoom=max_zoom, pad_mode=pad_mode, scale=scale)\n return trn_tfm, val_tfm"} {"instruction": "Implement a Python 3 function for\nreturning a list of transformers that are created according to the image statistics given by the model.", "response": "def tfms_from_model(f_model, sz, aug_tfms=None, max_zoom=None, pad=0, crop_type=CropType.RANDOM,\n tfm_y=None, sz_y=None, pad_mode=cv2.BORDER_REFLECT, norm_y=True, scale=None):\n \"\"\" Returns separate transformers of images for training and validation.\n Transformers are constructed according to the image statistics given by the model. (See tfms_from_stats)\n\n Arguments:\n f_model: model, pretrained or not pretrained\n \"\"\"\n stats = inception_stats if f_model in inception_models else imagenet_stats\n return tfms_from_stats(stats, sz, aug_tfms, max_zoom=max_zoom, pad=pad, crop_type=crop_type,\n tfm_y=tfm_y, sz_y=sz_y, pad_mode=pad_mode, norm_y=norm_y, scale=scale)"} {"instruction": "Implement a Python 3 function for\nreturning list of files in c that are images. check_ext will filter to image_extensions.", "response": "def get_image_files(c:PathOrStr, check_ext:bool=True, recurse=False)->FilePathList:\n \"Return list of files in `c` that are images. 
`check_ext` will filter to `image_extensions`.\"\n return get_files(c, extensions=(image_extensions if check_ext else None), recurse=recurse)"} {"instruction": "Create a Python 3 function for\nopening a COCO style json in fname and returns the lists of filenames with maybe prefix and labelled bboxes.", "response": "def get_annotations(fname, prefix=None):\n \"Open a COCO style json in `fname` and returns the lists of filenames (with maybe `prefix`) and labelled bboxes.\"\n annot_dict = json.load(open(fname))\n id2images, id2bboxes, id2cats = {}, collections.defaultdict(list), collections.defaultdict(list)\n classes = {}\n for o in annot_dict['categories']:\n classes[o['id']] = o['name']\n for o in annot_dict['annotations']:\n bb = o['bbox']\n id2bboxes[o['image_id']].append([bb[1],bb[0], bb[3]+bb[1], bb[2]+bb[0]])\n id2cats[o['image_id']].append(classes[o['category_id']])\n for o in annot_dict['images']:\n if o['id'] in id2bboxes:\n id2images[o['id']] = ifnone(prefix, '') + o['file_name']\n ids = list(id2images.keys())\n return [id2images[k] for k in ids], [[id2bboxes[k], id2cats[k]] for k in ids]"} {"instruction": "Write a Python 3 script to\nfunction that collect samples of labelled bboxes and adds padding with pad_idx.", "response": "def bb_pad_collate(samples:BatchSamples, pad_idx:int=0) -> Tuple[FloatTensor, Tuple[LongTensor, LongTensor]]:\n \"Function that collect `samples` of labelled bboxes and adds padding with `pad_idx`.\"\n if isinstance(samples[0][1], int): return data_collate(samples)\n max_len = max([len(s[1].data[1]) for s in samples])\n bboxes = torch.zeros(len(samples), max_len, 4)\n labels = torch.zeros(len(samples), max_len).long() + pad_idx\n imgs = []\n for i,s in enumerate(samples):\n imgs.append(s[0].data[None])\n bbs, lbls = s[1].data\n if not (bbs.nelement() == 0):\n bboxes[i,-len(lbls):] = bbs\n labels[i,-len(lbls):] = tensor(lbls)\n return torch.cat(imgs,0), (bboxes,labels)"} {"instruction": "Make a summary of the following Python 3 
code\ndef normalize(x:TensorImage, mean:FloatTensor,std:FloatTensor)->TensorImage:\n \"Normalize `x` with `mean` and `std`.\"\n return (x-mean[...,None,None]) / std[...,None,None]", "response": "Normalize x with mean and std."} {"instruction": "How would you explain what the following Python 3 function does\ndef denormalize(x:TensorImage, mean:FloatTensor,std:FloatTensor, do_x:bool=True)->TensorImage:\n \"Denormalize `x` with `mean` and `std`.\"\n return x.cpu().float()*std[...,None,None] + mean[...,None,None] if do_x else x.cpu()", "response": "Denormalize x with mean and std."} {"instruction": "Write a Python 3 function that can\ncreate normalize and denormalize func using mean and std can specify do_y and device.", "response": "def normalize_funcs(mean:FloatTensor, std:FloatTensor, do_x:bool=True, do_y:bool=False)->Tuple[Callable,Callable]:\n \"Create normalize/denormalize func using `mean` and `std`, can specify `do_y` and `device`.\"\n mean,std = tensor(mean),tensor(std)\n return (partial(_normalize_batch, mean=mean, std=std, do_x=do_x, do_y=do_y),\n partial(denormalize, mean=mean, std=std, do_x=do_x))"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef channel_view(x:Tensor)->Tensor:\n \"Make channel the first axis of `x` and flatten remaining axes\"\n return x.transpose(0,1).contiguous().view(x.shape[1],-1)", "response": "Make channel the first axis of x and flatten remaining axes"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef download_images(urls:Collection[str], dest:PathOrStr, max_pics:int=1000, max_workers:int=8, timeout=4):\n \"Download images listed in text file `urls` to path `dest`, at most `max_pics`\"\n urls = open(urls).read().strip().split(\"\\n\")[:max_pics]\n dest = Path(dest)\n dest.mkdir(exist_ok=True)\n parallel(partial(_download_image_inner, dest, timeout=timeout), urls, max_workers=max_workers)", "response": "Download images listed in text file urls to path dest at most 
max_pics"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef resize_to(img, targ_sz:int, use_min:bool=False):\n \"Size to resize to, to hit `targ_sz` at same aspect ratio, in PIL coords (i.e w*h)\"\n w,h = img.size\n min_sz = (min if use_min else max)(w,h)\n ratio = targ_sz/min_sz\n return int(w*ratio),int(h*ratio)", "response": "Size to resize to hit targ_sz at same aspect ratio in PIL coords ( i. e w * h"} {"instruction": "How would you explain what the following Python 3 function does\ndef verify_image(file:Path, idx:int, delete:bool, max_size:Union[int,Tuple[int,int]]=None, dest:Path=None, n_channels:int=3,\n interp=PIL.Image.BILINEAR, ext:str=None, img_format:str=None, resume:bool=False, **kwargs):\n \"Check if the image in `file` exists, maybe resize it and copy it in `dest`.\"\n try:\n # deal with partially broken images as indicated by PIL warnings\n with warnings.catch_warnings():\n warnings.filterwarnings('error')\n try:\n with open(file, 'rb') as img_file: PIL.Image.open(img_file)\n except Warning as w:\n if \"Possibly corrupt EXIF data\" in str(w):\n if delete: # green light to modify files\n print(f\"{file}: Removing corrupt EXIF data\")\n warnings.simplefilter(\"ignore\")\n # save EXIF-cleaned up image, which happens automatically\n PIL.Image.open(file).save(file)\n else: # keep user's files intact\n print(f\"{file}: Not removing corrupt EXIF data, pass `delete=True` to do that\")\n else: warnings.warn(w)\n\n img = PIL.Image.open(file)\n imgarr = np.array(img)\n img_channels = 1 if len(imgarr.shape) == 2 else imgarr.shape[2]\n if (max_size is not None and (img.height > max_size or img.width > max_size)) or img_channels != n_channels:\n assert isinstance(dest, Path), \"You should provide `dest` Path to save resized image\"\n dest_fname = dest/file.name\n if ext is not None: dest_fname=dest_fname.with_suffix(ext)\n if resume and os.path.isfile(dest_fname): return\n if max_size is not None:\n new_sz = resize_to(img, 
max_size)\n img = img.resize(new_sz, resample=interp)\n if n_channels == 3: img = img.convert(\"RGB\")\n img.save(dest_fname, img_format, **kwargs)\n except Exception as e:\n print(f'{e}')\n if delete: file.unlink()", "response": "Check if the image in file exists maybe resize it and copy it in dest."} {"instruction": "Implement a Python 3 function for\nchecking if the images in path aren t broken maybe resize them and copy it in dest.", "response": "def verify_images(path:PathOrStr, delete:bool=True, max_workers:int=4, max_size:Union[int]=None, recurse:bool=False,\n dest:PathOrStr='.', n_channels:int=3, interp=PIL.Image.BILINEAR, ext:str=None, img_format:str=None,\n resume:bool=None, **kwargs):\n \"Check if the images in `path` aren't broken, maybe resize them and copy it in `dest`.\"\n path = Path(path)\n if resume is None and dest == '.': resume=False\n dest = path/Path(dest)\n os.makedirs(dest, exist_ok=True)\n files = get_image_files(path, recurse=recurse)\n func = partial(verify_image, delete=delete, max_size=max_size, dest=dest, n_channels=n_channels, interp=interp,\n ext=ext, img_format=img_format, resume=resume, **kwargs)\n parallel(func, files, max_workers=max_workers)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _ll_pre_transform(self, train_tfm:List[Callable], valid_tfm:List[Callable]):\n \"Call `train_tfm` and `valid_tfm` after opening image, before converting from `PIL.Image`\"\n self.train.x.after_open = compose(train_tfm)\n self.valid.x.after_open = compose(valid_tfm)\n return self", "response": "Call train_tfm and valid_tfm after opening image before converting from PIL. Image"} {"instruction": "Create a Python 3 function for\ncalling train_tfm and valid_tfm after opening image before converting from PIL. 
Image", "response": "def _db_pre_transform(self, train_tfm:List[Callable], valid_tfm:List[Callable]):\n \"Call `train_tfm` and `valid_tfm` after opening image, before converting from `PIL.Image`\"\n self.train_ds.x.after_open = compose(train_tfm)\n self.valid_ds.x.after_open = compose(valid_tfm)\n return self"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _presize(self, size:int, val_xtra_size:int=32, scale:Tuple[float]=(0.08, 1.0), ratio:Tuple[float]=(0.75, 4./3.),\n interpolation:int=2):\n \"Resize images to `size` using `RandomResizedCrop`, passing along `kwargs` to train transform\"\n return self.pre_transform(\n tvt.RandomResizedCrop(size, scale=scale, ratio=ratio, interpolation=interpolation), \n [tvt.Resize(size+val_xtra_size), tvt.CenterCrop(size)])", "response": "Resize images to size using RandomResizedCrop passing along kwargs to train transform"} {"instruction": "Write a Python 3 script for\ncreating an ImageDataBunch from LabelLists lls with potential ds_tfms.", "response": "def create_from_ll(cls, lls:LabelLists, bs:int=64, val_bs:int=None, ds_tfms:Optional[TfmList]=None,\n num_workers:int=defaults.cpus, dl_tfms:Optional[Collection[Callable]]=None, device:torch.device=None,\n test:Optional[PathOrStr]=None, collate_fn:Callable=data_collate, size:int=None, no_check:bool=False,\n resize_method:ResizeMethod=None, mult:int=None, padding_mode:str='reflection',\n mode:str='bilinear', tfm_y:bool=False)->'ImageDataBunch':\n \"Create an `ImageDataBunch` from `LabelLists` `lls` with potential `ds_tfms`.\"\n lls = lls.transform(tfms=ds_tfms, size=size, resize_method=resize_method, mult=mult, padding_mode=padding_mode,\n mode=mode, tfm_y=tfm_y)\n if test is not None: lls.add_test_folder(test)\n return lls.databunch(bs=bs, val_bs=val_bs, dl_tfms=dl_tfms, num_workers=num_workers, collate_fn=collate_fn,\n device=device, no_check=no_check)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef 
from_folder(cls, path:PathOrStr, train:PathOrStr='train', valid:PathOrStr='valid',\n valid_pct=None, classes:Collection=None, **kwargs:Any)->'ImageDataBunch':\n \"Create from imagenet style dataset in `path` with `train`,`valid`,`test` subfolders (or provide `valid_pct`).\"\n path=Path(path)\n il = ImageList.from_folder(path)\n if valid_pct is None: src = il.split_by_folder(train=train, valid=valid)\n else: src = il.split_by_rand_pct(valid_pct)\n src = src.label_from_folder(classes=classes)\n return cls.create_from_ll(src, **kwargs)", "response": "Create from imagenet style dataset in path with train valid test subfolders or provide valid_pct."} {"instruction": "Create a Python 3 function to\ncreate from a DataFrame df.", "response": "def from_df(cls, path:PathOrStr, df:pd.DataFrame, folder:PathOrStr=None, label_delim:str=None, valid_pct:float=0.2,\n fn_col:IntsOrStrs=0, label_col:IntsOrStrs=1, suffix:str='', **kwargs:Any)->'ImageDataBunch':\n \"Create from a `DataFrame` `df`.\"\n src = (ImageList.from_df(df, path=path, folder=folder, suffix=suffix, cols=fn_col)\n .split_by_rand_pct(valid_pct)\n .label_from_df(label_delim=label_delim, cols=label_col))\n return cls.create_from_ll(src, **kwargs)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef from_csv(cls, path:PathOrStr, folder:PathOrStr=None, label_delim:str=None, csv_labels:PathOrStr='labels.csv',\n valid_pct:float=0.2, fn_col:int=0, label_col:int=1, suffix:str='', delimiter:str=None,\n header:Optional[Union[int,str]]='infer', **kwargs:Any)->'ImageDataBunch':\n \"Create from a csv file in `path/csv_labels`.\"\n path = Path(path)\n df = pd.read_csv(path/csv_labels, header=header, delimiter=delimiter)\n return cls.from_df(path, df, folder=folder, label_delim=label_delim, valid_pct=valid_pct,\n fn_col=fn_col, label_col=label_col, suffix=suffix, **kwargs)", "response": "Create from a csv file in path / csv_labels."} {"instruction": "Can you tell what is the following Python 3 
function doing\ndef from_lists(cls, path:PathOrStr, fnames:FilePathList, labels:Collection[str], valid_pct:float=0.2,\n item_cls:Callable=None, **kwargs):\n \"Create from list of `fnames` in `path`.\"\n item_cls = ifnone(item_cls, ImageList)\n fname2label = {f:l for (f,l) in zip(fnames, labels)}\n src = (item_cls(fnames, path=path).split_by_rand_pct(valid_pct)\n .label_from_func(lambda x:fname2label[x]))\n return cls.create_from_ll(src, **kwargs)", "response": "Create from list of fnames in path."} {"instruction": "Can you write a function in Python 3 where it\ncreates from list of fnames in path with label_func.", "response": "def from_name_func(cls, path:PathOrStr, fnames:FilePathList, label_func:Callable, valid_pct:float=0.2, **kwargs):\n \"Create from list of `fnames` in `path` with `label_func`.\"\n src = ImageList(fnames, path=path).split_by_rand_pct(valid_pct)\n return cls.create_from_ll(src.label_from_func(label_func), **kwargs)"} {"instruction": "Explain what the following Python 3 code does\ndef from_name_re(cls, path:PathOrStr, fnames:FilePathList, pat:str, valid_pct:float=0.2, **kwargs):\n \"Create from list of `fnames` in `path` with re expression `pat`.\"\n pat = re.compile(pat)\n def _get_label(fn):\n if isinstance(fn, Path): fn = fn.as_posix()\n res = pat.search(str(fn))\n assert res,f'Failed to find \"{pat}\" in \"{fn}\"'\n return res.group(1)\n return cls.from_name_func(path, fnames, _get_label, valid_pct=valid_pct, **kwargs)", "response": "Create from list of fnames in path with re expression pat."} {"instruction": "Explain what the following Python 3 code does\ndef single_from_classes(path:Union[Path, str], classes:Collection[str], ds_tfms:TfmList=None, **kwargs):\n \"Create an empty `ImageDataBunch` in `path` with `classes`. 
Typically used for inference.\"\n warn(\"\"\"This method is deprecated and will be removed in a future version, use `load_learner` after\n `Learner.export()`\"\"\", DeprecationWarning)\n sd = ImageList([], path=path, ignore_empty=True).split_none()\n return sd.label_const(0, label_cls=CategoryList, classes=classes).transform(ds_tfms, **kwargs).databunch()", "response": "Create an empty ImageDataBunch in path with classes. Typically used for inference."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef batch_stats(self, funcs:Collection[Callable]=None, ds_type:DatasetType=DatasetType.Train)->Tensor:\n \"Grab a batch of data and call reduction function `func` per channel\"\n funcs = ifnone(funcs, [torch.mean,torch.std])\n x = self.one_batch(ds_type=ds_type, denorm=False)[0].cpu()\n return [func(channel_view(x), 1) for func in funcs]", "response": "Grab a batch of data and call reduction function func per channel"} {"instruction": "Implement a Python 3 function for\nadding normalize transform using stats ( defaults to DataBunch. batch_stats )", "response": "def normalize(self, stats:Collection[Tensor]=None, do_x:bool=True, do_y:bool=False)->None:\n \"Add normalize transform using `stats` (defaults to `DataBunch.batch_stats`)\"\n if getattr(self,'norm',False): raise Exception('Can not call normalize twice')\n if stats is None: self.stats = self.batch_stats()\n else: self.stats = stats\n self.norm,self.denorm = normalize_funcs(*self.stats, do_x=do_x, do_y=do_y)\n self.add_tfm(self.norm)\n return self"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef open(self, fn):\n \"Open image in `fn`, subclass and overwrite for custom behavior.\"\n return open_image(fn, convert_mode=self.convert_mode, after_open=self.after_open)", "response": "Open image in fn subclass and overwrite for custom behavior."} {"instruction": "Write a Python 3 script to\nget the list of files in path that have an image suffix. 
recurse determines if we search subfolders.", "response": "def from_folder(cls, path:PathOrStr='.', extensions:Collection[str]=None, **kwargs)->ItemList:\n \"Get the list of files in `path` that have an image suffix. `recurse` determines if we search subfolders.\"\n extensions = ifnone(extensions, image_extensions)\n return super().from_folder(path=path, extensions=extensions, **kwargs)"} {"instruction": "Create a Python 3 function to\nget the filenames in cols of df with folder in front of them suffix at the end.", "response": "def from_df(cls, df:DataFrame, path:PathOrStr, cols:IntsOrStrs=0, folder:PathOrStr=None, suffix:str='', **kwargs)->'ItemList':\n \"Get the filenames in `cols` of `df` with `folder` in front of them, `suffix` at the end.\"\n suffix = suffix or ''\n res = super().from_df(df, path=path, cols=cols, **kwargs)\n pref = f'{res.path}{os.path.sep}'\n if folder is not None: pref += f'{folder}{os.path.sep}'\n res.items = np.char.add(np.char.add(pref, res.items.astype(str)), suffix)\n return res"} {"instruction": "How would you implement a function in Python 3 that\ngets the filenames in path / csv_name opened with header.", "response": "def from_csv(cls, path:PathOrStr, csv_name:str, header:str='infer', **kwargs)->'ItemList':\n \"Get the filenames in `path/csv_name` opened with `header`.\"\n path = Path(path)\n df = pd.read_csv(path/csv_name, header=header)\n return cls.from_df(df, path=path, **kwargs)"} {"instruction": "How would you code a function in Python 3 to\nshow the xs ys on a figure of figsize.", "response": "def show_xys(self, xs, ys, imgsize:int=4, figsize:Optional[Tuple[int,int]]=None, **kwargs):\n \"Show the `xs` (inputs) and `ys` (targets) on a figure of `figsize`.\"\n rows = int(np.ceil(math.sqrt(len(xs))))\n axs = subplots(rows, rows, imgsize=imgsize, figsize=figsize)\n for x,y,ax in zip(xs, ys, axs.flatten()): x.show(ax=ax, y=y, **kwargs)\n for ax in axs.flatten()[len(xs):]: ax.axis('off')\n plt.tight_layout()"} {"instruction": 
"Write a Python 3 function for\nshowing xs ys and zs on a figure of figsize.", "response": "def show_xyzs(self, xs, ys, zs, imgsize:int=4, figsize:Optional[Tuple[int,int]]=None, **kwargs):\n \"Show `xs` (inputs), `ys` (targets) and `zs` (predictions) on a figure of `figsize`.\"\n if self._square_show_res:\n title = 'Ground truth\\nPredictions'\n rows = int(np.ceil(math.sqrt(len(xs))))\n axs = subplots(rows, rows, imgsize=imgsize, figsize=figsize, title=title, weight='bold', size=12)\n for x,y,z,ax in zip(xs,ys,zs,axs.flatten()): x.show(ax=ax, title=f'{str(y)}\\n{str(z)}', **kwargs)\n for ax in axs.flatten()[len(xs):]: ax.axis('off')\n else:\n title = 'Ground truth/Predictions'\n axs = subplots(len(xs), 2, imgsize=imgsize, figsize=figsize, title=title, weight='bold', size=14)\n for i,(x,y,z) in enumerate(zip(xs,ys,zs)):\n x.show(ax=axs[i,0], y=y, **kwargs)\n x.show(ax=axs[i,1], y=z, **kwargs)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef generate_classes(self, items):\n \"Generate classes from unique `items` and add `background`.\"\n classes = super().generate_classes([o[1] for o in items])\n classes = ['background'] + list(classes)\n return classes", "response": "Generate classes from unique items and add background."} {"instruction": "Write a Python 3 function that can\nshow the xs ys on a figure of figsize.", "response": "def show_xys(self, xs, ys, imgsize:int=4, figsize:Optional[Tuple[int,int]]=None, **kwargs):\n \"Show the `xs` (inputs) and `ys`(targets) on a figure of `figsize`.\"\n axs = subplots(len(xs), 2, imgsize=imgsize, figsize=figsize)\n for i, (x,y) in enumerate(zip(xs,ys)):\n x.show(ax=axs[i,0], **kwargs)\n y.show(ax=axs[i,1], **kwargs)\n plt.tight_layout()"} {"instruction": "Given the following Python 3 function, write the documentation\ndef show_xyzs(self, xs, ys, zs, imgsize:int=4, figsize:Optional[Tuple[int,int]]=None, **kwargs):\n \"Show `xs` (inputs), `ys` (targets) and `zs` (predictions) on a figure of 
`figsize`.\"\n title = 'Input / Prediction / Target'\n axs = subplots(len(xs), 3, imgsize=imgsize, figsize=figsize, title=title, weight='bold', size=14)\n for i,(x,y,z) in enumerate(zip(xs,ys,zs)):\n x.show(ax=axs[i,0], **kwargs)\n y.show(ax=axs[i,2], **kwargs)\n z.show(ax=axs[i,1], **kwargs)", "response": "Show xs ys and zs on a figure of figsize."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef gpu_mem_get(id=None):\n \"get total, used and free memory (in MBs) for gpu `id`. if `id` is not passed, currently selected torch device is used\"\n if not use_gpu: return GPUMemory(0, 0, 0)\n if id is None: id = torch.cuda.current_device()\n try:\n handle = pynvml.nvmlDeviceGetHandleByIndex(id)\n info = pynvml.nvmlDeviceGetMemoryInfo(handle)\n return GPUMemory(*(map(b2mb, [info.total, info.free, info.used])))\n except:\n return GPUMemory(0, 0, 0)", "response": "get total used and free memory for gpu id. if id is not passed currently selected torch device is used"} {"instruction": "Can you create a Python 3 function that\ngets [ gpu_id its_free_ram for the first gpu with highest available RAM", "response": "def gpu_with_max_free_mem():\n \"get [gpu_id, its_free_ram] for the first gpu with highest available RAM\"\n mem_all = gpu_mem_get_all()\n if not len(mem_all): return None, 0\n free_all = np.array([x.free for x in mem_all])\n id = np.argmax(free_all)\n return id, free_all[id]"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef gpu_mem_trace(func):\n \"A decorator that runs `GPUMemTrace` w/ report on func\"\n @functools.wraps(func)\n def wrapper(*args, **kwargs):\n with GPUMemTrace(ctx=func.__qualname__, on_exit_report=True):\n return func(*args, **kwargs)\n return wrapper", "response": "A decorator that runs GPUMemTrace w/ report on func"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef reduce_mem_usage(df):\n start_mem = df.memory_usage().sum() / 1024**2\n print('Memory 
usage of dataframe is {:.2f} MB'.format(start_mem))\n\n #Removed from debugging\n columns = df.columns\n #.drop('index')\n\n for col in columns:\n col_type = df[col].dtype\n if str(col_type) != 'category' and col_type != 'datetime64[ns]' and col_type != bool:\n if col_type != object:\n c_min = df[col].min()\n c_max = df[col].max()\n if str(col_type)[:3] == 'int':\n if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n df[col] = df[col].astype(np.int8)\n elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n df[col] = df[col].astype(np.int16)\n elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n df[col] = df[col].astype(np.int32)\n elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n df[col] = df[col].astype(np.int64)\n else:\n #if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n #df[col] = df[col].astype(np.float16)\n #Sometimes causes and error and had to remove\n if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n df[col] = df[col].astype(np.float32)\n else:\n print('Error '+col+' Value would be a float64. 
Disregarding.')\n else:\n df[col] = df[col].astype('category')\n\n end_mem = df.memory_usage().sum() / 1024**2\n print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))\n print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))\n\n return df", "response": "Reduce memory usage of all the columns of a dataframe to reduce memory usage."} {"instruction": "How would you code a function in Python 3 to\nreturn ctx or subctx depending on this and constructor arguments", "response": "def _get_ctx(self, subctx=None):\n \"Return ' (ctx: subctx)' or ' (ctx)' or ' (subctx)' or '' depending on this and constructor arguments\"\n l = []\n if self.ctx is not None: l.append(self.ctx)\n if subctx is not None: l.append(subctx)\n return '' if len(l) == 0 else f\" ({': '.join(l)})\""} {"instruction": "Explain what the following Python 3 code does\ndef _learner_distributed(learn:Learner, cuda_id:int, cache_dir:PathOrStr='tmp'):\n \"Put `learn` on distributed training with `cuda_id`.\"\n learn.callbacks.append(DistributedTrainer(learn, cuda_id))\n learn.callbacks.append(DistributedRecorder(learn, cuda_id, cache_dir))\n return learn", "response": "Put learn on distributed training with cuda_id."} {"instruction": "Write a Python 3 script to\nconstruct a XResNet - 18 model.", "response": "def xresnet18(pretrained=False, **kwargs):\n \"\"\"Constructs a XResNet-18 model.\n\n Args:\n pretrained (bool): If True, returns a model pre-trained on ImageNet\n \"\"\"\n model = XResNet(BasicBlock, [2, 2, 2, 2], **kwargs)\n if pretrained: model.load_state_dict(model_zoo.load_url(model_urls['xresnet18']))\n return model"} {"instruction": "Implement a function in Python 3 to\nconstruct a XResNet - 50 model.", "response": "def xresnet50_2(pretrained=False, **kwargs):\n \"\"\"Constructs a XResNet-50 model.\n\n Args:\n pretrained (bool): If True, returns a model pre-trained on ImageNet\n \"\"\"\n model = XResNet(Bottleneck, [3, 4, 6, 3], **kwargs)\n if pretrained: 
model.load_state_dict(model_zoo.load_url(model_urls['xresnet50']))\n return model"} {"instruction": "How would you code a function in Python 3 to\ncalculate loss and metrics for a batch call out to callbacks as necessary.", "response": "def loss_batch(model:nn.Module, xb:Tensor, yb:Tensor, loss_func:OptLossFunc=None, opt:OptOptimizer=None,\n cb_handler:Optional[CallbackHandler]=None)->Tuple[Union[Tensor,int,float,str]]:\n \"Calculate loss and metrics for a batch, call out to callbacks as necessary.\"\n cb_handler = ifnone(cb_handler, CallbackHandler())\n if not is_listy(xb): xb = [xb]\n if not is_listy(yb): yb = [yb]\n out = model(*xb)\n out = cb_handler.on_loss_begin(out)\n\n if not loss_func: return to_detach(out), yb[0].detach()\n loss = loss_func(out, *yb)\n\n if opt is not None:\n loss,skip_bwd = cb_handler.on_backward_begin(loss)\n if not skip_bwd: loss.backward()\n if not cb_handler.on_backward_end(): opt.step()\n if not cb_handler.on_step_end(): opt.zero_grad()\n\n return loss.detach().cpu()"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef get_preds(model:nn.Module, dl:DataLoader, pbar:Optional[PBar]=None, cb_handler:Optional[CallbackHandler]=None,\n activ:nn.Module=None, loss_func:OptLossFunc=None, n_batch:Optional[int]=None) -> List[Tensor]:\n \"Tuple of predictions and targets, and optional losses (if `loss_func`) using `dl`, max batches `n_batch`.\"\n res = [torch.cat(o).cpu() for o in\n zip(*validate(model, dl, cb_handler=cb_handler, pbar=pbar, average=False, n_batch=n_batch))]\n if loss_func is not None:\n with NoneReduceOnCPU(loss_func) as lf: res.append(lf(res[0], res[1]))\n if activ is not None: res[0] = activ(res[0])\n return res", "response": "Tuple of predictions and targets and optional losses ( if loss_func ) using dl max batches n_batch."} {"instruction": "Make a summary of the following Python 3 code\ndef validate(model:nn.Module, dl:DataLoader, loss_func:OptLossFunc=None, 
cb_handler:Optional[CallbackHandler]=None,\n pbar:Optional[PBar]=None, average=True, n_batch:Optional[int]=None)->Iterator[Tuple[Union[Tensor,int],...]]:\n \"Calculate `loss_func` of `model` on `dl` in evaluation mode.\"\n model.eval()\n with torch.no_grad():\n val_losses,nums = [],[]\n if cb_handler: cb_handler.set_dl(dl)\n for xb,yb in progress_bar(dl, parent=pbar, leave=(pbar is not None)):\n if cb_handler: xb, yb = cb_handler.on_batch_begin(xb, yb, train=False)\n val_loss = loss_batch(model, xb, yb, loss_func, cb_handler=cb_handler)\n val_losses.append(val_loss)\n if not is_listy(yb): yb = [yb]\n nums.append(yb[0].shape[0])\n if cb_handler and cb_handler.on_batch_end(val_losses[-1]): break\n if n_batch and (len(nums)>=n_batch): break\n nums = np.array(nums, dtype=np.float32)\n if average: return (to_np(torch.stack(val_losses)) * nums).sum() / nums.sum()\n else: return val_losses", "response": "Calculate loss_func of model on dl in evaluation mode."} {"instruction": "Explain what the following Python 3 code does\ndef train_epoch(model:nn.Module, dl:DataLoader, opt:optim.Optimizer, loss_func:LossFunction)->None:\n \"Simple training of `model` for 1 epoch of `dl` using optim `opt` and loss function `loss_func`.\"\n model.train()\n for xb,yb in dl:\n loss = loss_func(model(xb), yb)\n loss.backward()\n opt.step()\n opt.zero_grad()", "response": "Simple training of model for 1 epoch of dl using optim opt and loss function loss_func."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef fit(epochs:int, learn:BasicLearner, callbacks:Optional[CallbackList]=None, metrics:OptMetrics=None)->None:\n \"Fit the `model` on `data` and learn using `loss_func` and `opt`.\"\n assert len(learn.data.train_dl) != 0, f\"\"\"Your training dataloader is empty, can't train a model.\n Use a smaller batch size (batch size={learn.data.train_dl.batch_size} for {len(learn.data.train_dl.dataset)} elements).\"\"\"\n cb_handler = CallbackHandler(callbacks, metrics)\n 
pbar = master_bar(range(epochs))\n cb_handler.on_train_begin(epochs, pbar=pbar, metrics=metrics)\n\n exception=False\n try:\n for epoch in pbar:\n learn.model.train()\n cb_handler.set_dl(learn.data.train_dl)\n cb_handler.on_epoch_begin()\n for xb,yb in progress_bar(learn.data.train_dl, parent=pbar):\n xb, yb = cb_handler.on_batch_begin(xb, yb)\n loss = loss_batch(learn.model, xb, yb, learn.loss_func, learn.opt, cb_handler)\n if cb_handler.on_batch_end(loss): break\n\n if not cb_handler.skip_validate and not learn.data.empty_val:\n val_loss = validate(learn.model, learn.data.valid_dl, loss_func=learn.loss_func,\n cb_handler=cb_handler, pbar=pbar)\n else: val_loss=None\n if cb_handler.on_epoch_end(val_loss): break\n except Exception as e:\n exception = e\n raise\n finally: cb_handler.on_train_end(exception)", "response": "Fit the model on data and learn using loss_func and opt."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef load_learner(path:PathOrStr, file:PathLikeOrBinaryStream='export.pkl', test:ItemList=None, **db_kwargs):\n \"Load a `Learner` object saved with `export_state` in `path/file` with empty data, optionally add `test` and load on `cpu`. `file` can be file-like (file or buffer)\"\n source = Path(path)/file if is_pathlike(file) else file\n state = torch.load(source, map_location='cpu') if defaults.device == torch.device('cpu') else torch.load(source)\n model = state.pop('model')\n src = LabelLists.load_state(path, state.pop('data'))\n if test is not None: src.add_test(test)\n data = src.databunch(**db_kwargs)\n cb_state = state.pop('cb_state')\n clas_func = state.pop('cls')\n res = clas_func(data, model, **state)\n res.callback_fns = state['callback_fns'] #to avoid duplicates\n res.callbacks = [load_callback(c,s, res) for c,s in cb_state.items()]\n return res", "response": "Load a Learner object saved with export_state in path / file with empty data optionally add test and load on cpu. 
file can be file - like ( file or buffer )"} {"instruction": "Can you create a Python 3 function that\ninitializes recording status at beginning of training.", "response": "def on_train_begin(self, pbar:PBar, metrics_names:Collection[str], **kwargs:Any)->None:\n \"Initialize recording status at beginning of training.\"\n self.pbar = pbar\n self.names = ['epoch', 'train_loss'] if self.no_val else ['epoch', 'train_loss', 'valid_loss']\n self.metrics_names = metrics_names\n self.names += self.metrics_names\n if hasattr(self, '_added_met_names'): self.names += self._added_met_names\n if self.add_time: self.names.append('time')\n if not self.silent: self.pbar.write(self.names, table=True)\n self.losses,self.val_losses,self.lrs,self.moms,self.metrics,self.nb_batches = [],[],[],[],[],[]"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef on_batch_begin(self, train, **kwargs:Any)->None:\n \"Record learning rate and momentum at beginning of batch.\"\n if train:\n self.lrs.append(self.opt.lr)\n self.moms.append(self.opt.mom)", "response": "Record learning rate and momentum at beginning of batch."} {"instruction": "Create a Python 3 function for\nrecording the loss before any other callback has a chance to modify it.", "response": "def on_backward_begin(self, smooth_loss:Tensor, **kwargs:Any)->None:\n \"Record the loss before any other callback has a chance to modify it.\"\n self.losses.append(smooth_loss)\n if self.pbar is not None and hasattr(self.pbar,'child'):\n self.pbar.child.comment = f'{smooth_loss:.4f}'"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef on_epoch_end(self, epoch:int, num_batch:int, smooth_loss:Tensor,\n last_metrics=MetricsList, **kwargs:Any)->bool:\n \"Save epoch info: num_batch, smooth_loss, metrics.\"\n self.nb_batches.append(num_batch)\n if last_metrics is not None: self.val_losses.append(last_metrics[0])\n else: last_metrics = [] if self.no_val else [None]\n if 
len(last_metrics) > 1: self.metrics.append(last_metrics[1:])\n self.format_stats([epoch, smooth_loss] + last_metrics)", "response": "Save epoch info num_batch smooth_loss metrics."} {"instruction": "Can you write a function in Python 3 where it\nformats stats before printing.", "response": "def format_stats(self, stats:TensorOrNumList)->None:\n \"Format stats before printing.\"\n str_stats = []\n for name,stat in zip(self.names,stats):\n str_stats.append('#na#' if stat is None else str(stat) if isinstance(stat, int) else f'{stat:.6f}')\n if self.add_time: str_stats.append(format_time(time() - self.start_epoch))\n if not self.silent: self.pbar.write(str_stats, table=True)"} {"instruction": "Can you create a Python 3 function that\nadds names to the inner metric names.", "response": "def add_metric_names(self, names):\n \"Add `names` to the inner metric names.\"\n if hasattr(self, '_added_met_names'): self._added_met_names += names\n else: self._added_met_names = names"} {"instruction": "Can you implement a function in Python 3 that\nplots learning rate show_moms to include momentum.", "response": "def plot_lr(self, show_moms=False, skip_start:int=0, skip_end:int=0, return_fig:bool=None)->Optional[plt.Figure]:\n \"Plot learning rate, `show_moms` to include momentum.\"\n lrs = self._split_list(self.lrs, skip_start, skip_end)\n iterations = self._split_list(range_of(self.lrs), skip_start, skip_end)\n if show_moms:\n moms = self._split_list(self.moms, skip_start, skip_end)\n fig, axs = plt.subplots(1,2, figsize=(12,4))\n axs[0].plot(iterations, lrs)\n axs[0].set_xlabel('Iterations')\n axs[0].set_ylabel('Learning Rate')\n axs[1].plot(iterations, moms)\n axs[1].set_xlabel('Iterations')\n axs[1].set_ylabel('Momentum')\n else:\n fig, ax = plt.subplots()\n ax.plot(iterations, lrs)\n ax.set_xlabel('Iterations')\n ax.set_ylabel('Learning Rate')\n if ifnone(return_fig, defaults.return_fig): return fig\n if not IN_NOTEBOOK: plot_sixel(fig)"} {"instruction": "Implement a Python 3 
function for\nplotting learning rate and losses trimmed between skip_start and skip_end. Optionally plot and return min gradient", "response": "def plot(self, skip_start:int=10, skip_end:int=5, suggestion:bool=False, return_fig:bool=None,\n **kwargs)->Optional[plt.Figure]:\n \"Plot learning rate and losses, trimmed between `skip_start` and `skip_end`. Optionally plot and return min gradient\"\n lrs = self._split_list(self.lrs, skip_start, skip_end)\n losses = self._split_list(self.losses, skip_start, skip_end)\n losses = [x.item() for x in losses]\n if 'k' in kwargs: losses = self.smoothen_by_spline(lrs, losses, **kwargs)\n fig, ax = plt.subplots(1,1)\n ax.plot(lrs, losses)\n ax.set_ylabel(\"Loss\")\n ax.set_xlabel(\"Learning Rate\")\n ax.set_xscale('log')\n ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%.0e'))\n if suggestion:\n try: mg = (np.gradient(np.array(losses))).argmin()\n except:\n print(\"Failed to compute the gradients, there might not be enough points.\")\n return\n print(f\"Min numerical gradient: {lrs[mg]:.2E}\")\n ax.plot(lrs[mg],losses[mg],markersize=10,marker='o',color='red')\n self.min_grad_lr = lrs[mg]\n if ifnone(return_fig, defaults.return_fig): return fig\n if not IN_NOTEBOOK: plot_sixel(fig)"} {"instruction": "Write a Python 3 function for\nplotting training and validation losses.", "response": "def plot_losses(self, skip_start:int=0, skip_end:int=0, return_fig:bool=None)->Optional[plt.Figure]:\n \"Plot training and validation losses.\"\n fig, ax = plt.subplots(1,1)\n losses = self._split_list(self.losses, skip_start, skip_end)\n iterations = self._split_list(range_of(self.losses), skip_start, skip_end)\n ax.plot(iterations, losses, label='Train')\n val_iter = self._split_list_val(np.cumsum(self.nb_batches), skip_start, skip_end)\n val_losses = self._split_list_val(self.val_losses, skip_start, skip_end)\n ax.plot(val_iter, val_losses, label='Validation')\n ax.set_ylabel('Loss')\n ax.set_xlabel('Batches processed')\n ax.legend()\n if 
ifnone(return_fig, defaults.return_fig): return fig\n if not IN_NOTEBOOK: plot_sixel(fig)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef plot_metrics(self, skip_start:int=0, skip_end:int=0, return_fig:bool=None)->Optional[plt.Figure]:\n \"Plot metrics collected during training.\"\n assert len(self.metrics) != 0, \"There are no metrics to plot.\"\n fig, axes = plt.subplots(len(self.metrics[0]),1,figsize=(6, 4*len(self.metrics[0])))\n val_iter = self._split_list_val(np.cumsum(self.nb_batches), skip_start, skip_end)\n axes = axes.flatten() if len(self.metrics[0]) != 1 else [axes]\n for i, ax in enumerate(axes):\n values = [met[i] for met in self.metrics]\n values = self._split_list_val(values, skip_start, skip_end)\n ax.plot(val_iter, values)\n ax.set_ylabel(str(self.metrics_names[i]))\n ax.set_xlabel('Batches processed') \n if ifnone(return_fig, defaults.return_fig): return fig\n if not IN_NOTEBOOK: plot_sixel(fig)", "response": "Plot metrics collected during training."} {"instruction": "Implement a Python 3 function for\nlooking at params ( annotated with Param in func and return an ArgumentParser", "response": "def anno_parser(func):\n \"Look at params (annotated with `Param`) in func and return an `ArgumentParser`\"\n p = ArgumentParser(description=func.__doc__)\n for k,v in inspect.signature(func).parameters.items():\n param = func.__annotations__.get(k, Param())\n kwargs = param.kwargs\n if v.default != inspect.Parameter.empty: kwargs['default'] = v.default\n p.add_argument(f\"{param.pre}{k}\", **kwargs)\n return p"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef call_parse(func):\n \"Decorator to create a simple CLI from `func` using `anno_parser`\"\n name = inspect.currentframe().f_back.f_globals['__name__']\n if name == \"__main__\":\n args = anno_parser(func).parse_args()\n func(**args.__dict__)\n else: return func", "response": "Decorator to create a simple CLI 
from func using anno_parser"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef call_plac(f):\n \"Decorator to create a simple CLI from `func` using `plac`\"\n name = inspect.currentframe().f_back.f_globals['__name__']\n if name == '__main__':\n import plac\n res = plac.call(f)\n if callable(res): res()\n else: return f", "response": "Decorator to create a simple CLI from func using plac"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef numericalize_tok(tokens, max_vocab=50000, min_freq=0, unk_tok=\"_unk_\", pad_tok=\"_pad_\", bos_tok=\"_bos_\", eos_tok=\"_eos_\"):\n if isinstance(tokens, str):\n raise ValueError(\"Expected to receive a list of tokens. Received a string instead\")\n if isinstance(tokens[0], list):\n tokens = [p for o in tokens for p in o]\n freq = Counter(tokens)\n int2tok = [o for o,c in freq.most_common(max_vocab) if c>min_freq]\n unk_id = 3\n int2tok.insert(0, bos_tok)\n int2tok.insert(1, pad_tok)\n int2tok.insert(2, eos_tok)\n int2tok.insert(unk_id, unk_tok)\n tok2int = collections.defaultdict(lambda:unk_id, {v:k for k,v in enumerate(int2tok)})\n return int2tok, tok2int", "response": "Takes in text tokens and returns int2tok and tok2int converters"} {"instruction": "Write a Python 3 function for\nstarting a new kernel and return its Manager and Client", "response": "def start_new_kernel(startup_timeout=60, kernel_name='python', **kwargs):\n \"\"\"Start a new kernel, and return its Manager and Client\"\"\"\n logger.debug('Starting new kernel: \"%s\"' % kernel_name)\n km = KernelManager(kernel_name=kernel_name,\n kernel_spec_manager=NbvalKernelspecManager())\n km.start_kernel(**kwargs)\n kc = km.client()\n kc.start_channels()\n try:\n kc.wait_for_ready(timeout=startup_timeout)\n except RuntimeError:\n logger.exception('Failure starting kernel \"%s\"', kernel_name)\n kc.stop_channels()\n km.shutdown_kernel()\n raise\n\n return km, kc"} {"instruction": "How would you 
explain what the following Python 3 function does\ndef get_kernel_spec(self, kernel_name):\n if kernel_name == CURRENT_ENV_KERNEL_NAME:\n return self.kernel_spec_class(\n resource_dir=ipykernel.kernelspec.RESOURCES,\n **ipykernel.kernelspec.get_kernel_dict())\n else:\n return super(NbvalKernelspecManager, self).get_kernel_spec(kernel_name)", "response": "Returns a : class:`KernelSpec` instance for the given kernel name."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef get_message(self, stream, timeout=None):\n try:\n if stream == 'iopub':\n msg = self.kc.get_iopub_msg(timeout=timeout)\n elif stream == 'shell':\n msg = self.kc.get_shell_msg(timeout=timeout)\n else:\n raise ValueError('Invalid stream specified: \"%s\"' % stream)\n except Empty:\n logger.debug('Kernel: Timeout waiting for message on %s', stream)\n raise\n logger.debug(\"Kernel message (%s):\\n%s\", stream, pformat(msg))\n return msg", "response": "Function to get a message from the iopub channel."} {"instruction": "Write a Python 3 function that can\nexecute a string of python code in cell input.", "response": "def execute_cell_input(self, cell_input, allow_stdin=None):\n \"\"\"\n Executes a string of python code in cell input.\n We do not allow the kernel to make requests to the stdin\n this is the norm for notebooks\n\n Function returns a unique message id of the reply from\n the kernel.\n \"\"\"\n if cell_input:\n logger.debug('Executing cell: \"%s\"...', cell_input.splitlines()[0][:40])\n else:\n logger.debug('Executing empty cell')\n return self.kc.execute(cell_input, allow_stdin=allow_stdin, stop_on_error=False)"} {"instruction": "How would you explain what the following Python 3 function does\ndef await_reply(self, msg_id, timeout=None):\n while True:\n msg = self.get_message(stream='shell', timeout=timeout)\n\n # Is this the message we are waiting for?\n if msg['parent_header'].get('msg_id') == msg_id:\n if msg['content']['status'] == 'aborted':\n # This 
should not occur!\n raise RuntimeError('Kernel aborted execution request')\n return", "response": "Wait for a reply for a message id."} {"instruction": "Make a summary of the following Python 3 code\ndef await_idle(self, parent_id, timeout):\n while True:\n # Get a message from the kernel iopub channel\n msg = self.get_message(timeout=timeout, stream='iopub') # raises Empty on timeout!\n\n if msg['parent_header'].get('msg_id') != parent_id:\n continue\n if msg['msg_type'] == 'status':\n if msg['content']['execution_state'] == 'idle':\n break", "response": "Poll the iopub stream until an idle message is received for the given parent ID"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef stop(self):\n logger.debug('Stopping kernel')\n self.kc.stop_channels()\n self.km.shutdown_kernel(now=True)\n del self.km", "response": "Stops the kernel process and shutdown the kernel manager."} {"instruction": "Write a Python 3 function that can\nget a list of index values for a validation set from a dataset", "response": "def get_cv_idxs(n, cv_idx=0, val_pct=0.2, seed=42):\n \"\"\" Get a list of index values for Validation set from a dataset\n \n Arguments:\n n : int, Total number of elements in the data set.\n cv_idx : int, starting index [idx_start = cv_idx*int(val_pct*n)] \n val_pct : (int, float), validation set percentage \n seed : seed value for RandomState\n \n Returns:\n list of indexes \n \"\"\"\n np.random.seed(seed)\n n_val = int(val_pct*n)\n idx_start = cv_idx*n_val\n idxs = np.random.permutation(n)\n return idxs[idx_start:idx_start+n_val]"} {"instruction": "Implement a function in Python 3 to\nenlarge or shrink a single image to scale such that the smaller of the height or width dimension is equal to targ.", "response": "def resize_img(fname, targ, path, new_path, fn=None):\n \"\"\"\n Enlarge or shrink a single image to scale, such that the smaller of the height or width dimension is equal to targ.\n \"\"\"\n if fn is 
None:\n fn = resize_fn(targ)\n dest = os.path.join(path_for(path, new_path, targ), fname)\n if os.path.exists(dest): return\n im = Image.open(os.path.join(path, fname)).convert('RGB')\n os.makedirs(os.path.split(dest)[0], exist_ok=True)\n fn(im).save(dest)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef resize_imgs(fnames, targ, path, new_path, resume=True, fn=None):\n target_path = path_for(path, new_path, targ)\n if resume:\n subdirs = {os.path.dirname(p) for p in fnames}\n subdirs = {s for s in subdirs if os.path.exists(os.path.join(target_path, s))}\n already_resized_fnames = set()\n for subdir in subdirs:\n files = [os.path.join(subdir, file) for file in os.listdir(os.path.join(target_path, subdir))]\n already_resized_fnames.update(set(files))\n original_fnames = set(fnames)\n fnames = list(original_fnames - already_resized_fnames)\n \n errors = {}\n def safely_process(fname):\n try:\n resize_img(fname, targ, path, new_path, fn=fn)\n except Exception as ex:\n errors[fname] = str(ex)\n\n if len(fnames) > 0:\n with ThreadPoolExecutor(num_cpus()) as e:\n ims = e.map(lambda fname: safely_process(fname), fnames)\n for _ in tqdm(ims, total=len(fnames), leave=False): pass\n if errors:\n print('Some images failed to process:')\n print(json.dumps(errors, indent=2))\n return os.path.join(path,new_path,str(targ))", "response": "Enlarge or shrink a set of images in the same directory to scale such that the smaller of the height or width dimension is equal to targ."} {"instruction": "Explain what the following Python 3 code does\ndef read_dir(path, folder):\n full_path = os.path.join(path, folder)\n fnames = glob(f\"{full_path}/*.*\")\n directories = glob(f\"{full_path}/*/\")\n if any(fnames):\n return [os.path.relpath(f,path) for f in fnames]\n elif any(directories):\n raise FileNotFoundError(\"{} has subdirectories but contains no files. 
Is your directory structure is correct?\".format(full_path))\n else:\n raise FileNotFoundError(\"{} folder doesn't exist or is empty\".format(full_path))", "response": "Reads a folder and returns a list of relative file paths to path for all files within it."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef n_hot(ids, c):\n '''\n one hot encoding by index. Returns array of length c, where all entries are 0, except for the indecies in ids\n '''\n res = np.zeros((c,), dtype=np.float32)\n res[ids] = 1\n return res", "response": "Returns array of length c where all entries are 0 except for the indecies in ids"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef folder_source(path, folder):\n fnames, lbls, all_lbls = read_dirs(path, folder)\n lbl2idx = {lbl:idx for idx,lbl in enumerate(all_lbls)}\n idxs = [lbl2idx[lbl] for lbl in lbls]\n lbl_arr = np.array(idxs, dtype=int)\n return fnames, lbl_arr, all_lbls", "response": "Returns the filenames and labels for a folder within a path\n \n "} {"instruction": "Create a Python 3 function to\nparse filenames and labels from a CSV file.", "response": "def parse_csv_labels(fn, skip_header=True, cat_separator = ' '):\n \"\"\"Parse filenames and label sets from a CSV file.\n\n This method expects that the csv file at path :fn: has two columns. If it\n has a header, :skip_header: should be set to True. 
The labels in the\n label set are expected to be space separated.\n\n Arguments:\n fn: Path to a CSV file.\n skip_header: A boolean flag indicating whether to skip the header.\n\n Returns:\n a two-tuple of (\n image filenames,\n a dictionary of filenames and corresponding labels\n )\n .\n :param cat_separator: the separator for the categories column\n \"\"\"\n df = pd.read_csv(fn, index_col=0, header=0 if skip_header else None, dtype=str)\n fnames = df.index.values\n df.iloc[:,0] = df.iloc[:,0].str.split(cat_separator)\n return fnames, list(df.to_dict().values())[0]"} {"instruction": "Write a Python 3 script for\ntruing if the fn points to a DICOM image", "response": "def isdicom(fn):\n '''True if the fn points to a DICOM image'''\n fn = str(fn)\n if fn.endswith('.dcm'):\n return True\n # Dicom signature from the dicom spec.\n with open(fn,'rb') as fh:\n fh.seek(0x80)\n return fh.read(4)==b'DICM'"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef open_image(fn):\n flags = cv2.IMREAD_UNCHANGED+cv2.IMREAD_ANYDEPTH+cv2.IMREAD_ANYCOLOR\n if not os.path.exists(fn) and not str(fn).startswith(\"http\"):\n raise OSError('No such file or directory: {}'.format(fn))\n elif os.path.isdir(fn) and not str(fn).startswith(\"http\"):\n raise OSError('Is a directory: {}'.format(fn))\n elif isdicom(fn):\n slice = pydicom.read_file(fn)\n if slice.PhotometricInterpretation.startswith('MONOCHROME'):\n # Make a fake RGB image\n im = np.stack([slice.pixel_array]*3,-1)\n return im / ((1 << slice.BitsStored)-1)\n else:\n # No support for RGB yet, as it involves various color spaces.\n # It shouldn't be too difficult to add though, if needed.\n raise OSError('Unsupported DICOM image with PhotometricInterpretation=={}'.format(slice.PhotometricInterpretation))\n else:\n #res = np.array(Image.open(fn), dtype=np.float32)/255\n #if len(res.shape)==2: res = np.repeat(res[...,None],3,2)\n #return res\n try:\n if str(fn).startswith(\"http\"):\n req = 
urllib.urlopen(str(fn))\n image = np.asarray(bytearray(req.read()), dtype=\"uint8\")\n im = cv2.imdecode(image, flags).astype(np.float32)/255\n else:\n im = cv2.imread(str(fn), flags).astype(np.float32)/255\n if im is None: raise OSError(f'File not recognized by opencv: {fn}')\n return cv2.cvtColor(im, cv2.COLOR_BGR2RGB)\n except Exception as e:\n raise OSError('Error handling image at: {}'.format(fn)) from e", "response": "Opens an image using OpenCV and returns the image in RGB format."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef split_by_idx(idxs, *a):\n mask = np.zeros(len(a[0]),dtype=bool)\n mask[np.array(idxs)] = True\n return [(o[mask],o[~mask]) for o in a]", "response": "Split each array passed as a to a pair of arrays like this ( elements selected by idxs the remaining elements"} {"instruction": "Write a Python 3 script for\nreversing the normalization done to a batch of images.", "response": "def denorm(self,arr):\n \"\"\"Reverse the normalization done to a batch of images.\n\n Arguments:\n arr: of shape/size (N,3,sz,sz)\n \"\"\"\n if type(arr) is not np.ndarray: arr = to_np(arr)\n if len(arr.shape)==3: arr = arr[None]\n return self.transform.denorm(np.rollaxis(arr,1,4))"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef resized(self, dl, targ, new_path, resume = True, fn=None):\n return dl.dataset.resize_imgs(targ, new_path, resume=resume, fn=fn) if dl else None", "response": "Resizes the images in targ to new_path."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef from_arrays(cls, path, trn, val, bs=64, tfms=(None,None), classes=None, num_workers=4, test=None, continuous=False):\n f = ArraysIndexRegressionDataset if continuous else ArraysIndexDataset\n datasets = cls.get_ds(f, trn, val, tfms, test=test)\n return cls(path, datasets, bs, num_workers, classes=classes)", "response": "Read in images and their labels given as numpy 
arrays."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef from_paths(cls, path, bs=64, tfms=(None,None), trn_name='train', val_name='valid', test_name=None, test_with_labels=False, num_workers=8):\n assert not(tfms[0] is None or tfms[1] is None), \"please provide transformations for your train and validation sets\"\n trn,val = [folder_source(path, o) for o in (trn_name, val_name)]\n if test_name:\n test = folder_source(path, test_name) if test_with_labels else read_dir(path, test_name)\n else: test = None\n datasets = cls.get_ds(FilesIndexArrayDataset, trn, val, tfms, path=path, test=test)\n return cls(path, datasets, bs, num_workers, classes=trn[2])", "response": "Read in images and their labels given as sub - folders names."} {"instruction": "Explain what the following Python 3 code does\ndef from_csv(cls, path, folder, csv_fname, bs=64, tfms=(None,None),\n val_idxs=None, suffix='', test_name=None, continuous=False, skip_header=True, num_workers=8, cat_separator=' '):\n \"\"\" Read in images and their labels given as a CSV file.\n\n This method should be used when training image labels are given in an CSV file as opposed to\n sub-directories with label names.\n\n Arguments:\n path: a root path of the data (used for storing trained models, precomputed values, etc)\n folder: a name of the folder in which training images are contained.\n csv_fname: a name of the CSV file which contains target labels.\n bs: batch size\n tfms: transformations (for data augmentations). e.g. output of `tfms_from_model`\n val_idxs: index of images to be used for validation. e.g. output of `get_cv_idxs`.\n If None, default arguments to get_cv_idxs are used.\n suffix: suffix to add to image names in CSV file (sometimes CSV only contains the file name without file\n extension e.g. 
'.jpg' - in which case, you can set suffix as '.jpg')\n test_name: a name of the folder which contains test images.\n continuous: if True, the data set is used to train regression models. If False, it is used \n to train classification models.\n skip_header: skip the first row of the CSV file.\n num_workers: number of workers\n cat_separator: Labels category separator\n\n Returns:\n ImageClassifierData\n \"\"\"\n assert not (tfms[0] is None or tfms[1] is None), \"please provide transformations for your train and validation sets\"\n assert not (os.path.isabs(folder)), \"folder needs to be a relative path\"\n fnames,y,classes = csv_source(folder, csv_fname, skip_header, suffix, continuous=continuous, cat_separator=cat_separator)\n return cls.from_names_and_array(path, fnames, y, classes, val_idxs, test_name,\n num_workers=num_workers, suffix=suffix, tfms=tfms, bs=bs, continuous=continuous)", "response": "Read in images and their labels given as a CSV file."} {"instruction": "Can you create a Python 3 function that\nreads in images given a sub - folder and a numpy array containing target labels.", "response": "def from_path_and_array(cls, path, folder, y, classes=None, val_idxs=None, test_name=None,\n num_workers=8, tfms=(None,None), bs=64):\n \"\"\" Read in images given a sub-folder and their labels given a numpy array\n\n Arguments:\n path: a root path of the data (used for storing trained models, precomputed values, etc)\n folder: a name of the folder in which training images are contained.\n y: numpy array which contains target labels ordered by filenames.\n bs: batch size\n tfms: transformations (for data augmentations). e.g. output of `tfms_from_model`\n val_idxs: index of images to be used for validation. e.g. 
output of `get_cv_idxs`.\n If None, default arguments to get_cv_idxs are used.\n test_name: a name of the folder which contains test images.\n num_workers: number of workers\n\n Returns:\n ImageClassifierData\n \"\"\"\n assert not (tfms[0] is None or tfms[1] is None), \"please provide transformations for your train and validation sets\"\n assert not (os.path.isabs(folder)), \"folder needs to be a relative path\"\n fnames = np.core.defchararray.add(f'{folder}/', sorted(os.listdir(f'{path}{folder}')))\n return cls.from_names_and_array(path, fnames, y, classes, val_idxs, test_name,\n num_workers=num_workers, tfms=tfms, bs=bs)"} {"instruction": "Write a Python 3 script for\nbeing the code running in the ipython environment ( jupyter including )", "response": "def is_in_ipython():\n \"Is the code running in the ipython environment (jupyter including)\"\n\n program_name = os.path.basename(os.getenv('_', ''))\n\n if ('jupyter-notebook' in program_name or # jupyter-notebook\n 'ipython' in program_name or # ipython\n 'JPY_PARENT_PID' in os.environ): # ipython-notebook\n return True\n else:\n return False"} {"instruction": "Given the following Python 3 function, write the documentation\ndef get_ref_free_exc_info():\n \"Free traceback from references to locals() in each frame to avoid circular reference leading to gc.collect() unable to reclaim memory\"\n type, val, tb = sys.exc_info()\n traceback.clear_frames(tb)\n return (type, val, tb)", "response": "Free traceback from references to locals in each frame to avoid circular reference leading to gc. 
collect() unable to reclaim memory"} {"instruction": "Make a summary of the following Python 3 code\ndef gpu_mem_restore(func):\n \"Reclaim GPU RAM if CUDA out of memory happened, or execution was interrupted\"\n @functools.wraps(func)\n def wrapper(*args, **kwargs):\n tb_clear_frames = os.environ.get('FASTAI_TB_CLEAR_FRAMES', None)\n if not IS_IN_IPYTHON or tb_clear_frames==\"0\":\n return func(*args, **kwargs)\n\n try:\n return func(*args, **kwargs)\n except Exception as e:\n if (\"CUDA out of memory\" in str(e) or\n \"device-side assert triggered\" in str(e) or\n tb_clear_frames == \"1\"):\n type, val, tb = get_ref_free_exc_info() # must!\n gc.collect()\n if \"device-side assert triggered\" in str(e):\n warn(\"\"\"When 'device-side assert triggered' error happens, it's not possible to recover and you must restart the kernel to continue. Use os.environ['CUDA_LAUNCH_BLOCKING']=\"1\" before restarting to debug\"\"\")\n raise type(val).with_traceback(tb) from None\n else: raise # re-raises the exact last exception\n return wrapper", "response": "Reclaim GPU RAM if CUDA out of memory happened or execution was interrupted"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef fit(model, data, n_epochs, opt, crit, metrics=None, callbacks=None, stepper=Stepper,\n swa_model=None, swa_start=None, swa_eval_freq=None, visualize=False, **kwargs):\n \"\"\" Fits a model\n\n Arguments:\n model (model): any pytorch module\n net = to_gpu(net)\n data (ModelData): see ModelData class and subclasses (can be a list)\n opts: an optimizer. Example: optim.Adam. \n If n_epochs is a list, it needs to be the layer_optimizer to get the optimizer as it changes.\n n_epochs(int or list): number of epochs (or list of number of epochs)\n crit: loss function to optimize. 
Example: F.cross_entropy\n \"\"\"\n\n seq_first = kwargs.pop('seq_first', False)\n all_val = kwargs.pop('all_val', False)\n get_ep_vals = kwargs.pop('get_ep_vals', False)\n validate_skip = kwargs.pop('validate_skip', 0)\n metrics = metrics or []\n callbacks = callbacks or []\n avg_mom=0.98\n batch_num,avg_loss=0,0.\n for cb in callbacks: cb.on_train_begin()\n names = [\"epoch\", \"trn_loss\", \"val_loss\"] + [f.__name__ for f in metrics]\n if swa_model is not None:\n swa_names = ['swa_loss'] + [f'swa_{f.__name__}' for f in metrics]\n names += swa_names\n # will use this to call evaluate later\n swa_stepper = stepper(swa_model, None, crit, **kwargs)\n\n layout = \"{!s:10} \" * len(names)\n if not isinstance(n_epochs, Iterable): n_epochs=[n_epochs]\n if not isinstance(data, Iterable): data = [data]\n if len(data) == 1: data = data * len(n_epochs)\n for cb in callbacks: cb.on_phase_begin()\n model_stepper = stepper(model, opt.opt if hasattr(opt,'opt') else opt, crit, **kwargs)\n ep_vals = collections.OrderedDict()\n tot_epochs = int(np.ceil(np.array(n_epochs).sum()))\n cnt_phases = np.array([ep * len(dat.trn_dl) for (ep,dat) in zip(n_epochs,data)]).cumsum()\n phase = 0\n for epoch in tnrange(tot_epochs, desc='Epoch'):\n if phase >= len(n_epochs): break #Sometimes cumulated errors make this append.\n model_stepper.reset(True)\n cur_data = data[phase]\n if hasattr(cur_data, 'trn_sampler'): cur_data.trn_sampler.set_epoch(epoch)\n if hasattr(cur_data, 'val_sampler'): cur_data.val_sampler.set_epoch(epoch)\n num_batch = len(cur_data.trn_dl)\n t = tqdm(iter(cur_data.trn_dl), leave=False, total=num_batch, miniters=0)\n if all_val: val_iter = IterBatch(cur_data.val_dl)\n\n for (*x,y) in t:\n batch_num += 1\n for cb in callbacks: cb.on_batch_begin()\n loss = model_stepper.step(V(x),V(y), epoch)\n avg_loss = avg_loss * avg_mom + loss * (1-avg_mom)\n debias_loss = avg_loss / (1 - avg_mom**batch_num)\n t.set_postfix(loss=debias_loss, refresh=False)\n stop=False\n los = debias_loss 
if not all_val else [debias_loss] + validate_next(model_stepper,metrics, val_iter)\n for cb in callbacks: stop = stop or cb.on_batch_end(los)\n if stop: return\n if batch_num >= cnt_phases[phase]:\n for cb in callbacks: cb.on_phase_end()\n phase += 1\n if phase >= len(n_epochs):\n t.close()\n break\n for cb in callbacks: cb.on_phase_begin()\n if isinstance(opt, LayerOptimizer): model_stepper.opt = opt.opt\n if cur_data != data[phase]:\n t.close()\n break\n\n if not all_val:\n vals = validate(model_stepper, cur_data.val_dl, metrics, epoch, seq_first=seq_first, validate_skip = validate_skip)\n stop=False\n for cb in callbacks: stop = stop or cb.on_epoch_end(vals)\n if swa_model is not None:\n if (epoch + 1) >= swa_start and ((epoch + 1 - swa_start) % swa_eval_freq == 0 or epoch == tot_epochs - 1):\n fix_batchnorm(swa_model, cur_data.trn_dl)\n swa_vals = validate(swa_stepper, cur_data.val_dl, metrics, epoch, validate_skip = validate_skip)\n vals += swa_vals\n\n if epoch > 0: \n print_stats(epoch, [debias_loss] + vals, visualize, prev_val)\n else:\n print(layout.format(*names))\n print_stats(epoch, [debias_loss] + vals, visualize)\n prev_val = [debias_loss] + vals\n ep_vals = append_stats(ep_vals, epoch, [debias_loss] + vals)\n if stop: break\n for cb in callbacks: cb.on_train_end()\n if get_ep_vals: return vals, ep_vals\n else: return vals", "response": "Fits a model to a set of data."} {"instruction": "Given the following Python 3 function, write the documentation\ndef validate_next(stepper, metrics, val_iter):\n stepper.reset(False)\n with no_grad_context():\n (*x,y) = val_iter.next()\n preds,l = stepper.evaluate(VV(x), VV(y))\n res = [delistify(to_np(l))]\n res += [f(datafy(preds), datafy(y)) for f in metrics]\n stepper.reset(True)\n return res", "response": "Computes the loss on the next minibatch of the validation set."} {"instruction": "Write a Python 3 script for\ncreating link to documentation.", "response": "def link_type(arg_type, arg_name=None, 
include_bt:bool=True):\n \"Create link to documentation.\"\n arg_name = arg_name or fn_name(arg_type)\n if include_bt: arg_name = code_esc(arg_name)\n if belongs_to_module(arg_type, 'torch') and ('Tensor' not in arg_name): return f'[{arg_name}]({get_pytorch_link(arg_type)})'\n if is_fastai_class(arg_type): return f'[{arg_name}]({get_fn_link(arg_type)})'\n return arg_name"} {"instruction": "Write a Python 3 function that can\ncheck if t belongs to module_name.", "response": "def belongs_to_module(t, module_name):\n \"Check if `t` belongs to `module_name`.\"\n if hasattr(t, '__func__'): return belongs_to_module(t.__func__, module_name)\n if not inspect.getmodule(t): return False\n return inspect.getmodule(t).__name__.startswith(module_name)"} {"instruction": "Write a Python 3 script for\nformatting function param to param1 Type = val. Font weights param1 = bold val = italic", "response": "def format_param(p):\n \"Formats function param to `param1:Type=val`. Font weights: param1=bold, val=bold+italic\"\n arg_prefix = arg_prefixes.get(p.kind, '') # asterisk prefix for *args and **kwargs\n res = f\"**{arg_prefix}{code_esc(p.name)}**\"\n if hasattr(p, 'annotation') and p.annotation != p.empty: res += f':{anno_repr(p.annotation)}'\n if p.default != p.empty:\n default = getattr(p.default, 'func', p.default)\n default = getattr(default, '__name__', default)\n res += f'=***`{repr(default)}`***'\n return res"} {"instruction": "Implement a Python 3 function for\nformatting and link func definition to show in documentation", "response": "def format_ft_def(func, full_name:str=None)->str:\n \"Format and link `func` definition to show in documentation\"\n sig = inspect.signature(func)\n name = f'{full_name or func.__name__}'\n fmt_params = [format_param(param) for name,param\n in sig.parameters.items() if name not in ('self','cls')]\n arg_str = f\"({', '.join(fmt_params)})\"\n if sig.return_annotation and (sig.return_annotation != sig.empty): arg_str += f\" \u2192 
{anno_repr(sig.return_annotation)}\"\n if is_fastai_class(type(func)): arg_str += f\" :: {link_type(type(func))}\"\n f_name = f\"class {name}\" if inspect.isclass(func) else name\n return f'{f_name}',f'{name}{arg_str}'"} {"instruction": "How would you implement a function in Python 3 that\nshows documentation for element elt. Supported types class Callable and enum.", "response": "def show_doc(elt, doc_string:bool=True, full_name:str=None, arg_comments:dict=None, title_level=None, alt_doc_string:str='',\n ignore_warn:bool=False, markdown=True, show_tests=True):\n \"Show documentation for element `elt`. Supported types: class, Callable, and enum.\"\n arg_comments = ifnone(arg_comments, {})\n anchor_id = get_anchor(elt)\n elt = getattr(elt, '__func__', elt)\n full_name = full_name or fn_name(elt)\n if inspect.isclass(elt):\n if is_enum(elt.__class__): name,args = get_enum_doc(elt, full_name)\n else: name,args = get_cls_doc(elt, full_name)\n elif isinstance(elt, Callable): name,args = format_ft_def(elt, full_name)\n else: raise Exception(f'doc definition not supported for {full_name}')\n source_link = get_function_source(elt) if is_fastai_class(elt) else \"\"\n test_link, test_modal = get_pytest_html(elt, anchor_id=anchor_id) if show_tests else ('', '')\n title_level = ifnone(title_level, 2 if inspect.isclass(elt) else 4)\n doc = f'{name}{source_link}{test_link}'\n doc += f'\\n\\n> {args}\\n\\n'\n doc += f'{test_modal}'\n if doc_string and (inspect.getdoc(elt) or arg_comments):\n doc += format_docstring(elt, arg_comments, alt_doc_string, ignore_warn) + ' '\n if markdown: display(Markdown(doc))\n else: return doc"} {"instruction": "Can you write a function in Python 3 where it\nshows show_doc info in preview window along with link to full docs.", "response": "def doc(elt):\n \"Show `show_doc` info in preview window along with link to full docs.\"\n global use_relative_links\n use_relative_links = False\n elt = getattr(elt, '__func__', elt)\n md = show_doc(elt, 
markdown=False)\n if is_fastai_class(elt):\n md += f'\\n\\nShow in docs'\n output = HTMLExporter().markdown2html(md)\n use_relative_links = True\n if IS_IN_COLAB: get_ipython().run_cell_magic(u'html', u'', output)\n else:\n try: page.page({'text/html': output})\n except: display(Markdown(md))"} {"instruction": "How would you explain what the following Python 3 function does\ndef format_docstring(elt, arg_comments:dict={}, alt_doc_string:str='', ignore_warn:bool=False)->str:\n \"Merge and format the docstring definition with `arg_comments` and `alt_doc_string`.\"\n parsed = \"\"\n doc = parse_docstring(inspect.getdoc(elt))\n description = alt_doc_string or f\"{doc['short_description']} {doc['long_description']}\"\n if description: parsed += f'\\n\\n{link_docstring(inspect.getmodule(elt), description)}'\n\n resolved_comments = {**doc.get('comments', {}), **arg_comments} # arg_comments takes priority\n args = inspect.getfullargspec(elt).args if not is_enum(elt.__class__) else elt.__members__.keys()\n if resolved_comments: parsed += '\\n'\n for a in resolved_comments:\n parsed += f'\\n- *{a}*: {resolved_comments[a]}'\n if a not in args and not ignore_warn: warn(f'Doc arg mismatch: {a}')\n\n return_comment = arg_comments.get('return') or doc.get('return')\n if return_comment: parsed += f'\\n\\n*return*: {return_comment}'\n return parsed", "response": "Merge and format the docstring definition with arg_comments and alt_doc_string."} {"instruction": "Can you implement a function in Python 3 that\nsearches docstring for backticks and attempt to link those functions to respective documentation.", "response": "def link_docstring(modules, docstring:str, overwrite:bool=False)->str:\n \"Search `docstring` for backticks and attempt to link those functions to respective documentation.\"\n mods = listify(modules)\n for mod in mods: _modvars.update(mod.__dict__) # concat all module definitions\n return re.sub(BT_REGEX, replace_link, docstring)"} {"instruction": "Implement a Python 
3 function for\nattempting to resolve keywords such as Learner. lr_find. match_last starts matching from last component.", "response": "def find_elt(modvars, keyword, match_last=False):\n \"Attempt to resolve keywords such as Learner.lr_find. `match_last` starts matching from last component.\"\n keyword = strip_fastai(keyword)\n if keyword in modvars: return modvars[keyword]\n comps = keyword.split('.')\n comp_elt = modvars.get(comps[0])\n if hasattr(comp_elt, '__dict__'): return find_elt(comp_elt.__dict__, '.'.join(comps[1:]), match_last=match_last)"} {"instruction": "Implement a function in Python 3 to\nreturn module from mod_name.", "response": "def import_mod(mod_name:str, ignore_errors=False):\n \"Return module from `mod_name`.\"\n splits = str.split(mod_name, '.')\n try:\n if len(splits) > 1 : mod = importlib.import_module('.' + '.'.join(splits[1:]), splits[0])\n else: mod = importlib.import_module(mod_name)\n return mod\n except:\n if not ignore_errors: print(f\"Module {mod_name} doesn't exist.\")"} {"instruction": "Implement a Python 3 function for\nshowing documentation for ft_name see show_doc.", "response": "def show_doc_from_name(mod_name, ft_name:str, doc_string:bool=True, arg_comments:dict={}, alt_doc_string:str=''):\n \"Show documentation for `ft_name`, see `show_doc`.\"\n mod = import_mod(mod_name)\n splits = str.split(ft_name, '.')\n assert hasattr(mod, splits[0]), print(f\"Module {mod_name} doesn't have a function named {splits[0]}.\")\n elt = getattr(mod, splits[0])\n for i,split in enumerate(splits[1:]):\n assert hasattr(elt, split), print(f\"Class {'.'.join(splits[:i+1])} doesn't have a function named {split}.\")\n elt = getattr(elt, split)\n show_doc(elt, doc_string, ft_name, arg_comments, alt_doc_string)"} {"instruction": "Write a Python 3 function for\nreturning all the functions of module mod.", "response": "def get_ft_names(mod, include_inner=False)->List[str]:\n \"Return all the functions of module `mod`.\"\n # If the module has an 
attribute __all__, it picks those.\n # Otherwise, it returns all the functions defined inside a module.\n fn_names = []\n for elt_name in get_exports(mod):\n elt = getattr(mod,elt_name)\n #This removes the files imported from elsewhere\n try: fname = inspect.getfile(elt)\n except: continue\n if mod.__file__.endswith('__init__.py'):\n if inspect.ismodule(elt): fn_names.append(elt_name)\n else: continue\n else:\n if (fname != mod.__file__): continue\n if inspect.isclass(elt) or inspect.isfunction(elt): fn_names.append(elt_name)\n else: continue\n if include_inner and inspect.isclass(elt) and not is_enum(elt.__class__):\n fn_names.extend(get_inner_fts(elt))\n return fn_names"} {"instruction": "Given the following Python 3 function, write the documentation\ndef get_inner_fts(elt)->List[str]:\n \"List the inner functions of a class.\"\n fts = []\n for ft_name in elt.__dict__.keys():\n if ft_name.startswith('_'): continue\n ft = getattr(elt, ft_name)\n if inspect.isfunction(ft): fts.append(f'{elt.__name__}.{ft_name}')\n if inspect.ismethod(ft): fts.append(f'{elt.__name__}.{ft_name}')\n if inspect.isclass(ft): fts += [f'{elt.__name__}.{n}' for n in get_inner_fts(ft)]\n return fts", "response": "List the inner functions of a class."} {"instruction": "Write a Python 3 script for\ndisplaying table of contents for given mod_name.", "response": "def get_module_toc(mod_name):\n \"Display table of contents for given `mod_name`.\"\n mod = import_mod(mod_name)\n ft_names = mod.__all__ if hasattr(mod,'__all__') else get_ft_names(mod)\n ft_names.sort(key = str.lower)\n tabmat = ''\n for ft_name in ft_names:\n tabmat += f'- [{ft_name}](#{ft_name})\\n'\n elt = getattr(mod, ft_name)\n if inspect.isclass(elt) and not is_enum(elt.__class__):\n in_ft_names = get_inner_fts(elt)\n for name in in_ft_names:\n tabmat += f' - [{name}](#{name})\\n'\n display(Markdown(tabmat))"} {"instruction": "Write a Python 3 script for\nreturning function link to notebook documentation of ft. 
Private functions link to source code", "response": "def get_fn_link(ft)->str:\n \"Return function link to notebook documentation of `ft`. Private functions link to source code\"\n ft = getattr(ft, '__func__', ft)\n anchor = strip_fastai(get_anchor(ft))\n module_name = strip_fastai(get_module_name(ft))\n base = '' if use_relative_links else FASTAI_DOCS\n return f'{base}/{module_name}.html#{anchor}'"} {"instruction": "Can you implement a function in Python 3 that\nreturns link to pytorch docs of ft.", "response": "def get_pytorch_link(ft)->str:\n \"Returns link to pytorch docs of `ft`.\"\n name = ft.__name__\n ext = '.html'\n if name == 'device': return f'{PYTORCH_DOCS}tensor_attributes{ext}#torch-device'\n if name == 'Tensor': return f'{PYTORCH_DOCS}tensors{ext}#torch-tensor'\n if name.startswith('torchvision'):\n doc_path = get_module_name(ft).replace('.', '/')\n if inspect.ismodule(ft): name = name.replace('.', '-')\n return f'{PYTORCH_DOCS}{doc_path}{ext}#{name}'\n if name.startswith('torch.nn') and inspect.ismodule(ft): # nn.functional is special case\n nn_link = name.replace('.', '-')\n return f'{PYTORCH_DOCS}nn{ext}#{nn_link}'\n paths = get_module_name(ft).split('.')\n if len(paths) == 1: return f'{PYTORCH_DOCS}{paths[0]}{ext}#{paths[0]}.{name}'\n\n offset = 1 if paths[1] == 'utils' else 0 # utils is a pytorch special case\n doc_path = paths[1+offset]\n if inspect.ismodule(ft): return f'{PYTORCH_DOCS}{doc_path}{ext}#module-{name}'\n fnlink = '.'.join(paths[:(2+offset)]+[name])\n return f'{PYTORCH_DOCS}{doc_path}{ext}#{fnlink}'"} {"instruction": "How would you code a function in Python 3 to\nreturn github link for given file", "response": "def get_source_link(file, line, display_text=\"[source]\", **kwargs)->str:\n \"Returns github link for given file\"\n link = f\"{SOURCE_URL}{file}#L{line}\"\n if display_text is None: return link\n return f'{display_text}'"} {"instruction": "Write a Python 3 function that can\nreturn link to ft in source code.", "response": 
"def get_function_source(ft, **kwargs)->str:\n \"Returns link to `ft` in source code.\"\n try: line = inspect.getsourcelines(ft)[1]\n except Exception: return ''\n mod_path = get_module_name(ft).replace('.', '/') + '.py'\n return get_source_link(mod_path, line, **kwargs)"} {"instruction": "Write a Python 3 script for\nfinding comment markers in the nbval cell source. Yields an iterable of ( marker_type True ).", "response": "def find_comment_markers(cellsource):\n \"\"\"Look through the cell source for comments which affect nbval's behaviour\n\n Yield an iterable of ``(MARKER_TYPE, True)``.\n \"\"\"\n found = {}\n for line in cellsource.splitlines():\n line = line.strip()\n if line.startswith('#'):\n # print(\"Found comment in '{}'\".format(line))\n comment = line.lstrip('#').strip()\n if comment in comment_markers:\n # print(\"Found marker {}\".format(comment))\n marker = comment_markers[comment]\n if not isinstance(marker, tuple):\n # If not an explicit tuple ('option', True/False),\n # imply ('option', True)\n marker = (marker, True)\n marker_type = marker[0]\n if marker_type in found:\n warnings.warn(\n \"Conflicting comment markers found, using the latest: \"\n \" %s VS %s\" %\n (found[marker_type], comment))\n found[marker_type] = comment\n yield marker"} {"instruction": "Given the following Python 3 function, write the documentation\ndef coalesce_streams(outputs):\n if not outputs:\n return outputs\n\n new_outputs = []\n streams = {}\n for output in outputs:\n if (output.output_type == 'stream'):\n if output.name in streams:\n streams[output.name].text += output.text\n else:\n new_outputs.append(output)\n streams[output.name] = output\n else:\n new_outputs.append(output)\n\n # process \\r and \\b characters\n for output in streams.values():\n old = output.text\n while len(output.text) < len(old):\n old = output.text\n # Cancel out anything-but-newline followed by backspace\n output.text = backspace_pat.sub('', output.text)\n # Replace all carriage returns 
not followed by newline\n output.text = carriagereturn_pat.sub('', output.text)\n\n return new_outputs", "response": "Coalesce all stream outputs with shared names into single streams."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef transform_streams_for_comparison(outputs):\n new_outputs = []\n for output in outputs:\n if (output.output_type == 'stream'):\n # Transform output\n new_outputs.append({\n 'output_type': 'stream',\n output.name: output.text,\n })\n else:\n new_outputs.append(output)\n return new_outputs", "response": "Makes failure output for streams better by having key be the stream name"} {"instruction": "Create a Python 3 function to\ntrim and hash base64 strings", "response": "def _trim_base64(s):\n \"\"\"Trim and hash base64 strings\"\"\"\n if len(s) > 64 and _base64.match(s.replace('\\n', '')):\n h = hash_string(s)\n s = '%s...' % (s[:8], h[:16])\n return s"} {"instruction": "How would you explain what the following Python 3 function does\ndef _indent(s, indent=' '):\n if isinstance(s, six.string_types):\n return '\\n'.join(('%s%s' % (indent, line) for line in s.splitlines()))\n return s", "response": "Intent each line with indent"} {"instruction": "Can you implement a function in Python 3 that\ncalls by pytest to setup the collector cells in the log file.", "response": "def setup(self):\n \"\"\"\n Called by pytest to setup the collector cells in .\n Here we start a kernel and setup the sanitize patterns.\n \"\"\"\n\n if self.parent.config.option.current_env:\n kernel_name = CURRENT_ENV_KERNEL_NAME\n else:\n kernel_name = self.nb.metadata.get(\n 'kernelspec', {}).get('name', 'python')\n self.kernel = RunningKernel(kernel_name, str(self.fspath.dirname))\n self.setup_sanitize_files()\n if getattr(self.parent.config.option, 'cov_source', None):\n setup_coverage(self.parent.config, self.kernel, getattr(self, \"fspath\", None))"} {"instruction": "Make a summary of the following Python 3 code\ndef 
setup_sanitize_files(self):\n for fname in self.get_sanitize_files():\n with open(fname, 'r') as f:\n self.sanitize_patterns.update(get_sanitize_patterns(f.read()))", "response": "Load the contents of the file into the sanitise patterns dictionary."} {"instruction": "Write a Python 3 function for\nreturning list of all sanitize files provided by the user on the command line.", "response": "def get_sanitize_files(self):\n \"\"\"\n Return list of all sanitize files provided by the user on the command line.\n\n N.B.: We only support one sanitize file at the moment, but\n this is likely to change in the future\n\n \"\"\"\n if self.parent.config.option.sanitize_with is not None:\n return [self.parent.config.option.sanitize_with]\n else:\n return []"} {"instruction": "Create a Python 3 function for\ngetting a message from the iopub channel of the notebook kernel.", "response": "def get_kernel_message(self, timeout=None, stream='iopub'):\n \"\"\"\n Gets a message from the iopub channel of the notebook kernel.\n \"\"\"\n return self.kernel.get_message(stream, timeout=timeout)"} {"instruction": "Implement a Python 3 function for\nyielding all code cells in the notebook.", "response": "def collect(self):\n \"\"\"\n The collect function is required by pytest and is used to yield pytest\n Item objects. We specify an Item for each code cell in the notebook.\n \"\"\"\n\n self.nb = nbformat.read(str(self.fspath), as_version=4)\n\n # Start the cell count\n cell_num = 0\n\n # Iterate over the cells in the notebook\n for cell in self.nb.cells:\n # Skip the cells that have text, headings or related stuff\n # Only test code cells\n if cell.cell_type == 'code':\n # The cell may contain a comment indicating that its output\n # should be checked or ignored. If it doesn't, use the default\n # behaviour. 
The --nbval option checks unmarked cells.\n with warnings.catch_warnings(record=True) as ws:\n options = defaultdict(bool, find_metadata_tags(cell.metadata))\n comment_opts = dict(find_comment_markers(cell.source))\n if set(comment_opts.keys()) & set(options.keys()):\n warnings.warn(\n \"Overlapping options from comments and metadata, \"\n \"using options from comments: %s\" %\n str(set(comment_opts.keys()) & set(options.keys())))\n for w in ws:\n self.parent.config.warn(\n \"C1\",\n str(w.message),\n '%s:Cell %d' % (\n getattr(self, \"fspath\", None),\n cell_num))\n options.update(comment_opts)\n options.setdefault('check', self.compare_outputs)\n yield IPyNbCell('Cell ' + str(cell_num), self, cell_num,\n cell, options)\n\n # Update 'code' cell count\n cell_num += 1"} {"instruction": "Given the following Python 3 function, write the documentation\ndef repr_failure(self, excinfo):\n exc = excinfo.value\n cc = self.colors\n if isinstance(exc, NbCellError):\n msg_items = [\n cc.FAIL + \"Notebook cell execution failed\" + cc.ENDC]\n formatstring = (\n cc.OKBLUE + \"Cell %d: %s\\n\\n\" +\n \"Input:\\n\" + cc.ENDC + \"%s\\n\")\n msg_items.append(formatstring % (\n exc.cell_num,\n str(exc),\n exc.source\n ))\n if exc.inner_traceback:\n msg_items.append((\n cc.OKBLUE + \"Traceback:\" + cc.ENDC + \"\\n%s\\n\") %\n exc.inner_traceback)\n return \"\\n\".join(msg_items)\n else:\n return \"pytest plugin exception: %s\" % str(exc)", "response": "repr_failure - returns a string that can be used to display a failure message."} {"instruction": "Write a Python 3 script for\nformatting an output for printing", "response": "def format_output_compare(self, key, left, right):\n \"\"\"Format an output for printing\"\"\"\n if isinstance(left, six.string_types):\n left = _trim_base64(left)\n if isinstance(right, six.string_types):\n right = _trim_base64(right)\n\n cc = self.colors\n\n self.comparison_traceback.append(\n cc.OKBLUE\n + \" mismatch '%s'\" % key\n + cc.FAIL)\n\n # Use 
comparison repr from pytest:\n hook_result = self.ihook.pytest_assertrepr_compare(\n config=self.config, op='==', left=left, right=right)\n for new_expl in hook_result:\n if new_expl:\n new_expl = [' %s' % line.replace(\"\\n\", \"\\\\n\") for line in new_expl]\n self.comparison_traceback.append(\"\\n assert reference_output == test_output failed:\\n\")\n self.comparison_traceback.extend(new_expl)\n break\n else:\n # Fallback repr:\n self.comparison_traceback.append(\n \" <<<<<<<<<<<< Reference output from ipynb file:\"\n + cc.ENDC)\n self.comparison_traceback.append(_indent(left))\n self.comparison_traceback.append(\n cc.FAIL\n + ' ============ disagrees with newly computed (test) output:'\n + cc.ENDC)\n self.comparison_traceback.append(_indent(right))\n self.comparison_traceback.append(\n cc.FAIL\n + ' >>>>>>>>>>>>')\n self.comparison_traceback.append(cc.ENDC)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef sanitize(self, s):\n if not isinstance(s, six.string_types):\n return s\n\n \"\"\"\n re.sub matches a regex and replaces it with another.\n The regex replacements are taken from a file if the option\n is passed when py.test is called. 
Otherwise, the strings\n are not processed\n \"\"\"\n for regex, replace in six.iteritems(self.parent.sanitize_patterns):\n s = re.sub(regex, replace, s)\n return s", "response": "sanitize a string for comparison."} {"instruction": "Create a Python 3 function for\ncomputing the outputs for several augmented inputs for TTA", "response": "def _tta_only(learn:Learner, ds_type:DatasetType=DatasetType.Valid, scale:float=1.35) -> Iterator[List[Tensor]]:\n \"Computes the outputs for several augmented inputs for TTA\"\n dl = learn.dl(ds_type)\n ds = dl.dataset\n old = ds.tfms\n augm_tfm = [o for o in learn.data.train_ds.tfms if o.tfm not in\n (crop_pad, flip_lr, dihedral, zoom)]\n try:\n pbar = master_bar(range(8))\n for i in pbar:\n row = 1 if i&1 else 0\n col = 1 if i&2 else 0\n flip = i&4\n d = {'row_pct':row, 'col_pct':col, 'is_random':False}\n tfm = [*augm_tfm, zoom(scale=scale, **d), crop_pad(**d)]\n if flip: tfm.append(flip_lr(p=1.))\n ds.tfms = tfm\n yield get_preds(learn.model, dl, pbar=pbar, activ=_loss_func2activ(learn.loss_func))[0]\n finally: ds.tfms = old"} {"instruction": "How would you explain what the following Python 3 function does\ndef _TTA(learn:Learner, beta:float=0.4, scale:float=1.35, ds_type:DatasetType=DatasetType.Valid, with_loss:bool=False) -> Tensors:\n \"Applies TTA to predict on `ds_type` dataset.\"\n preds,y = learn.get_preds(ds_type)\n all_preds = list(learn.tta_only(scale=scale, ds_type=ds_type))\n avg_preds = torch.stack(all_preds).mean(0)\n if beta is None: return preds,avg_preds,y\n else: \n final_preds = preds*beta + avg_preds*(1-beta)\n if with_loss: \n with NoneReduceOnCPU(learn.loss_func) as lf: loss = lf(final_preds, y)\n return final_preds, y, loss\n return final_preds, y", "response": "Applies TTA to predict on ds_type dataset."} {"instruction": "How would you implement a function in Python 3 that\ncomputes the f_beta between preds and targets", "response": "def fbeta(y_pred:Tensor, y_true:Tensor, thresh:float=0.2, beta:float=2, 
eps:float=1e-9, sigmoid:bool=True)->Rank0Tensor:\n \"Computes the f_beta between `preds` and `targets`\"\n beta2 = beta ** 2\n if sigmoid: y_pred = y_pred.sigmoid()\n y_pred = (y_pred>thresh).float()\n y_true = y_true.float()\n TP = (y_pred*y_true).sum(dim=1)\n prec = TP/(y_pred.sum(dim=1)+eps)\n rec = TP/(y_true.sum(dim=1)+eps)\n res = (prec*rec)/(prec*beta2+rec+eps)*(1+beta2)\n return res.mean()"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef accuracy(input:Tensor, targs:Tensor)->Rank0Tensor:\n \"Compute accuracy with `targs` when `input` is bs * n_classes.\"\n n = targs.shape[0]\n input = input.argmax(dim=-1).view(n,-1)\n targs = targs.view(n,-1)\n return (input==targs).float().mean()", "response": "Compute accuracy with targs when input is bs * n_classes."} {"instruction": "Write a Python 3 function that can\ncompute accuracy when y_pred and y_true are the same size.", "response": "def accuracy_thresh(y_pred:Tensor, y_true:Tensor, thresh:float=0.5, sigmoid:bool=True)->Rank0Tensor:\n \"Compute accuracy when `y_pred` and `y_true` are the same size.\"\n if sigmoid: y_pred = y_pred.sigmoid()\n return ((y_pred>thresh)==y_true.byte()).float().mean()"} {"instruction": "How would you explain what the following Python 3 function does\ndef top_k_accuracy(input:Tensor, targs:Tensor, k:int=5)->Rank0Tensor:\n \"Computes the Top-k accuracy (target is in the top k predictions).\"\n input = input.topk(k=k, dim=-1)[1]\n targs = targs.unsqueeze(dim=-1).expand_as(input)\n return (input == targs).max(dim=-1)[0].float().mean()", "response": "Computes the Top - k accuracy ( target is in the top k predictions."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef dice(input:Tensor, targs:Tensor, iou:bool=False)->Rank0Tensor:\n \"Dice coefficient metric for binary target. 
If iou=True, returns iou metric, classic for segmentation problems.\"\n n = targs.shape[0]\n input = input.argmax(dim=1).view(n,-1)\n targs = targs.view(n,-1)\n intersect = (input * targs).sum().float()\n union = (input+targs).sum().float()\n if not iou: return (2. * intersect / union if union > 0 else union.new([1.]).squeeze())\n else: return intersect / (union-intersect+1.0)", "response": "Dice coefficient metric for binary target. If iou = True returns iou metric classic for segmentation problems."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef exp_rmspe(pred:Tensor, targ:Tensor)->Rank0Tensor:\n \"Exp RMSE between `pred` and `targ`.\"\n pred,targ = flatten_check(pred,targ)\n pred, targ = torch.exp(pred), torch.exp(targ)\n pct_var = (targ - pred)/targ\n return torch.sqrt((pct_var**2).mean())", "response": "Exp RMSE between pred and targ."} {"instruction": "Make a summary of the following Python 3 code\ndef mean_absolute_error(pred:Tensor, targ:Tensor)->Rank0Tensor:\n \"Mean absolute error between `pred` and `targ`.\"\n pred,targ = flatten_check(pred,targ)\n return torch.abs(targ - pred).mean()", "response": "Mean absolute error between pred and targ."} {"instruction": "Explain what the following Python 3 code does\ndef mean_squared_error(pred:Tensor, targ:Tensor)->Rank0Tensor:\n \"Mean squared error between `pred` and `targ`.\"\n pred,targ = flatten_check(pred,targ)\n return F.mse_loss(pred, targ)", "response": "Mean squared error between pred and targ."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef root_mean_squared_error(pred:Tensor, targ:Tensor)->Rank0Tensor:\n \"Root mean squared error between `pred` and `targ`.\"\n pred,targ = flatten_check(pred,targ)\n return torch.sqrt(F.mse_loss(pred, targ))", "response": "Root mean squared error between pred and targ."} {"instruction": "Can you write a function in Python 3 where it\nmeans squared logarithmic error between pred and targ.", 
"response": "def mean_squared_logarithmic_error(pred:Tensor, targ:Tensor)->Rank0Tensor:\n \"Mean squared logarithmic error between `pred` and `targ`.\"\n pred,targ = flatten_check(pred,targ)\n return F.mse_loss(torch.log(1 + pred), torch.log(1 + targ))"} {"instruction": "Make a summary of the following Python 3 code\ndef explained_variance(pred:Tensor, targ:Tensor)->Rank0Tensor:\n \"Explained variance between `pred` and `targ`.\"\n pred,targ = flatten_check(pred,targ)\n var_pct = torch.var(targ - pred) / torch.var(targ)\n return 1 - var_pct", "response": "Explained variance between pred and targ."} {"instruction": "How would you code a function in Python 3 to\nuse trapezoid method to calculate the area under roc curve", "response": "def auc_roc_score(input:Tensor, targ:Tensor):\n \"Using trapezoid method to calculate the area under roc curve\"\n fpr, tpr = roc_curve(input, targ)\n d = fpr[1:] - fpr[:-1]\n sl1, sl2 = [slice(None)], [slice(None)]\n sl1[-1], sl2[-1] = slice(1, None), slice(None, -1)\n return (d * (tpr[tuple(sl1)] + tpr[tuple(sl2)]) / 2.).sum(-1)"} {"instruction": "Make a summary of the following Python 3 code\ndef roc_curve(input:Tensor, targ:Tensor):\n \"Returns the false positive and true positive rates\"\n targ = (targ == 1)\n desc_score_indices = torch.flip(input.argsort(-1), [-1])\n input = input[desc_score_indices]\n targ = targ[desc_score_indices]\n d = input[1:] - input[:-1]\n distinct_value_indices = torch.nonzero(d).transpose(0,1)[0]\n threshold_idxs = torch.cat((distinct_value_indices, LongTensor([len(targ) - 1]).to(targ.device)))\n tps = torch.cumsum(targ * 1, dim=-1)[threshold_idxs]\n fps = (1 + threshold_idxs - tps)\n if tps[0] != 0 or fps[0] != 0:\n fps = torch.cat((LongTensor([0]), fps))\n tps = torch.cat((LongTensor([0]), tps))\n fpr, tpr = fps.float() / fps[-1], tps.float() / tps[-1]\n return fpr, tpr", "response": "Returns the false positive and true positive rates"} {"instruction": "Given the following Python 3 function, write the 
documentation\ndef A(*a):\n return np.array(a[0]) if len(a)==1 else [np.array(o) for o in a]", "response": "convert iterable object into numpy array"} {"instruction": "How would you code a function in Python 3 to\nconvert numpy array into a pytorch tensor.", "response": "def T(a, half=False, cuda=True):\n \"\"\"\n Convert numpy array into a pytorch tensor. \n if Cuda is available and USE_GPU=True, store resulting tensor in GPU.\n \"\"\"\n if not torch.is_tensor(a):\n a = np.array(np.ascontiguousarray(a))\n if a.dtype in (np.int8, np.int16, np.int32, np.int64):\n a = torch.LongTensor(a.astype(np.int64))\n elif a.dtype in (np.float32, np.float64):\n a = to_half(a) if half else torch.FloatTensor(a)\n else: raise NotImplementedError(a.dtype)\n if cuda: a = to_gpu(a)\n return a"} {"instruction": "How would you code a function in Python 3 to\ncreate a single or a list of pytorch tensors depending on input x", "response": "def V(x, requires_grad=False, volatile=False):\n '''creates a single or a list of pytorch tensors, depending on input x. '''\n return map_over(x, lambda o: V_(o, requires_grad, volatile))"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef to_np(v):\n '''returns an np.array object given an input of np.array, list, tuple, torch variable or tensor.'''\n if isinstance(v, float): return np.array(v)\n if isinstance(v, (np.ndarray, np.generic)): return v\n if isinstance(v, (list,tuple)): return [to_np(o) for o in v]\n if isinstance(v, Variable): v=v.data\n if torch.cuda.is_available():\n if is_half_tensor(v): v=v.float()\n if isinstance(v, torch.FloatTensor): v=v.float()\n return v.cpu().numpy()", "response": "returns an np. array object given an input of np. array list tuple torch variable or tensor."} {"instruction": "Explain what the following Python 3 code does\ndef to_gpu(x, *args, **kwargs):\n '''puts pytorch variable to gpu, if cuda is available and USE_GPU is set to true. 
'''\n return x.cuda(*args, **kwargs) if USE_GPU else x", "response": "puts pytorch variable to gpu if cuda is available."} {"instruction": "Make a summary of the following Python 3 code\ndef split_by_idxs(seq, idxs):\n '''A generator that returns sequence pieces, seperated by indexes specified in idxs. '''\n last = 0\n for idx in idxs:\n if not (-len(seq) <= idx < len(seq)):\n raise KeyError(f'Idx {idx} is out-of-bounds')\n yield seq[last:idx]\n last = idx\n yield seq[last:]", "response": "A generator that returns sequence pieces seperated by indexes specified in idxs."} {"instruction": "Write a Python 3 function for\nsplitting iterables a in equal parts of size sz", "response": "def partition(a, sz): \n \"\"\"splits iterables a in equal parts of size sz\"\"\"\n return [a[i:i+sz] for i in range(0, len(a), sz)]"} {"instruction": "Explain what the following Python 3 code does\ndef chunk_iter(iterable, chunk_size):\n '''A generator that yields chunks of iterable, chunk_size at a time. '''\n while True:\n chunk = []\n try:\n for _ in range(chunk_size): chunk.append(next(iterable))\n yield chunk\n except StopIteration:\n if chunk: yield chunk\n break", "response": "A generator that yields chunks of iterable chunk_size at a time."} {"instruction": "Create a Python 3 function for\napplying change in brightness of image x.", "response": "def _brightness(x, change:uniform):\n \"Apply `change` in brightness of image `x`.\"\n return x.add_(scipy.special.logit(change))"} {"instruction": "Explain what the following Python 3 code does\ndef _rotate(degrees:uniform):\n \"Rotate image by `degrees`.\"\n angle = degrees * math.pi / 180\n return [[cos(angle), -sin(angle), 0.],\n [sin(angle), cos(angle), 0.],\n [0. , 0. 
, 1.]]", "response": "Rotate image by degrees."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _get_zoom_mat(sw:float, sh:float, c:float, r:float)->AffineMatrix:\n \"`sw`,`sh` scale width,height - `c`,`r` focus col,row.\"\n return [[sw, 0, c],\n [0, sh, r],\n [0, 0, 1.]]", "response": "`sw`,`sh` scale width,height - `c`,`r` focus col,row."} {"instruction": "Can you implement a function in Python 3 that\nzooms image by scale. row_pct col_pct select focal point of zoom.", "response": "def _zoom(scale:uniform=1.0, row_pct:uniform=0.5, col_pct:uniform=0.5):\n \"Zoom image by `scale`. `row_pct`,`col_pct` select focal point of zoom.\"\n s = 1-1/scale\n col_c = s * (2*col_pct - 1)\n row_c = s * (2*row_pct - 1)\n return _get_zoom_mat(1/scale, 1/scale, col_c, row_c)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _jitter(c, magnitude:uniform):\n \"Replace pixels by random neighbors at `magnitude`.\"\n c.flow.add_((torch.rand_like(c.flow)-0.5)*magnitude*2)\n return c", "response": "Replace pixels by random neighbors at magnitude."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _pad_default(x, padding:int, mode='reflection'):\n \"Pad `x` with `padding` pixels. `mode` fills in space ('zeros','reflection','border').\"\n mode = _pad_mode_convert[mode]\n return F.pad(x[None], (padding,)*4, mode=mode)[0]", "response": "Pad x with padding pixels. 
mode fills in space ('zeros','reflection','border')."} {"instruction": "Make a summary of the following Python 3 code\ndef _cutout(x, n_holes:uniform_int=1, length:uniform_int=40):\n \"Cut out `n_holes` number of square holes of size `length` in image at random locations.\"\n h,w = x.shape[1:]\n for n in range(n_holes):\n h_y = np.random.randint(0, h)\n h_x = np.random.randint(0, w)\n y1 = int(np.clip(h_y - length / 2, 0, h))\n y2 = int(np.clip(h_y + length / 2, 0, h))\n x1 = int(np.clip(h_x - length / 2, 0, w))\n x2 = int(np.clip(h_x + length / 2, 0, w))\n x[:, y1:y2, x1:x2] = 0\n return x", "response": "Cut out n_holes number of square holes of size length in image at random locations."} {"instruction": "Can you implement a function in Python 3 that\nrandomizes one of the channels of the input image", "response": "def _rgb_randomize(x, channel:int=None, thresh:float=0.3):\n \"Randomize one of the channels of the input image\"\n if channel is None: channel = np.random.randint(0, x.shape[0] - 1)\n x[channel] = torch.rand(x.shape[1:]) * np.random.uniform(0, thresh)\n return x"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _crop_default(x, size, row_pct:uniform=0.5, col_pct:uniform=0.5):\n \"Crop `x` to `size` pixels. `row_pct`,`col_pct` select focal point of crop.\"\n rows,cols = tis2hw(size)\n row_pct,col_pct = _minus_epsilon(row_pct,col_pct)\n row = int((x.size(1)-rows+1) * row_pct)\n col = int((x.size(2)-cols+1) * col_pct)\n return x[:, row:row+rows, col:col+cols].contiguous()", "response": "Crop x to size pixels. 
row_pct col_pct select focal point of crop."} {"instruction": "How would you implement a function in Python 3 that\ncrops and pad tfm - row_pct col_pct sets focal point.", "response": "def _crop_pad_default(x, size, padding_mode='reflection', row_pct:uniform = 0.5, col_pct:uniform = 0.5):\n \"Crop and pad tfm - `row_pct`,`col_pct` sets focal point.\"\n padding_mode = _pad_mode_convert[padding_mode]\n size = tis2hw(size)\n if x.shape[1:] == torch.Size(size): return x\n rows,cols = size\n row_pct,col_pct = _minus_epsilon(row_pct,col_pct)\n if x.size(1)Tensor:\n \"Find 8 coeff mentioned [here](https://web.archive.org/web/20150222120106/xenia.media.mit.edu/~cwren/interpolator/).\"\n matrix = []\n #The equations we'll need to solve.\n for p1, p2 in zip(targ_pts, orig_pts):\n matrix.append([p1[0], p1[1], 1, 0, 0, 0, -p2[0]*p1[0], -p2[0]*p1[1]])\n matrix.append([0, 0, 0, p1[0], p1[1], 1, -p2[1]*p1[0], -p2[1]*p1[1]])\n\n A = FloatTensor(matrix)\n B = FloatTensor(orig_pts).view(8, 1)\n #The 8 scalars we seek are solution of AX = B\n return _solve_func(B,A)[0][:,0]"} {"instruction": "Create a Python 3 function for\ntransforming coords with coeffs.", "response": "def _apply_perspective(coords:FlowField, coeffs:Points)->FlowField:\n \"Transform `coords` with `coeffs`.\"\n size = coords.flow.size()\n #compress all the dims expect the last one ang adds ones, coords become N * 3\n coords.flow = coords.flow.view(-1,2)\n #Transform the coeffs in a 3*3 matrix with a 1 at the bottom left\n coeffs = torch.cat([coeffs, FloatTensor([1])]).view(3,3)\n coords.flow = torch.addmm(coeffs[:,2], coords.flow, coeffs[:,:2].t())\n coords.flow.mul_(1/coords.flow[:,2].unsqueeze(1))\n coords.flow = coords.flow[:,:2].view(size)\n return coords"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _do_perspective_warp(c:FlowField, targ_pts:Points, invert=False):\n \"Apply warp to `targ_pts` from `_orig_pts` to `c` `FlowField`.\"\n if invert: return 
_apply_perspective(c, _find_coeffs(targ_pts, _orig_pts))\n return _apply_perspective(c, _find_coeffs(_orig_pts, targ_pts))", "response": "Apply warp to targ_pts from _orig_pts to c FlowField."} {"instruction": "How would you explain what the following Python 3 function does\ndef _perspective_warp(c, magnitude:partial(uniform,size=8)=0, invert=False):\n \"Apply warp of `magnitude` to `c`.\"\n magnitude = magnitude.view(4,2)\n targ_pts = [[x+m for x,m in zip(xs, ms)] for xs, ms in zip(_orig_pts, magnitude)]\n return _do_perspective_warp(c, targ_pts, invert)", "response": "Apply warp of magnitude to c."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef _symmetric_warp(c, magnitude:partial(uniform,size=4)=0, invert=False):\n \"Apply symmetric warp of `magnitude` to `c`.\"\n m = listify(magnitude, 4)\n targ_pts = [[-1-m[3],-1-m[1]], [-1-m[2],1+m[1]], [1+m[3],-1-m[0]], [1+m[2],1+m[0]]]\n return _do_perspective_warp(c, targ_pts, invert)", "response": "Apply symmetric warp of magnitude to c."} {"instruction": "Given the following Python 3 function, write the documentation\ndef _tilt(c, direction:uniform_int, magnitude:uniform=0, invert=False):\n \"Tilt `c` field with random `direction` and `magnitude`.\"\n orig_pts = [[-1,-1], [-1,1], [1,-1], [1,1]]\n if direction == 0: targ_pts = [[-1,-1], [-1,1], [1,-1-magnitude], [1,1+magnitude]]\n elif direction == 1: targ_pts = [[-1,-1-magnitude], [-1,1+magnitude], [1,-1], [1,1]]\n elif direction == 2: targ_pts = [[-1,-1], [-1-magnitude,1], [1,-1], [1+magnitude,1]]\n elif direction == 3: targ_pts = [[-1-magnitude,-1], [-1,1], [1+magnitude,-1], [1,1]]\n coeffs = _find_coeffs(targ_pts, _orig_pts) if invert else _find_coeffs(_orig_pts, targ_pts)\n return _apply_perspective(c, coeffs)", "response": "Tilt c field with random direction and magnitude."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef get_transforms(do_flip:bool=True, flip_vert:bool=False, 
max_rotate:float=10., max_zoom:float=1.1,\n max_lighting:float=0.2, max_warp:float=0.2, p_affine:float=0.75,\n p_lighting:float=0.75, xtra_tfms:Optional[Collection[Transform]]=None)->Collection[Transform]:\n \"Utility func to easily create a list of flip, rotate, `zoom`, warp, lighting transforms.\"\n res = [rand_crop()]\n if do_flip: res.append(dihedral_affine() if flip_vert else flip_lr(p=0.5))\n if max_warp: res.append(symmetric_warp(magnitude=(-max_warp,max_warp), p=p_affine))\n if max_rotate: res.append(rotate(degrees=(-max_rotate,max_rotate), p=p_affine))\n if max_zoom>1: res.append(rand_zoom(scale=(1.,max_zoom), p=p_affine))\n if max_lighting:\n res.append(brightness(change=(0.5*(1-max_lighting), 0.5*(1+max_lighting)), p=p_lighting))\n res.append(contrast(scale=(1-max_lighting, 1/(1-max_lighting)), p=p_lighting))\n # train , valid\n return (res + listify(xtra_tfms), [crop_pad()])", "response": "Utility func to easily create a list of flip rotate zoom warp lighting transforms."} {"instruction": "How would you explain what the following Python 3 function does\ndef rand_resize_crop(size:int, max_scale:float=2., ratios:Tuple[float,float]=(0.75,1.33)):\n \"Randomly resize and crop the image to a ratio in `ratios` after a zoom of `max_scale`.\"\n return [zoom_squish(scale=(1.,max_scale,8), squish=(*ratios,8), invert=(0.5,8), row_pct=(0.,1.), col_pct=(0.,1.)),\n crop(size=size)]", "response": "Randomly resize and crop the image to a ratio in ratios after a zoom of max_scale."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef adjust_learning_rate(optimizer, epoch, gammas, schedule):\n lr = args.learning_rate\n assert len(gammas) == len(schedule), \"length of gammas and schedule should be equal\"\n for (gamma, step) in zip(gammas, schedule):\n if (epoch >= step):\n lr = lr * gamma\n else:\n break\n for param_group in optimizer.param_groups:\n param_group['lr'] = lr\n return lr", "response": "Adjusts the learning rate to the 
initial LR decayed by 10 every 30 epochs"} {"instruction": "Implement a function in Python 3 to\nfit the model to the model data and layer_opt and returns the total number of epochs to fit the model.", "response": "def fit_gen(self, model, data, layer_opt, n_cycle, cycle_len=None, cycle_mult=1, cycle_save_name=None, best_save_name=None,\n use_clr=None, use_clr_beta=None, metrics=None, callbacks=None, use_wd_sched=False, norm_wds=False, \n wds_sched_mult=None, use_swa=False, swa_start=1, swa_eval_freq=5, **kwargs):\n\n \"\"\"Method does some preparation before finally delegating to the 'fit' method for\n fitting the model. Namely, if cycle_len is defined, it adds a 'Cosine Annealing'\n scheduler for varying the learning rate across iterations.\n\n Method also computes the total number of epochs to fit based on provided 'cycle_len',\n 'cycle_mult', and 'n_cycle' parameters.\n\n Args:\n model (Learner): Any neural architecture for solving a supported problem.\n Eg. ResNet-34, RNN_Learner etc.\n\n data (ModelData): An instance of ModelData.\n\n layer_opt (LayerOptimizer): An instance of the LayerOptimizer class\n\n n_cycle (int): number of cycles\n\n cycle_len (int): number of epochs before lr is reset to the initial value.\n E.g if cycle_len = 3, then the lr is varied between a maximum\n and minimum value over 3 epochs.\n\n cycle_mult (int): additional parameter for influencing how the lr resets over\n the cycles. For an intuitive explanation, please see\n https://github.com/fastai/fastai/blob/master/courses/dl1/lesson1.ipynb\n\n cycle_save_name (str): use to save the weights at end of each cycle (requires\n use_clr, use_clr_beta or cycle_len arg)\n\n best_save_name (str): use to save weights of best model during training.\n\n metrics (function): some function for evaluating a desired metric. Eg. 
accuracy.\n\n callbacks (list(Callback)): callbacks to apply during the training.\n\n use_wd_sched (bool, optional): set to True to enable weight regularization using\n the technique mentioned in https://arxiv.org/abs/1711.05101. When this is True\n alone (see below), the regularization is detached from gradient update and\n applied directly to the weights.\n\n norm_wds (bool, optional): when this is set to True along with use_wd_sched, the\n regularization factor is normalized with each training cycle.\n\n wds_sched_mult (function, optional): when this is provided along with use_wd_sched\n as True, the value computed by this function is multiplied with the regularization\n strength. This function is passed the WeightDecaySchedule object. And example\n function that can be passed is:\n f = lambda x: np.array(x.layer_opt.lrs) / x.init_lrs\n \n use_swa (bool, optional): when this is set to True, it will enable the use of\n Stochastic Weight Averaging (https://arxiv.org/abs/1803.05407). The learner will\n include an additional model (in the swa_model attribute) for keeping track of the \n average weights as described in the paper. All testing of this technique so far has\n been in image classification, so use in other contexts is not guaranteed to work.\n \n swa_start (int, optional): if use_swa is set to True, then this determines the epoch\n to start keeping track of the average weights. It is 1-indexed per the paper's\n conventions.\n \n swa_eval_freq (int, optional): if use_swa is set to True, this determines the frequency\n at which to evaluate the performance of the swa_model. 
This evaluation can be costly\n for models using BatchNorm (requiring a full pass through the data), which is why the\n default is not to evaluate after each epoch.\n\n Returns:\n None\n \"\"\"\n\n if cycle_save_name:\n assert use_clr or use_clr_beta or cycle_len, \"cycle_save_name argument requires either of the following arguments use_clr, use_clr_beta, cycle_len\"\n\n if callbacks is None: callbacks=[]\n if metrics is None: metrics=self.metrics\n\n if use_wd_sched:\n # This needs to come before CosAnneal() because we need to read the initial learning rate from\n # layer_opt.lrs - but CosAnneal() alters the layer_opt.lrs value initially (divides by 100)\n if np.sum(layer_opt.wds) == 0:\n print('fit() warning: use_wd_sched is set to True, but weight decay(s) passed are 0. Use wds to '\n 'pass weight decay values.')\n batch_per_epoch = len(data.trn_dl)\n cl = cycle_len if cycle_len else 1\n self.wd_sched = WeightDecaySchedule(layer_opt, batch_per_epoch, cl, cycle_mult, n_cycle,\n norm_wds, wds_sched_mult)\n callbacks += [self.wd_sched]\n\n if use_clr is not None:\n clr_div,cut_div = use_clr[:2]\n moms = use_clr[2:] if len(use_clr) > 2 else None\n cycle_end = self.get_cycle_end(cycle_save_name)\n assert cycle_len, \"use_clr requires cycle_len arg\"\n self.sched = CircularLR(layer_opt, len(data.trn_dl)*cycle_len, on_cycle_end=cycle_end, div=clr_div, cut_div=cut_div,\n momentums=moms)\n elif use_clr_beta is not None:\n div,pct = use_clr_beta[:2]\n moms = use_clr_beta[2:] if len(use_clr_beta) > 3 else None\n cycle_end = self.get_cycle_end(cycle_save_name)\n assert cycle_len, \"use_clr_beta requires cycle_len arg\"\n self.sched = CircularLR_beta(layer_opt, len(data.trn_dl)*cycle_len, on_cycle_end=cycle_end, div=div,\n pct=pct, momentums=moms)\n elif cycle_len:\n cycle_end = self.get_cycle_end(cycle_save_name)\n cycle_batches = len(data.trn_dl)*cycle_len\n self.sched = CosAnneal(layer_opt, cycle_batches, on_cycle_end=cycle_end, cycle_mult=cycle_mult)\n elif not 
self.sched: self.sched=LossRecorder(layer_opt)\n callbacks+=[self.sched]\n\n if best_save_name is not None:\n callbacks+=[SaveBestModel(self, layer_opt, metrics, best_save_name)]\n\n if use_swa:\n # make a copy of the model to track average weights\n self.swa_model = copy.deepcopy(model)\n callbacks+=[SWA(model, self.swa_model, swa_start)]\n\n n_epoch = int(sum_geom(cycle_len if cycle_len else 1, cycle_mult, n_cycle))\n return fit(model, data, n_epoch, layer_opt.opt, self.crit,\n metrics=metrics, callbacks=callbacks, reg_fn=self.reg_fn, clip=self.clip, fp16=self.fp16,\n swa_model=self.swa_model if use_swa else None, swa_start=swa_start, \n swa_eval_freq=swa_eval_freq, **kwargs)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef lr_find(self, start_lr=1e-5, end_lr=10, wds=None, linear=False, **kwargs):\n self.save('tmp')\n layer_opt = self.get_layer_opt(start_lr, wds)\n self.sched = LR_Finder(layer_opt, len(self.data.trn_dl), end_lr, linear=linear)\n self.fit_gen(self.model, self.data, layer_opt, 1, **kwargs)\n self.load('tmp')", "response": "This function finds an optimal learning rate for a model."} {"instruction": "Explain what the following Python 3 code does\ndef lr_find2(self, start_lr=1e-5, end_lr=10, num_it = 100, wds=None, linear=False, stop_dv=True, **kwargs):\n self.save('tmp')\n layer_opt = self.get_layer_opt(start_lr, wds)\n self.sched = LR_Finder2(layer_opt, num_it, end_lr, linear=linear, metrics=self.metrics, stop_dv=stop_dv)\n self.fit_gen(self.model, self.data, layer_opt, num_it//len(self.data.trn_dl) + 1, all_val=True, **kwargs)\n self.load('tmp')", "response": "This variant of lr_find2 computes the best learning rate for a learner. It does not do anything but it does it. 
It does not do anything but it does it do it."} {"instruction": "Implement a function in Python 3 to\npredict the value of the object in the n - term system for the given array of values.", "response": "def predict_array(self, arr):\n \"\"\"\n Args:\n arr: a numpy array to be used as input to the model for prediction purposes\n Returns:\n a numpy array containing the predictions from the model\n \"\"\"\n if not isinstance(arr, np.ndarray): raise OSError(f'Not valid numpy array')\n self.model.eval()\n return to_np(self.model(to_gpu(V(T(arr)))))"} {"instruction": "Explain what the following Python 3 code does\ndef TTA(self, n_aug=4, is_test=False):\n dl1 = self.data.test_dl if is_test else self.data.val_dl\n dl2 = self.data.test_aug_dl if is_test else self.data.aug_dl\n preds1,targs = predict_with_targs(self.model, dl1)\n preds1 = [preds1]*math.ceil(n_aug/4)\n preds2 = [predict_with_targs(self.model, dl2)[0] for i in tqdm(range(n_aug), leave=False)]\n return np.stack(preds1+preds2), targs", "response": "Predicts with Test Time Augmentation ( TTA )."} {"instruction": "Make a summary of the following Python 3 code\ndef fit_opt_sched(self, phases, cycle_save_name=None, best_save_name=None, stop_div=False, data_list=None, callbacks=None, \n cut = None, use_swa=False, swa_start=1, swa_eval_freq=5, **kwargs):\n \"\"\"Wraps us the content of phases to send them to model.fit(..)\n\n This will split the training in several parts, each with their own learning rates/\n wds/momentums/optimizer detailed in phases.\n\n Additionaly we can add a list of different data objets in data_list to train\n on different datasets (to change the size for instance) for each of these groups.\n\n Args:\n phases: a list of TrainingPhase objects\n stop_div: when True, stops the training if the loss goes too high\n data_list: a list of different Data objects.\n kwargs: other arguments\n use_swa (bool, optional): when this is set to True, it will enable the use of\n Stochastic Weight Averaging 
(https://arxiv.org/abs/1803.05407). The learner will\n include an additional model (in the swa_model attribute) for keeping track of the \n average weights as described in the paper. All testing of this technique so far has\n been in image classification, so use in other contexts is not guaranteed to work. \n swa_start (int, optional): if use_swa is set to True, then this determines the epoch\n to start keeping track of the average weights. It is 1-indexed per the paper's\n conventions.\n swa_eval_freq (int, optional): if use_swa is set to True, this determines the frequency\n at which to evaluate the performance of the swa_model. This evaluation can be costly\n for models using BatchNorm (requiring a full pass through the data), which is why the\n default is not to evaluate after each epoch.\n Returns:\n None\n \"\"\"\n if data_list is None: data_list=[]\n if callbacks is None: callbacks=[]\n layer_opt = LayerOptimizer(phases[0].opt_fn, self.get_layer_groups(), 1e-2, phases[0].wds)\n if len(data_list) == 0: nb_batches = [len(self.data.trn_dl)] * len(phases)\n else: nb_batches = [len(data.trn_dl) for data in data_list] \n self.sched = OptimScheduler(layer_opt, phases, nb_batches, stop_div)\n callbacks.append(self.sched)\n metrics = self.metrics\n if best_save_name is not None:\n callbacks+=[SaveBestModel(self, layer_opt, metrics, best_save_name)]\n if use_swa:\n # make a copy of the model to track average weights\n self.swa_model = copy.deepcopy(self.model)\n callbacks+=[SWA(self.model, self.swa_model, swa_start)]\n n_epochs = [phase.epochs for phase in phases] if cut is None else cut\n if len(data_list)==0: data_list = [self.data]\n return fit(self.model, data_list, n_epochs,layer_opt, self.crit,\n metrics=metrics, callbacks=callbacks, reg_fn=self.reg_fn, clip=self.clip, fp16=self.fp16,\n swa_model=self.swa_model if use_swa else None, swa_start=swa_start, \n swa_eval_freq=swa_eval_freq, **kwargs)", "response": "Wraps us the content of phases to send them to the 
model."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef on_loss_begin(self, last_output:Tuple[Tensor,Tensor,Tensor], **kwargs):\n \"Save the extra outputs for later and only returns the true output.\"\n self.raw_out,self.out = last_output[1],last_output[2]\n return {'last_output': last_output[0]}", "response": "Save the extra outputs for later and only returns the true output."} {"instruction": "Explain what the following Python 3 code does\ndef on_backward_begin(self, last_loss:Rank0Tensor, last_input:Tensor, **kwargs):\n \"Apply AR and TAR to `last_loss`.\"\n #AR and TAR\n if self.alpha != 0.: last_loss += self.alpha * self.out[-1].float().pow(2).mean()\n if self.beta != 0.:\n h = self.raw_out[-1]\n if len(h)>1: last_loss += self.beta * (h[:,1:] - h[:,:-1]).float().pow(2).mean()\n return {'last_loss': last_loss}", "response": "Apply AR and TAR to last_loss."} {"instruction": "Create a Python 3 function to\nconvert the model wgts to go with a new vocabulary.", "response": "def convert_weights(wgts:Weights, stoi_wgts:Dict[str,int], itos_new:Collection[str]) -> Weights:\n \"Convert the model `wgts` to go with a new vocabulary.\"\n dec_bias, enc_wgts = wgts.get('1.decoder.bias', None), wgts['0.encoder.weight']\n wgts_m = enc_wgts.mean(0)\n if dec_bias is not None: bias_m = dec_bias.mean(0)\n new_w = enc_wgts.new_zeros((len(itos_new),enc_wgts.size(1))).zero_()\n if dec_bias is not None: new_b = dec_bias.new_zeros((len(itos_new),)).zero_()\n for i,w in enumerate(itos_new):\n r = stoi_wgts[w] if w in stoi_wgts else -1\n new_w[i] = enc_wgts[r] if r>=0 else wgts_m\n if dec_bias is not None: new_b[i] = dec_bias[r] if r>=0 else bias_m\n wgts['0.encoder.weight'] = new_w\n if '0.encoder_dp.emb.weight' in wgts: wgts['0.encoder_dp.emb.weight'] = new_w.clone()\n wgts['1.decoder.weight'] = new_w.clone()\n if dec_bias is not None: wgts['1.decoder.bias'] = new_b\n return wgts"} {"instruction": "Create a Python 3 function to\ncreate a 
language model from arch and its config maybe pretrained.", "response": "def get_language_model(arch:Callable, vocab_sz:int, config:dict=None, drop_mult:float=1.):\n \"Create a language model from `arch` and its `config`, maybe `pretrained`.\"\n meta = _model_meta[arch]\n config = ifnone(config, meta['config_lm'].copy())\n for k in config.keys(): \n if k.endswith('_p'): config[k] *= drop_mult\n tie_weights,output_p,out_bias = map(config.pop, ['tie_weights', 'output_p', 'out_bias'])\n init = config.pop('init') if 'init' in config else None\n encoder = arch(vocab_sz, **config)\n enc = encoder.encoder if tie_weights else None\n decoder = LinearDecoder(vocab_sz, config[meta['hid_name']], output_p, tie_encoder=enc, bias=out_bias)\n model = SequentialRNN(encoder, decoder)\n return model if init is None else model.apply(init)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef language_model_learner(data:DataBunch, arch, config:dict=None, drop_mult:float=1., pretrained:bool=True,\n pretrained_fnames:OptStrTuple=None, **learn_kwargs) -> 'LanguageLearner':\n \"Create a `Learner` with a language model from `data` and `arch`.\"\n model = get_language_model(arch, len(data.vocab.itos), config=config, drop_mult=drop_mult)\n meta = _model_meta[arch]\n learn = LanguageLearner(data, model, split_func=meta['split_lm'], **learn_kwargs)\n if pretrained:\n if 'url' not in meta: \n warn(\"There are no pretrained weights for that architecture yet!\")\n return learn\n model_path = untar_data(meta['url'], data=False)\n fnames = [list(model_path.glob(f'*.{ext}'))[0] for ext in ['pth', 'pkl']]\n learn.load_pretrained(*fnames)\n learn.freeze()\n if pretrained_fnames is not None:\n fnames = [learn.path/learn.model_dir/f'{fn}.{ext}' for fn,ext in zip(pretrained_fnames, ['pth', 'pkl'])]\n learn.load_pretrained(*fnames)\n learn.freeze()\n return learn", "response": "Create a Learner with a language model from data and arch."} {"instruction": "Write a 
Python 3 function for\ncreating a text classifier from arch and its config maybe pretrained.", "response": "def get_text_classifier(arch:Callable, vocab_sz:int, n_class:int, bptt:int=70, max_len:int=20*70, config:dict=None, \n drop_mult:float=1., lin_ftrs:Collection[int]=None, ps:Collection[float]=None,\n pad_idx:int=1) -> nn.Module:\n \"Create a text classifier from `arch` and its `config`, maybe `pretrained`.\"\n meta = _model_meta[arch]\n config = ifnone(config, meta['config_clas'].copy())\n for k in config.keys(): \n if k.endswith('_p'): config[k] *= drop_mult\n if lin_ftrs is None: lin_ftrs = [50]\n if ps is None: ps = [0.1]*len(lin_ftrs)\n layers = [config[meta['hid_name']] * 3] + lin_ftrs + [n_class]\n ps = [config.pop('output_p')] + ps\n init = config.pop('init') if 'init' in config else None\n encoder = MultiBatchEncoder(bptt, max_len, arch(vocab_sz, **config), pad_idx=pad_idx)\n model = SequentialRNN(encoder, PoolingLinearClassifier(layers, ps))\n return model if init is None else model.apply(init)"} {"instruction": "Write a Python 3 script to\ncreate a Learner with a text classifier from data and arch.", "response": "def text_classifier_learner(data:DataBunch, arch:Callable, bptt:int=70, max_len:int=70*20, config:dict=None, \n pretrained:bool=True, drop_mult:float=1., lin_ftrs:Collection[int]=None, \n ps:Collection[float]=None, **learn_kwargs) -> 'TextClassifierLearner':\n \"Create a `Learner` with a text classifier from `data` and `arch`.\"\n model = get_text_classifier(arch, len(data.vocab.itos), data.c, bptt=bptt, max_len=max_len,\n config=config, drop_mult=drop_mult, lin_ftrs=lin_ftrs, ps=ps)\n meta = _model_meta[arch]\n learn = RNNLearner(data, model, split_func=meta['split_clas'], **learn_kwargs)\n if pretrained:\n if 'url' not in meta: \n warn(\"There are no pretrained weights for that architecture yet!\")\n return learn\n model_path = untar_data(meta['url'], data=False)\n fnames = [list(model_path.glob(f'*.{ext}'))[0] for ext in ['pth', 'pkl']]\n 
learn.load_pretrained(*fnames, strict=False)\n learn.freeze()\n return learn"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef save_encoder(self, name:str):\n \"Save the encoder to `name` inside the model directory.\"\n encoder = get_model(self.model)[0]\n if hasattr(encoder, 'module'): encoder = encoder.module\n torch.save(encoder.state_dict(), self.path/self.model_dir/f'{name}.pth')", "response": "Save the encoder to name inside the model directory."} {"instruction": "Explain what the following Python 3 code does\ndef load_encoder(self, name:str, device:torch.device=None):\n \"Load the encoder `name` from the model directory.\"\n encoder = get_model(self.model)[0]\n if device is None: device = self.data.device\n if hasattr(encoder, 'module'): encoder = encoder.module\n encoder.load_state_dict(torch.load(self.path/self.model_dir/f'{name}.pth'))\n encoder.load_state_dict(torch.load(self.path/self.model_dir/f'{name}.pth', map_location=device))\n self.freeze()", "response": "Load the encoder name from the model directory."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef load_pretrained(self, wgts_fname:str, itos_fname:str, strict:bool=True):\n \"Load a pretrained model and adapts it to the data vocabulary.\"\n old_itos = pickle.load(open(itos_fname, 'rb'))\n old_stoi = {v:k for k,v in enumerate(old_itos)}\n wgts = torch.load(wgts_fname, map_location=lambda storage, loc: storage)\n if 'model' in wgts: wgts = wgts['model']\n wgts = convert_weights(wgts, old_stoi, self.data.train_ds.vocab.itos)\n self.model.load_state_dict(wgts, strict=strict)", "response": "Load a pretrained model and adapts it to the data vocabulary."} {"instruction": "Given the following Python 3 function, write the documentation\ndef get_preds(self, ds_type:DatasetType=DatasetType.Valid, with_loss:bool=False, n_batch:Optional[int]=None, pbar:Optional[PBar]=None,\n ordered:bool=False) -> List[Tensor]:\n \"Return predictions 
and targets on the valid, train, or test set, depending on `ds_type`.\"\n self.model.reset()\n if ordered: np.random.seed(42)\n preds = super().get_preds(ds_type=ds_type, with_loss=with_loss, n_batch=n_batch, pbar=pbar)\n if ordered and hasattr(self.dl(ds_type), 'sampler'):\n np.random.seed(42)\n sampler = [i for i in self.dl(ds_type).sampler]\n reverse_sampler = np.argsort(sampler)\n preds = [p[reverse_sampler] for p in preds] \n return(preds)", "response": "Return predictions and targets on the valid train or test set depending on ds_type."} {"instruction": "Given the following Python 3 function, write the documentation\ndef predict(self, text:str, n_words:int=1, no_unk:bool=True, temperature:float=1., min_p:float=None, sep:str=' ',\n decoder=decode_spec_tokens):\n \"Return the `n_words` that come after `text`.\"\n ds = self.data.single_dl.dataset\n self.model.reset()\n xb,yb = self.data.one_item(text)\n new_idx = []\n for _ in range(n_words): #progress_bar(range(n_words), leave=False):\n res = self.pred_batch(batch=(xb,yb))[0][-1]\n #if len(new_idx) == 0: self.model[0].select_hidden([0])\n if no_unk: res[self.data.vocab.stoi[UNK]] = 0.\n if min_p is not None: \n if (res >= min_p).float().sum() == 0:\n warn(f\"There is no item with probability >= {min_p}, try a lower value.\")\n else: res[res < min_p] = 0.\n if temperature != 1.: res.pow_(1 / temperature)\n idx = torch.multinomial(res, 1).item()\n new_idx.append(idx)\n xb = xb.new_tensor([idx])[None]\n return text + sep + sep.join(decoder(self.data.vocab.textify(new_idx, sep=None)))", "response": "Return the n_words that come after text."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef beam_search(self, text:str, n_words:int, no_unk:bool=True, top_k:int=10, beam_sz:int=1000, temperature:float=1.,\n sep:str=' ', decoder=decode_spec_tokens):\n \"Return the `n_words` that come after `text` using beam search.\"\n ds = self.data.single_dl.dataset\n self.model.reset()\n xb, 
yb = self.data.one_item(text)\n nodes = None\n xb = xb.repeat(top_k, 1)\n nodes = xb.clone()\n scores = xb.new_zeros(1).float()\n with torch.no_grad():\n for k in progress_bar(range(n_words), leave=False):\n out = F.log_softmax(self.model(xb)[0][:,-1], dim=-1)\n if no_unk: out[:,self.data.vocab.stoi[UNK]] = -float('Inf')\n values, indices = out.topk(top_k, dim=-1)\n scores = (-values + scores[:,None]).view(-1)\n indices_idx = torch.arange(0,nodes.size(0))[:,None].expand(nodes.size(0), top_k).contiguous().view(-1)\n sort_idx = scores.argsort()[:beam_sz]\n scores = scores[sort_idx]\n nodes = torch.cat([nodes[:,None].expand(nodes.size(0),top_k,nodes.size(1)),\n indices[:,:,None].expand(nodes.size(0),top_k,1),], dim=2)\n nodes = nodes.view(-1, nodes.size(2))[sort_idx]\n self.model[0].select_hidden(indices_idx[sort_idx])\n xb = nodes[:,-1][:,None]\n if temperature != 1.: scores.div_(temperature)\n node_idx = torch.multinomial(torch.exp(-scores), 1).item()\n return text + sep + sep.join(decoder(self.data.vocab.textify([i.item() for i in nodes[node_idx][1:] ], sep=None)))", "response": "Return the n_words that come after text using beam search."} {"instruction": "Create a Python 3 function for\nshowing rows result of predictions on ds_type dataset.", "response": "def show_results(self, ds_type=DatasetType.Valid, rows:int=5, max_len:int=20):\n from IPython.display import display, HTML\n \"Show `rows` result of predictions on `ds_type` dataset.\"\n ds = self.dl(ds_type).dataset\n x,y = self.data.one_batch(ds_type, detach=False, denorm=False)\n preds = self.pred_batch(batch=(x,y))\n y = y.view(*x.size())\n z = preds.view(*x.size(),-1).argmax(dim=2)\n xs = [ds.x.reconstruct(grab_idx(x, i)) for i in range(rows)]\n ys = [ds.x.reconstruct(grab_idx(y, i)) for i in range(rows)]\n zs = [ds.x.reconstruct(grab_idx(z, i)) for i in range(rows)]\n items,names = [],['text', 'target', 'pred']\n for i, (x,y,z) in enumerate(zip(xs,ys,zs)):\n txt_x = ' '.join(x.text.split(' ')[:max_len])\n 
txt_y = ' '.join(y.text.split(' ')[max_len-1:2*max_len-1])\n txt_z = ' '.join(z.text.split(' ')[max_len-1:2*max_len-1])\n items.append([txt_x, txt_y, txt_z])\n items = np.array(items)\n df = pd.DataFrame({n:items[:,i] for i,n in enumerate(names)}, columns=names)\n with pd.option_context('display.max_colwidth', -1):\n display(HTML(df.to_html(index=False)))"} {"instruction": "Implement a function in Python 3 to\nconcatenate the arrs along the batch dimension.", "response": "def concat(self, arrs:Collection[Tensor])->Tensor:\n \"Concatenate the `arrs` along the batch dimension.\"\n return [torch.cat([l[si] for l in arrs], dim=1) for si in range_of(arrs[0])]"} {"instruction": "Write a Python 3 function that can\nsequence of batchnorm ( if bn ) dropout with p and linear ( n_in n_out ) layers followed by actn.", "response": "def bn_drop_lin(n_in:int, n_out:int, bn:bool=True, p:float=0., actn:Optional[nn.Module]=None):\n \"Sequence of batchnorm (if `bn`), dropout (with `p`) and linear (`n_in`,`n_out`) layers followed by `actn`.\"\n layers = [nn.BatchNorm1d(n_in)] if bn else []\n if p != 0: layers.append(nn.Dropout(p))\n layers.append(nn.Linear(n_in, n_out))\n if actn is not None: layers.append(actn)\n return layers"} {"instruction": "How would you implement a function in Python 3 that\ncreates and initialize a nn. Conv1d layer with spectral normalization.", "response": "def conv1d(ni:int, no:int, ks:int=1, stride:int=1, padding:int=0, bias:bool=False):\n \"Create and initialize a `nn.Conv1d` layer with spectral normalization.\"\n conv = nn.Conv1d(ni, no, ks, stride=stride, padding=padding, bias=bias)\n nn.init.kaiming_normal_(conv.weight)\n if bias: conv.bias.data.zero_()\n return spectral_norm(conv)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef conv2d(ni:int, nf:int, ks:int=3, stride:int=1, padding:int=None, bias=False, init:LayerFunc=nn.init.kaiming_normal_) -> nn.Conv2d:\n \"Create and initialize `nn.Conv2d` layer. 
`padding` defaults to `ks//2`.\"\n if padding is None: padding = ks//2\n return init_default(nn.Conv2d(ni, nf, kernel_size=ks, stride=stride, padding=padding, bias=bias), init)", "response": "Create and initialize nn. Conv2d layer. padding defaults to ks//2."} {"instruction": "How would you implement a function in Python 3 that\ncreates nn. ConvTranspose2d layer.", "response": "def conv2d_trans(ni:int, nf:int, ks:int=2, stride:int=2, padding:int=0, bias=False) -> nn.ConvTranspose2d:\n \"Create `nn.ConvTranspose2d` layer.\"\n return nn.ConvTranspose2d(ni, nf, kernel_size=ks, stride=stride, padding=padding, bias=bias)"} {"instruction": "Can you create a Python 3 function that\nreturns a relu activation maybe leaky and inplace.", "response": "def relu(inplace:bool=False, leaky:float=None):\n \"Return a relu activation, maybe `leaky` and `inplace`.\"\n return nn.LeakyReLU(inplace=inplace, negative_slope=leaky) if leaky is not None else nn.ReLU(inplace=inplace)"} {"instruction": "Implement a function in Python 3 to\ncreate a sequence of convolutional.", "response": "def conv_layer(ni:int, nf:int, ks:int=3, stride:int=1, padding:int=None, bias:bool=None, is_1d:bool=False,\n norm_type:Optional[NormType]=NormType.Batch, use_activ:bool=True, leaky:float=None,\n transpose:bool=False, init:Callable=nn.init.kaiming_normal_, self_attention:bool=False):\n \"Create a sequence of convolutional (`ni` to `nf`), ReLU (if `use_activ`) and batchnorm (if `bn`) layers.\"\n if padding is None: padding = (ks-1)//2 if not transpose else 0\n bn = norm_type in (NormType.Batch, NormType.BatchZero)\n if bias is None: bias = not bn\n conv_func = nn.ConvTranspose2d if transpose else nn.Conv1d if is_1d else nn.Conv2d\n conv = init_default(conv_func(ni, nf, kernel_size=ks, bias=bias, stride=stride, padding=padding), init)\n if norm_type==NormType.Weight: conv = weight_norm(conv)\n elif norm_type==NormType.Spectral: conv = spectral_norm(conv)\n layers = [conv]\n if use_activ: 
layers.append(relu(True, leaky=leaky))\n if bn: layers.append((nn.BatchNorm1d if is_1d else nn.BatchNorm2d)(nf))\n if self_attention: layers.append(SelfAttention(nf))\n return nn.Sequential(*layers)"} {"instruction": "How would you explain what the following Python 3 function does\ndef res_block(nf, dense:bool=False, norm_type:Optional[NormType]=NormType.Batch, bottle:bool=False, **conv_kwargs):\n \"Resnet block of `nf` features. `conv_kwargs` are passed to `conv_layer`.\"\n norm2 = norm_type\n if not dense and (norm_type==NormType.Batch): norm2 = NormType.BatchZero\n nf_inner = nf//2 if bottle else nf\n return SequentialEx(conv_layer(nf, nf_inner, norm_type=norm_type, **conv_kwargs),\n conv_layer(nf_inner, nf, norm_type=norm2, **conv_kwargs),\n MergeLayer(dense))", "response": "Resnet block of nf features. conv_kwargs are passed to conv_layer."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef sigmoid_range(x, low, high):\n \"Sigmoid function with range `(low, high)`\"\n return torch.sigmoid(x) * (high - low) + low", "response": "Sigmoid function with range low and high"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef icnr(x, scale=2, init=nn.init.kaiming_normal_):\n \"ICNR init of `x`, with `scale` and `init` function.\"\n ni,nf,h,w = x.shape\n ni2 = int(ni/(scale**2))\n k = init(torch.zeros([ni2,nf,h,w])).transpose(0, 1)\n k = k.contiguous().view(ni2, nf, -1)\n k = k.repeat(1, 1, scale**2)\n k = k.contiguous().view([nf,ni,h,w]).transpose(0, 1)\n x.data.copy_(k)", "response": "ICNR init of x with scale and init function."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef CrossEntropyFlat(*args, axis:int=-1, **kwargs):\n \"Same as `nn.CrossEntropyLoss`, but flattens input and target.\"\n return FlattenedLoss(nn.CrossEntropyLoss, *args, axis=axis, **kwargs)", "response": "Same as nn. 
CrossEntropyLoss but flattens input and target."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef BCEFlat(*args, axis:int=-1, floatify:bool=True, **kwargs):\n \"Same as `nn.BCELoss`, but flattens input and target.\"\n return FlattenedLoss(nn.BCELoss, *args, axis=axis, floatify=floatify, is_2d=False, **kwargs)", "response": "Same as nn. BCELoss but flattens input and target."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef simple_cnn(actns:Collection[int], kernel_szs:Collection[int]=None,\n strides:Collection[int]=None, bn=False) -> nn.Sequential:\n \"CNN with `conv_layer` defined by `actns`, `kernel_szs` and `strides`, plus batchnorm if `bn`.\"\n nl = len(actns)-1\n kernel_szs = ifnone(kernel_szs, [3]*nl)\n strides = ifnone(strides , [2]*nl)\n layers = [conv_layer(actns[i], actns[i+1], kernel_szs[i], stride=strides[i],\n norm_type=(NormType.Batch if bn and i<(len(strides)-1) else None)) for i in range_of(strides)]\n layers.append(PoolFlatten())\n return nn.Sequential(*layers)", "response": "CNN with conv_layer defined by actns kernel_szs and strides plus batchnorm if bn."} {"instruction": "Explain what the following Python 3 code does\ndef embedding(ni:int,nf:int) -> nn.Module:\n \"Create an embedding layer.\"\n emb = nn.Embedding(ni, nf)\n # See https://arxiv.org/abs/1711.09160\n with torch.no_grad(): trunc_normal_(emb.weight, std=0.01)\n return emb", "response": "Create an embedding layer."} {"instruction": "Implement a function in Python 3 to\nprepare MLflow experiment and log params", "response": "def on_train_begin(self, **kwargs: Any) -> None:\n \"Prepare MLflow experiment and log params\"\n self.client = mlflow.tracking.MlflowClient(self.uri)\n exp = self.client.get_experiment_by_name(self.exp_name)\n self.exp_id = self.client.create_experiment(self.exp_name) if exp is None else exp.experiment_id\n run = self.client.create_run(experiment_id=self.exp_id)\n self.run = 
run.info.run_uuid\n for k,v in self.params.items():\n self.client.log_param(run_id=self.run, key=k, value=v)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef on_epoch_end(self, epoch, **kwargs:Any)->None:\n \"Send loss and metrics values to MLFlow after each epoch\"\n if kwargs['smooth_loss'] is None or kwargs[\"last_metrics\"] is None: return\n metrics = [kwargs['smooth_loss']] + kwargs[\"last_metrics\"]\n for name, val in zip(self.metrics_names, metrics):\n self.client.log_metric(self.run, name, np.float(val))", "response": "Send loss and metrics values to MLFlow after each epoch"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef on_train_end(self, **kwargs: Any) -> None: \n \"Store the notebook and stop run\"\n self.client.log_artifact(run_id=self.run, local_path=self.nb_path)\n self.client.set_terminated(run_id=self.run)", "response": "Store the notebook and stop run"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef pil2tensor(image:Union[NPImage,NPArray],dtype:np.dtype)->TensorImage:\n \"Convert PIL style `image` array to torch style image tensor.\"\n a = np.asarray(image)\n if a.ndim==2 : a = np.expand_dims(a,2)\n a = np.transpose(a, (1, 0, 2))\n a = np.transpose(a, (2, 1, 0))\n return torch.from_numpy(a.astype(dtype, copy=False) )", "response": "Convert PIL style image array to torch style image tensor."} {"instruction": "Write a Python 3 script to\nconvert from torch style image to numpy / matplotlib style.", "response": "def image2np(image:Tensor)->np.ndarray:\n \"Convert from torch style `image` to numpy/matplotlib style.\"\n res = image.cpu().permute(1,2,0).numpy()\n return res[...,0] if res.shape[2]==1 else res"} {"instruction": "Can you implement a function in Python 3 that\nconverts bounding box points from ( width height center ) to ( height top left.", "response": "def bb2hw(a:Collection[int])->np.ndarray:\n \"Convert bounding box points from 
(width,height,center) to (height,width,top,left).\"\n return np.array([a[1],a[0],a[3]-a[1],a[2]-a[0]])"} {"instruction": "Make a summary of the following Python 3 code\ndef tis2hw(size:Union[int,TensorImageSize]) -> Tuple[int,int]:\n \"Convert `int` or `TensorImageSize` to (height,width) of an image.\"\n if type(size) is str: raise RuntimeError(\"Expected size to be an int or a tuple, got a string.\")\n return listify(size, 2) if isinstance(size, int) else listify(size[-2:],2)", "response": "Convert int or TensorImageSize to ( height width ) of an image."} {"instruction": "Implement a function in Python 3 to\noutline bounding box onto image Patch.", "response": "def _draw_outline(o:Patch, lw:int):\n \"Outline bounding box onto image `Patch`.\"\n o.set_path_effects([patheffects.Stroke(\n linewidth=lw, foreground='black'), patheffects.Normal()])"} {"instruction": "How would you explain what the following Python 3 function does\ndef _draw_rect(ax:plt.Axes, b:Collection[int], color:str='white', text=None, text_size=14):\n \"Draw bounding box on `ax`.\"\n patch = ax.add_patch(patches.Rectangle(b[:2], *b[-2:], fill=False, edgecolor=color, lw=2))\n _draw_outline(patch, 4)\n if text is not None:\n patch = ax.text(*b[:2], text, verticalalignment='top', color=color, fontsize=text_size, weight='bold')\n _draw_outline(patch,1)", "response": "Draw bounding box on ax."} {"instruction": "Given the following Python 3 function, write the documentation\ndef open_image(fn:PathOrStr, div:bool=True, convert_mode:str='RGB', cls:type=Image,\n after_open:Callable=None)->Image:\n \"Return `Image` object created from image in file `fn`.\"\n with warnings.catch_warnings():\n warnings.simplefilter(\"ignore\", UserWarning) # EXIF warning from TiffPlugin\n x = PIL.Image.open(fn).convert(convert_mode)\n if after_open: x = after_open(x)\n x = pil2tensor(x,np.float32)\n if div: x.div_(255)\n return cls(x)", "response": "Return Image object created from image in file fn."} {"instruction": "Can you 
generate a brief explanation for the following Python 3 code\ndef open_mask(fn:PathOrStr, div=False, convert_mode='L', after_open:Callable=None)->ImageSegment:\n \"Return `ImageSegment` object create from mask in file `fn`. If `div`, divides pixel values by 255.\"\n return open_image(fn, div=div, convert_mode=convert_mode, cls=ImageSegment, after_open=after_open)", "response": "Return ImageSegment object create from mask in file fn. If div divides pixel values by 255."} {"instruction": "Implement a Python 3 function for\nreturning ImageSegment object create from run - length encoded string in mask_lre with size in shape.", "response": "def open_mask_rle(mask_rle:str, shape:Tuple[int, int])->ImageSegment:\n \"Return `ImageSegment` object create from run-length encoded string in `mask_lre` with size in `shape`.\"\n x = FloatTensor(rle_decode(str(mask_rle), shape).astype(np.uint8))\n x = x.view(shape[1], shape[0], -1)\n return ImageSegment(x.permute(2,0,1))"} {"instruction": "Given the following Python 3 function, write the documentation\ndef rle_encode(img:NPArrayMask)->str:\n \"Return run-length encoding string from `img`.\"\n pixels = np.concatenate([[0], img.flatten() , [0]])\n runs = np.where(pixels[1:] != pixels[:-1])[0] + 1\n runs[1::2] -= runs[::2]\n return ' '.join(str(x) for x in runs)", "response": "Return run - length encoding string from img."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef rle_decode(mask_rle:str, shape:Tuple[int,int])->NPArrayMask:\n \"Return an image array from run-length encoded string `mask_rle` with `shape`.\"\n s = mask_rle.split()\n starts, lengths = [np.asarray(x, dtype=int) for x in (s[0:][::2], s[1:][::2])]\n starts -= 1\n ends = starts + lengths\n img = np.zeros(shape[0]*shape[1], dtype=np.uint)\n for low, up in zip(starts, ends): img[low:up] = 1\n return img.reshape(shape)", "response": "Return an image array from run - length encoded string mask_rle with shape."} {"instruction": "Make a 
summary of the following Python 3 code\ndef show_image(img:Image, ax:plt.Axes=None, figsize:tuple=(3,3), hide_axis:bool=True, cmap:str='binary',\n alpha:float=None, **kwargs)->plt.Axes:\n \"Display `Image` in notebook.\"\n if ax is None: fig,ax = plt.subplots(figsize=figsize)\n ax.imshow(image2np(img.data), cmap=cmap, alpha=alpha, **kwargs)\n if hide_axis: ax.axis('off')\n return ax", "response": "Display Image in notebook."} {"instruction": "Can you implement a function in Python 3 that\nscales the coords in flow to - 1 / 1 or the image size depending on to_unit.", "response": "def scale_flow(flow, to_unit=True):\n \"Scale the coords in `flow` to -1/1 or the image size depending on `to_unit`.\"\n s = tensor([flow.size[0]/2,flow.size[1]/2])[None]\n if to_unit: flow.flow = flow.flow/s-1\n else: flow.flow = (flow.flow+1)*s\n return flow"} {"instruction": "Write a Python 3 function for\nmultiplying c by m - can adjust for rectangular shaped c.", "response": "def _affine_mult(c:FlowField,m:AffineMatrix)->FlowField:\n \"Multiply `c` by `m` - can adjust for rectangular shaped `c`.\"\n if m is None: return c\n size = c.flow.size()\n h,w = c.size\n m[0,1] *= h/w\n m[1,0] *= w/h\n c.flow = c.flow.view(-1,2)\n c.flow = torch.addmm(m[:2,2], c.flow, m[:2,:2].t()).view(size)\n return c"} {"instruction": "Implement a function in Python 3 to\napply the inverse affine transform described in m to c.", "response": "def _affine_inv_mult(c, m):\n \"Applies the inverse affine transform described in `m` to `c`.\"\n size = c.flow.size()\n h,w = c.size\n m[0,1] *= h/w\n m[1,0] *= w/h\n c.flow = c.flow.view(-1,2)\n a = torch.inverse(m[:2,:2].t())\n c.flow = torch.mm(c.flow - m[:2,2], a).view(size)\n return c"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef _round_multiple(x:int, mult:int=None)->int:\n \"Calc `x` to nearest multiple of `mult`.\"\n return (int(x/mult+0.5)*mult) if mult is not None else x", "response": "Calc x to nearest multiple 
of mult."} {"instruction": "How would you code a function in Python 3 to\ncall func for every combination of r c on a subplot", "response": "def plot_multi(func:Callable[[int,int,plt.Axes],None], r:int=1, c:int=1, figsize:Tuple=(12,6)):\n \"Call `func` for every combination of `r,c` on a subplot\"\n axes = plt.subplots(r, c, figsize=figsize)[1]\n for i in range(r):\n for j in range(c): func(i,j,axes[i,j])"} {"instruction": "Explain what the following Python 3 code does\ndef show_multi(func:Callable[[int,int],Image], r:int=1, c:int=1, figsize:Tuple=(9,9)):\n \"Call `func(i,j).show(ax)` for every combination of `r,c`\"\n plot_multi(lambda i,j,ax: func(i,j).show(ax), r, c, figsize=figsize)", "response": "Call func i j. show ( ax ) for every combination of r c"} {"instruction": "Make a summary of the following Python 3 code\ndef show_all(imgs:Collection[Image], r:int=1, c:Optional[int]=None, figsize=(12,6)):\n \"Show all `imgs` using `r` rows\"\n imgs = listify(imgs)\n if c is None: c = len(imgs)//r\n for i,ax in plot_flat(r,c,figsize): imgs[i].show(ax)", "response": "Show all imgs using r rows"} {"instruction": "Implement a function in Python 3 to\napply all tfms to the Image if do_resolve picks value for random args.", "response": "def apply_tfms(self, tfms:TfmList, do_resolve:bool=True, xtra:Optional[Dict[Callable,dict]]=None,\n size:Optional[Union[int,TensorImageSize]]=None, resize_method:ResizeMethod=None,\n mult:int=None, padding_mode:str='reflection', mode:str='bilinear', remove_out:bool=True)->TensorImage:\n \"Apply all `tfms` to the `Image`, if `do_resolve` picks value for random args.\"\n if not (tfms or xtra or size): return self\n tfms = listify(tfms)\n xtra = ifnone(xtra, {})\n default_rsz = ResizeMethod.SQUISH if (size is not None and is_listy(size)) else ResizeMethod.CROP\n resize_method = ifnone(resize_method, default_rsz)\n if resize_method <= 2 and size is not None: tfms = self._maybe_add_crop_pad(tfms)\n tfms = sorted(tfms, key=lambda o: 
o.tfm.order)\n if do_resolve: _resolve_tfms(tfms)\n x = self.clone()\n x.set_sample(padding_mode=padding_mode, mode=mode, remove_out=remove_out)\n if size is not None:\n crop_target = _get_crop_target(size, mult=mult)\n if resize_method in (ResizeMethod.CROP,ResizeMethod.PAD):\n target = _get_resize_target(x, crop_target, do_crop=(resize_method==ResizeMethod.CROP))\n x.resize(target)\n elif resize_method==ResizeMethod.SQUISH: x.resize((x.shape[0],) + crop_target)\n else: size = x.size\n size_tfms = [o for o in tfms if isinstance(o.tfm,TfmCrop)]\n for tfm in tfms:\n if tfm.tfm in xtra: x = tfm(x, **xtra[tfm.tfm])\n elif tfm in size_tfms:\n if resize_method in (ResizeMethod.CROP,ResizeMethod.PAD):\n x = tfm(x, size=_get_crop_target(size,mult=mult), padding_mode=padding_mode)\n else: x = tfm(x)\n return x.refresh()"} {"instruction": "Write a Python 3 function that can\napply any logit flow or affine transfers that have been sent to the Image.", "response": "def refresh(self)->None:\n \"Apply any logit, flow, or affine transfers that have been sent to the `Image`.\"\n if self._logit_px is not None:\n self._px = self._logit_px.sigmoid_()\n self._logit_px = None\n if self._affine_mat is not None or self._flow is not None:\n self._px = _grid_sample(self._px, self.flow, **self.sample_kwargs)\n self.sample_kwargs = {}\n self._flow = None\n return self"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef save(self, fn:PathOrStr):\n \"Save the image to `fn`.\"\n x = image2np(self.data*255).astype(np.uint8)\n PIL.Image.fromarray(x).save(fn)", "response": "Save the image to fn."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef flow(self)->FlowField:\n \"Access the flow-field grid after applying queued affine transforms.\"\n if self._flow is None:\n self._flow = _affine_grid(self.shape)\n if self._affine_mat is not None:\n self._flow = _affine_mult(self._flow,self._affine_mat)\n self._affine_mat = None\n 
return self._flow", "response": "Access the flow - field grid after applying queued affine transforms."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef lighting(self, func:LightingFunc, *args:Any, **kwargs:Any):\n \"Equivalent to `image = sigmoid(func(logit(image)))`.\"\n self.logit_px = func(self.logit_px, *args, **kwargs)\n return self", "response": "Equivalent to image = sigmoid ( func ( logit ( image)))."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef pixel(self, func:PixelFunc, *args, **kwargs)->'Image':\n \"Equivalent to `image.px = func(image.px)`.\"\n self.px = func(self.px, *args, **kwargs)\n return self", "response": "Equivalent to image. px = func ( image. px"} {"instruction": "Explain what the following Python 3 code does\ndef coord(self, func:CoordFunc, *args, **kwargs)->'Image':\n \"Equivalent to `image.flow = func(image.flow, image.size)`.\"\n self.flow = func(self.flow, *args, **kwargs)\n return self", "response": "Equivalent to image. flow = func ( image. flow image. size )."} {"instruction": "Explain what the following Python 3 code does\ndef affine(self, func:AffineFunc, *args, **kwargs)->'Image':\n \"Equivalent to `image.affine_mat = image.affine_mat @ func()`.\"\n m = tensor(func(*args, **kwargs)).to(self.device)\n self.affine_mat = self.affine_mat @ m\n return self", "response": "Equivalent to image. affine_mat = image. 
affine_mat @ func."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef resize(self, size:Union[int,TensorImageSize])->'Image':\n \"Resize the image to `size`, size can be a single int.\"\n assert self._flow is None\n if isinstance(size, int): size=(self.shape[0], size, size)\n if tuple(size)==tuple(self.shape): return self\n self.flow = _affine_grid(size)\n return self", "response": "Resize the image to size can be a single int."} {"instruction": "Given the following Python 3 function, write the documentation\ndef affine_mat(self)->AffineMatrix:\n \"Get the affine matrix that will be applied by `refresh`.\"\n if self._affine_mat is None:\n self._affine_mat = torch.eye(3).to(self.device)\n return self._affine_mat", "response": "Get the affine matrix that will be applied by refresh."} {"instruction": "Create a Python 3 function to\nget logit ( image. px.", "response": "def logit_px(self)->LogitTensorImage:\n \"Get logit(image.px).\"\n if self._logit_px is None: self._logit_px = logit_(self.px)\n return self._logit_px"} {"instruction": "Given the following Python 3 function, write the documentation\ndef show(self, ax:plt.Axes=None, figsize:tuple=(3,3), title:Optional[str]=None, hide_axis:bool=True,\n cmap:str=None, y:Any=None, **kwargs):\n \"Show image on `ax` with `title`, using `cmap` if single-channel, overlaid with optional `y`\"\n cmap = ifnone(cmap, defaults.cmap)\n ax = show_image(self, ax=ax, hide_axis=hide_axis, cmap=cmap, figsize=figsize)\n if y is not None: y.show(ax=ax, **kwargs)\n if title is not None: ax.set_title(title)", "response": "Show image on ax with title using cmap if single - channel overlaid with optional y"} {"instruction": "Make a summary of the following Python 3 code\ndef show(self, ax:plt.Axes=None, figsize:tuple=(3,3), title:Optional[str]=None, hide_axis:bool=True,\n cmap:str='tab20', alpha:float=0.5, **kwargs):\n \"Show the `ImageSegment` on `ax`.\"\n ax = show_image(self, ax=ax, hide_axis=hide_axis, 
cmap=cmap, figsize=figsize,\n interpolation='nearest', alpha=alpha, vmin=0)\n if title: ax.set_title(title)", "response": "Show the ImageSegment on ax."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef clone(self):\n \"Mimic the behavior of torch.clone for `ImagePoints` objects.\"\n return self.__class__(FlowField(self.size, self.flow.flow.clone()), scale=False, y_first=False)", "response": "Mimic the behavior of torch. clone for ImagePoints objects."} {"instruction": "How would you explain what the following Python 3 function does\ndef flow(self)->FlowField:\n \"Access the flow-field grid after applying queued affine and coord transforms.\"\n if self._affine_mat is not None:\n self._flow = _affine_inv_mult(self._flow, self._affine_mat)\n self._affine_mat = None\n self.transformed = True\n if len(self.flow_func) != 0:\n for f in self.flow_func[::-1]: self._flow = f(self._flow)\n self.transformed = True\n self.flow_func = []\n return self._flow", "response": "Access the flow - field grid after applying queued affine and coord transforms."} {"instruction": "Here you have a function in Python 3, explain what it does\ndef coord(self, func:CoordFunc, *args, **kwargs)->'ImagePoints':\n \"Put `func` with `args` and `kwargs` in `self.flow_func` for later.\"\n if 'invert' in kwargs: kwargs['invert'] = True\n else: warn(f\"{func.__name__} isn't implemented for {self.__class__}.\")\n self.flow_func.append(partial(func, *args, **kwargs))\n return self", "response": "Put func with args and kwargs in self. 
flow_func for later."} {"instruction": "How would you explain what the following Python 3 function does\ndef pixel(self, func:PixelFunc, *args, **kwargs)->'ImagePoints':\n \"Equivalent to `self = func_flow(self)`.\"\n self = func(self, *args, **kwargs)\n self.transformed=True\n return self", "response": "Equivalent to self = func_flow ( self )."} {"instruction": "Can you implement a function in Python 3 that\nreturns the points associated to this object.", "response": "def data(self)->Tensor:\n \"Return the points associated to this object.\"\n flow = self.flow #This updates flow before we test if some transforms happened\n if self.transformed:\n if 'remove_out' not in self.sample_kwargs or self.sample_kwargs['remove_out']:\n flow = _remove_points_out(flow)\n self.transformed=False\n return flow.flow.flip(1)"} {"instruction": "How would you code a function in Python 3 to\nshow the ImagePoints on ax.", "response": "def show(self, ax:plt.Axes=None, figsize:tuple=(3,3), title:Optional[str]=None, hide_axis:bool=True, **kwargs):\n \"Show the `ImagePoints` on `ax`.\"\n if ax is None: _,ax = plt.subplots(figsize=figsize)\n pnt = scale_flow(FlowField(self.size, self.data), to_unit=False).flow.flip(1)\n params = {'s': 10, 'marker': '.', 'c': 'r', **kwargs}\n ax.scatter(pnt[:, 0], pnt[:, 1], **params)\n if hide_axis: ax.axis('off')\n if title: ax.set_title(title)"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef clone(self) -> 'ImageBBox':\n \"Mimic the behavior of torch.clone for `Image` objects.\"\n flow = FlowField(self.size, self.flow.flow.clone())\n return self.__class__(flow, scale=False, y_first=False, labels=self.labels, pad_idx=self.pad_idx)", "response": "Mimic the behavior of torch. 
clone for Image objects."} {"instruction": "Write a Python 3 script to\ncreate an ImageBBox object from bboxes.", "response": "def create(cls, h:int, w:int, bboxes:Collection[Collection[int]], labels:Collection=None, classes:dict=None,\n pad_idx:int=0, scale:bool=True)->'ImageBBox':\n \"Create an ImageBBox object from `bboxes`.\"\n if isinstance(bboxes, np.ndarray) and bboxes.dtype == np.object: bboxes = np.array([bb for bb in bboxes])\n bboxes = tensor(bboxes).float()\n tr_corners = torch.cat([bboxes[:,0][:,None], bboxes[:,3][:,None]], 1)\n bl_corners = bboxes[:,1:3].flip(1)\n bboxes = torch.cat([bboxes[:,:2], tr_corners, bl_corners, bboxes[:,2:]], 1)\n flow = FlowField((h,w), bboxes.view(-1,2))\n return cls(flow, labels=labels, classes=classes, pad_idx=pad_idx, y_first=True, scale=scale)"} {"instruction": "Write a Python 3 script for\nshowing the ImageBBox on ax.", "response": "def show(self, y:Image=None, ax:plt.Axes=None, figsize:tuple=(3,3), title:Optional[str]=None, hide_axis:bool=True,\n color:str='white', **kwargs):\n \"Show the `ImageBBox` on `ax`.\"\n if ax is None: _,ax = plt.subplots(figsize=figsize)\n bboxes, lbls = self._compute_boxes()\n h,w = self.flow.size\n bboxes.add_(1).mul_(torch.tensor([h/2, w/2, h/2, w/2])).long()\n for i, bbox in enumerate(bboxes):\n if lbls is not None: text = str(lbls[i])\n else: text=None\n _draw_rect(ax, bb2hw(bbox), text=text, color=color)"} {"instruction": "Make a summary of the following Python 3 code\ndef calc(self, x:Image, *args:Any, **kwargs:Any)->Image:\n \"Apply to image `x`, wrapping it if necessary.\"\n if self._wrap: return getattr(x, self._wrap)(self.func, *args, **kwargs)\n else: return self.func(x, *args, **kwargs)", "response": "Apply to image x wrapping it if necessary."} {"instruction": "Make a summary of the following Python 3 code\ndef url2path(url, data=True, ext:str='.tgz'):\n \"Change `url` to a path.\"\n name = url2name(url)\n return datapath4file(name, ext=ext, archive=False) if data else 
modelpath4file(name, ext=ext)", "response": "Change url to a path."} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef modelpath4file(filename, ext:str='.tgz'):\n \"Return model path to `filename`, checking locally first then in the config file.\"\n local_path = URLs.LOCAL_PATH/'models'/filename\n if local_path.exists() or local_path.with_suffix(ext).exists(): return local_path\n else: return Config.model_path()/filename", "response": "Return model path to filename checking locally first then in the config file."} {"instruction": "Given the following Python 3 function, write the documentation\ndef datapath4file(filename, ext:str='.tgz', archive=True):\n \"Return data path to `filename`, checking locally first then in the config file.\"\n local_path = URLs.LOCAL_PATH/'data'/filename\n if local_path.exists() or local_path.with_suffix(ext).exists(): return local_path\n elif archive: return Config.data_archive_path() / filename\n else: return Config.data_path() / filename", "response": "Return data path to filename checking locally first then in the config file."} {"instruction": "Write a Python 3 function that can\ndownload url to destination fname.", "response": "def download_data(url:str, fname:PathOrStr=None, data:bool=True, ext:str='.tgz') -> Path:\n \"Download `url` to destination `fname`.\"\n fname = Path(ifnone(fname, _url2tgz(url, data, ext=ext)))\n os.makedirs(fname.parent, exist_ok=True)\n if not fname.exists():\n print(f'Downloading {url}')\n download_url(f'{url}{ext}', fname)\n return fname"} {"instruction": "Write a Python 3 script for\ndownloading url to fname if dest doesn t exist and un - tgz to folder dest.", "response": "def untar_data(url:str, fname:PathOrStr=None, dest:PathOrStr=None, data=True, force_download=False) -> Path:\n \"Download `url` to `fname` if `dest` doesn't exist, and un-tgz to folder `dest`.\"\n dest = url2path(url, data) if dest is None else Path(dest)/url2name(url)\n fname = 
Path(ifnone(fname, _url2tgz(url, data)))\n if force_download or (fname.exists() and url in _checks and _check_file(fname) != _checks[url]):\n print(f\"A new version of the {'dataset' if data else 'model'} is available.\")\n if fname.exists(): os.remove(fname)\n if dest.exists(): shutil.rmtree(dest)\n if not dest.exists():\n fname = download_data(url, fname=fname, data=data)\n if url in _checks:\n assert _check_file(fname) == _checks[url], f\"Downloaded file {fname} does not match checksum expected! Remove that file from {Config().data_archive_path()} and try your code again.\"\n tarfile.open(fname, 'r:gz').extractall(dest.parent)\n return dest"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef get_key(cls, key):\n \"Get the path to `key` in the config file.\"\n return cls.get().get(key, cls.DEFAULT_CONFIG.get(key,None))", "response": "Get the path to key in the config file."} {"instruction": "Can you create a Python 3 function that\nretrieves the Config in fpath.", "response": "def get(cls, fpath=None, create_missing=True):\n \"Retrieve the `Config` in `fpath`.\"\n fpath = _expand_path(fpath or cls.DEFAULT_CONFIG_PATH)\n if not fpath.exists() and create_missing: cls.create(fpath)\n assert fpath.exists(), f'Could not find config at: {fpath}. 
Please create'\n with open(fpath, 'r') as yaml_file: return yaml.safe_load(yaml_file)"} {"instruction": "How would you implement a function in Python 3 that\ncreates a Config from fpath.", "response": "def create(cls, fpath):\n \"Creates a `Config` from `fpath`.\"\n fpath = _expand_path(fpath)\n assert(fpath.suffix == '.yml')\n if fpath.exists(): return\n fpath.parent.mkdir(parents=True, exist_ok=True)\n with open(fpath, 'w') as yaml_file:\n yaml.dump(cls.DEFAULT_CONFIG, yaml_file, default_flow_style=False)"} {"instruction": "Write a Python 3 function that can\napply mixup to last_input and last_target if train.", "response": "def on_batch_begin(self, last_input, last_target, train, **kwargs):\n \"Applies mixup to `last_input` and `last_target` if `train`.\"\n if not train: return\n lambd = np.random.beta(self.alpha, self.alpha, last_target.size(0))\n lambd = np.concatenate([lambd[:,None], 1-lambd[:,None]], 1).max(1)\n lambd = last_input.new(lambd)\n shuffle = torch.randperm(last_target.size(0)).to(last_input.device)\n x1, y1 = last_input[shuffle], last_target[shuffle]\n if self.stack_x:\n new_input = [last_input, last_input[shuffle], lambd]\n else: \n new_input = (last_input * lambd.view(lambd.size(0),1,1,1) + x1 * (1-lambd).view(lambd.size(0),1,1,1))\n if self.stack_y:\n new_target = torch.cat([last_target[:,None].float(), y1[:,None].float(), lambd[:,None].float()], 1)\n else:\n if len(last_target.shape) == 2:\n lambd = lambd.unsqueeze(1).float()\n new_target = last_target.float() * lambd + y1.float() * (1-lambd)\n return {'last_input': new_input, 'last_target': new_target}"} {"instruction": "Explain what the following Python 3 code does\ndef make_date(df:DataFrame, date_field:str):\n \"Make sure `df[field_name]` is of the right date type.\"\n field_dtype = df[date_field].dtype\n if isinstance(field_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):\n field_dtype = np.datetime64\n if not np.issubdtype(field_dtype, np.datetime64):\n df[date_field] = 
pd.to_datetime(df[date_field], infer_datetime_format=True)", "response": "Make sure df [ field_name ] is of the right date type."} {"instruction": "Explain what the following Python 3 code does\ndef cyclic_dt_feat_names(time:bool=True, add_linear:bool=False)->List[str]:\n \"Return feature names of date/time cycles as produced by `cyclic_dt_features`.\"\n fs = ['cos','sin']\n attr = [f'{r}_{f}' for r in 'weekday day_month month_year day_year'.split() for f in fs]\n if time: attr += [f'{r}_{f}' for r in 'hour clock min sec'.split() for f in fs]\n if add_linear: attr.append('year_lin')\n return attr", "response": "Return feature names of date or time cycles as produced by cyclic_dt_features."} {"instruction": "Make a summary of the following Python 3 code\ndef cyclic_dt_features(d:Union[date,datetime], time:bool=True, add_linear:bool=False)->List[float]:\n \"Calculate the cos and sin of date/time cycles.\"\n tt,fs = d.timetuple(), [np.cos, np.sin]\n day_year,days_month = tt.tm_yday, calendar.monthrange(d.year, d.month)[1]\n days_year = 366 if calendar.isleap(d.year) else 365\n rs = d.weekday()/7, (d.day-1)/days_month, (d.month-1)/12, (day_year-1)/days_year\n feats = [f(r * 2 * np.pi) for r in rs for f in fs]\n if time and isinstance(d, datetime) and type(d) != date:\n rs = tt.tm_hour/24, tt.tm_hour%12/12, tt.tm_min/60, tt.tm_sec/60\n feats += [f(r * 2 * np.pi) for r in rs for f in fs]\n if add_linear:\n if type(d) == date: feats.append(d.year + rs[-1])\n else:\n secs_in_year = (datetime(d.year+1, 1, 1) - datetime(d.year, 1, 1)).total_seconds()\n feats.append(d.year + ((d - datetime(d.year, 1, 1)).total_seconds() / secs_in_year))\n return feats", "response": "Calculate the cos and sin of date / time cycles."} {"instruction": "Given the following Python 3 function, write the documentation\ndef apply_train(self, df:DataFrame):\n \"Transform `self.cat_names` columns in categorical.\"\n self.categories = {}\n for n in self.cat_names:\n df.loc[:,n] = 
df.loc[:,n].astype('category').cat.as_ordered()\n self.categories[n] = df[n].cat.categories", "response": "Transform self. cat_names columns in categorical."} {"instruction": "Can you create a Python 3 function that\ncomputes the means and stds of self. cont_names columns to normalize them.", "response": "def apply_train(self, df:DataFrame):\n \"Compute the means and stds of `self.cont_names` columns to normalize them.\"\n self.means,self.stds = {},{}\n for n in self.cont_names:\n assert is_numeric_dtype(df[n]), (f\"\"\"Cannot normalize '{n}' column as it isn't numerical.\n Are you sure it doesn't belong in the categorical set of columns?\"\"\")\n self.means[n],self.stds[n] = df[n].mean(),df[n].std()\n df[n] = (df[n]-self.means[n]) / (1e-7 + self.stds[n])"} {"instruction": "Can you create a Python 3 function that\npicks an embedding size for n depending on classes if not given in sz_dict.", "response": "def def_emb_sz(classes, n, sz_dict=None):\n \"Pick an embedding size for `n` depending on `classes` if not given in `sz_dict`.\"\n sz_dict = ifnone(sz_dict, {})\n n_cat = len(classes[n])\n sz = sz_dict.get(n, int(emb_sz_rule(n_cat))) # rule of thumb\n return n_cat,sz"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef tabular_learner(data:DataBunch, layers:Collection[int], emb_szs:Dict[str,int]=None, metrics=None,\n ps:Collection[float]=None, emb_drop:float=0., y_range:OptRange=None, use_bn:bool=True, **learn_kwargs):\n \"Get a `Learner` using `data`, with `metrics`, including a `TabularModel` created using the remaining params.\"\n emb_szs = data.get_emb_szs(ifnone(emb_szs, {}))\n model = TabularModel(emb_szs, len(data.cont_names), out_sz=data.c, layers=layers, ps=ps, emb_drop=emb_drop,\n y_range=y_range, use_bn=use_bn)\n return Learner(data, model, metrics=metrics, **learn_kwargs)", "response": "Get a Learner using data with metrics including a TabularModel created using the remaining params."} {"instruction": "Can you write a 
function in Python 3 where it\ncreates a DataBunch from df and valid_idx with dep_var. kwargs are passed to DataBunch. create.", "response": "def from_df(cls, path, df:DataFrame, dep_var:str, valid_idx:Collection[int], procs:OptTabTfms=None,\n cat_names:OptStrList=None, cont_names:OptStrList=None, classes:Collection=None, \n test_df=None, bs:int=64, val_bs:int=None, num_workers:int=defaults.cpus, dl_tfms:Optional[Collection[Callable]]=None, \n device:torch.device=None, collate_fn:Callable=data_collate, no_check:bool=False)->DataBunch:\n \"Create a `DataBunch` from `df` and `valid_idx` with `dep_var`. `kwargs` are passed to `DataBunch.create`.\"\n cat_names = ifnone(cat_names, []).copy()\n cont_names = ifnone(cont_names, list(set(df)-set(cat_names)-{dep_var}))\n procs = listify(procs)\n src = (TabularList.from_df(df, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs)\n .split_by_idx(valid_idx))\n src = src.label_from_df(cols=dep_var) if classes is None else src.label_from_df(cols=dep_var, classes=classes)\n if test_df is not None: src.add_test(TabularList.from_df(test_df, cat_names=cat_names, cont_names=cont_names,\n processor = src.train.x.processor))\n return src.databunch(path=path, bs=bs, val_bs=val_bs, num_workers=num_workers, device=device, \n collate_fn=collate_fn, no_check=no_check)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef from_df(cls, df:DataFrame, cat_names:OptStrList=None, cont_names:OptStrList=None, procs=None, **kwargs)->'ItemList':\n \"Get the list of inputs in the `col` of `path/csv_name`.\"\n return cls(items=range(len(df)), cat_names=cat_names, cont_names=cont_names, procs=procs, inner_df=df.copy(), **kwargs)", "response": "Get the list of inputs in the col of path / csv_name."} {"instruction": "Implement a function in Python 3 to\nreturn the default embedding sizes suitable for this data or takes the ones in sz_dict.", "response": "def get_emb_szs(self, sz_dict=None):\n \"Return 
the default embedding sizes suitable for this data or takes the ones in `sz_dict`.\"\n return [def_emb_sz(self.classes, n, sz_dict) for n in self.cat_names]"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef show_xys(self, xs, ys)->None:\n \"Show the `xs` (inputs) and `ys` (targets).\"\n from IPython.display import display, HTML\n items,names = [], xs[0].names + ['target']\n for i, (x,y) in enumerate(zip(xs,ys)):\n res = []\n cats = x.cats if len(x.cats.size()) > 0 else []\n conts = x.conts if len(x.conts.size()) > 0 else []\n for c, n in zip(cats, x.names[:len(cats)]):\n res.append(x.classes[n][c])\n res += [f'{c:.4f}' for c in conts] + [y]\n items.append(res)\n items = np.array(items)\n df = pd.DataFrame({n:items[:,i] for i,n in enumerate(names)}, columns=names)\n with pd.option_context('display.max_colwidth', -1):\n display(HTML(df.to_html(index=False)))", "response": "Show the xs and ys ( inputs ) and ys."} {"instruction": "Implement a function in Python 3 to\nload the classifier and int to string mapping file and the trained classifer model.", "response": "def load_model(itos_filename, classifier_filename, num_classes):\n \"\"\"Load the classifier and int to string mapping\n\n Args:\n itos_filename (str): The filename of the int to string mapping file (usually called itos.pkl)\n classifier_filename (str): The filename of the trained classifier\n\n Returns:\n string to int mapping, trained classifer model\n \"\"\"\n\n # load the int to string mapping file\n itos = pickle.load(Path(itos_filename).open('rb'))\n # turn it into a string to int mapping (which is what we need)\n stoi = collections.defaultdict(lambda:0, {str(v):int(k) for k,v in enumerate(itos)})\n\n # these parameters aren't used, but this is the easiest way to get a model\n bptt,em_sz,nh,nl = 70,400,1150,3\n dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.5\n vs = len(itos)\n\n model = get_rnn_classifer(bptt, 20*70, num_classes, vs, emb_sz=em_sz, n_hid=nh, 
n_layers=nl, pad_token=1,\n layers=[em_sz*3, 50, num_classes], drops=[dps[4], 0.1],\n dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])\n\n # load the trained classifier\n model.load_state_dict(torch.load(classifier_filename, map_location=lambda storage, loc: storage))\n\n # put the classifier into evaluation mode\n model.reset()\n model.eval()\n\n return stoi, model"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef softmax(x):\n '''\n Numpy Softmax, via comments on https://gist.github.com/stober/1946926\n\n >>> res = softmax(np.array([0, 200, 10]))\n >>> np.sum(res)\n 1.0\n >>> np.all(np.abs(res - np.array([0, 1, 0])) < 0.0001)\n True\n >>> res = softmax(np.array([[0, 200, 10], [0, 10, 200], [200, 0, 10]]))\n >>> np.sum(res, axis=1)\n array([ 1., 1., 1.])\n >>> res = softmax(np.array([[0, 200, 10], [0, 10, 200]]))\n >>> np.sum(res, axis=1)\n array([ 1., 1.])\n '''\n if x.ndim == 1:\n x = x.reshape((1, -1))\n max_x = np.max(x, axis=1).reshape((-1, 1))\n exp_x = np.exp(x - max_x)\n return exp_x / np.sum(exp_x, axis=1).reshape((-1, 1))", "response": "Numpy Softmax implementation of the softmax function."} {"instruction": "Explain what the following Python 3 code does\ndef predict_text(stoi, model, text):\n\n # prefix text with tokens:\n # xbos: beginning of sentence\n # xfld 1: we are using a single field here\n input_str = 'xbos xfld 1 ' + text\n\n # predictions are done on arrays of input.\n # We only have a single input, so turn it into a 1x1 array\n texts = [input_str]\n\n # tokenize using the fastai wrapper around spacy\n tok = Tokenizer().proc_all_mp(partition_by_cores(texts))\n\n # turn into integers for each word\n encoded = [stoi[p] for p in tok[0]]\n\n # we want a [x,1] array where x is the number\n # of words inputted (including the prefix tokens)\n ary = np.reshape(np.array(encoded),(-1,1))\n\n # turn this array into a tensor\n tensor = torch.from_numpy(ary)\n\n # wrap in a torch Variable\n variable = 
Variable(tensor)\n\n # do the predictions\n predictions = model(variable)\n\n # convert back to numpy\n numpy_preds = predictions[0].data.numpy()\n\n return softmax(numpy_preds[0])[0]", "response": "Do the actual prediction on the text using the passed in model and mapping files passed in."} {"instruction": "Can you write a function in Python 3 where it\nloads a model and produces predictions on arbitrary input.", "response": "def predict_input(itos_filename, trained_classifier_filename, num_classes=2):\n \"\"\"\n Loads a model and produces predictions on arbitrary input.\n :param itos_filename: the path to the id-to-string mapping file\n :param trained_classifier_filename: the filename of the trained classifier;\n typically ends with \"clas_1.h5\"\n :param num_classes: the number of classes that the model predicts\n \"\"\"\n # Check the itos file exists\n if not os.path.exists(itos_filename):\n print(\"Could not find \" + itos_filename)\n exit(-1)\n\n # Check the classifier file exists\n if not os.path.exists(trained_classifier_filename):\n print(\"Could not find \" + trained_classifier_filename)\n exit(-1)\n\n stoi, model = load_model(itos_filename, trained_classifier_filename, num_classes)\n\n while True:\n text = input(\"Enter text to analyse (or q to quit): \")\n if text.strip() == 'q':\n break\n else:\n scores = predict_text(stoi, model, text)\n print(\"Result id {0}, Scores: {1}\".format(np.argmax(scores), scores))"} {"instruction": "Given the following Python 3 function, write the documentation\ndef _make_w3c_caps(caps):\n caps = copy.deepcopy(caps)\n profile = caps.get('firefox_profile')\n always_match = {}\n if caps.get('proxy') and caps['proxy'].get('proxyType'):\n caps['proxy']['proxyType'] = caps['proxy']['proxyType'].lower()\n for k, v in caps.items():\n if v and k in _OSS_W3C_CONVERSION:\n always_match[_OSS_W3C_CONVERSION[k]] = v.lower() if k == 'platform' else v\n if k in _W3C_CAPABILITY_NAMES or ':' in k:\n always_match[k] = v\n if profile:\n 
moz_opts = always_match.get('moz:firefoxOptions', {})\n # If it's already present, assume the caller did that intentionally.\n if 'profile' not in moz_opts:\n # Don't mutate the original capabilities.\n new_opts = copy.deepcopy(moz_opts)\n new_opts['profile'] = profile\n always_match['moz:firefoxOptions'] = new_opts\n return {\"firstMatch\": [{}], \"alwaysMatch\": always_match}", "response": "Makes a W3C alwaysMatch capabilities object."} {"instruction": "Write a Python 3 function that can\noverride the current file detector (if necessary) in limited context. Ensures the original file detector is set afterwards. Example: with webdriver.file_detector_context(UselessFileDetector): someinput.send_keys('/etc/hosts') :Args: - file_detector_class - Class of the desired file detector. If the class is different from the current file_detector, then the class is instantiated with args and kwargs and used as a file detector during the duration of the context manager. - args - Optional arguments that get passed to the file detector class during instantiation. - kwargs - Keyword arguments, passed the same way as args.", "response": "def file_detector_context(self, file_detector_class, *args, **kwargs):\n \"\"\"\n Overrides the current file detector (if necessary) in limited context.\n Ensures the original file detector is set afterwards.\n\n Example:\n\n with webdriver.file_detector_context(UselessFileDetector):\n someinput.send_keys('/etc/hosts')\n\n :Args:\n - file_detector_class - Class of the desired file detector. 
If the class is different\n from the current file_detector, then the class is instantiated with args and kwargs\n and used as a file detector during the duration of the context manager.\n - args - Optional arguments that get passed to the file detector class during\n instantiation.\n - kwargs - Keyword arguments, passed the same way as args.\n \"\"\"\n last_detector = None\n if not isinstance(self.file_detector, file_detector_class):\n last_detector = self.file_detector\n self.file_detector = file_detector_class(*args, **kwargs)\n try:\n yield\n finally:\n if last_detector is not None:\n self.file_detector = last_detector"} {"instruction": "Can you write a function in Python 3 where it\nstarts a new session with the desired capabilities.", "response": "def start_session(self, capabilities, browser_profile=None):\n \"\"\"\n Creates a new session with the desired capabilities.\n\n :Args:\n - browser_name - The name of the browser to request.\n - version - Which browser version to request.\n - platform - Which platform to request the browser on.\n - javascript_enabled - Whether the new session should support JavaScript.\n - browser_profile - A selenium.webdriver.firefox.firefox_profile.FirefoxProfile object. 
Only used if Firefox is requested.\n \"\"\"\n if not isinstance(capabilities, dict):\n raise InvalidArgumentException(\"Capabilities must be a dictionary\")\n if browser_profile:\n if \"moz:firefoxOptions\" in capabilities:\n capabilities[\"moz:firefoxOptions\"][\"profile\"] = browser_profile.encoded\n else:\n capabilities.update({'firefox_profile': browser_profile.encoded})\n w3c_caps = _make_w3c_caps(capabilities)\n parameters = {\"capabilities\": w3c_caps,\n \"desiredCapabilities\": capabilities}\n response = self.execute(Command.NEW_SESSION, parameters)\n if 'sessionId' not in response:\n response = response['value']\n self.session_id = response['sessionId']\n self.capabilities = response.get('value')\n\n # if capabilities is none we are probably speaking to\n # a W3C endpoint\n if self.capabilities is None:\n self.capabilities = response.get('capabilities')\n\n # Double check to see if we have a W3C Compliant browser\n self.w3c = response.get('status') is None\n self.command_executor.w3c = self.w3c"} {"instruction": "How would you code a function in Python 3 to\ncreate a web element with the specified element_id.", "response": "def create_web_element(self, element_id):\n \"\"\"Creates a web element with the specified `element_id`.\"\"\"\n return self._web_element_cls(self, element_id, w3c=self.w3c)"} {"instruction": "Create a Python 3 function to\nexecute a command and returns the response.", "response": "def execute(self, driver_command, params=None):\n \"\"\"\n Sends a command to be executed by a command.CommandExecutor.\n\n :Args:\n - driver_command: The name of the command to execute as a string.\n - params: A dictionary of named parameters to send with the command.\n\n :Returns:\n The command's JSON response loaded into a dictionary object.\n \"\"\"\n if self.session_id is not None:\n if not params:\n params = {'sessionId': self.session_id}\n elif 'sessionId' not in params:\n params['sessionId'] = self.session_id\n\n params = self._wrap_value(params)\n 
response = self.command_executor.execute(driver_command, params)\n if response:\n self.error_handler.check_response(response)\n response['value'] = self._unwrap_value(\n response.get('value', None))\n return response\n # If the server doesn't send a response, assume the command was\n # a success\n return {'success': 0, 'value': None, 'sessionId': self.session_id}"} {"instruction": "Can you generate the documentation for the following Python 3 function\ndef find_element_by_id(self, id_):\n return self.find_element(by=By.ID, value=id_)", "response": "Finds an element by id."} {"instruction": "Explain what the following Python 3 code does\ndef find_elements_by_id(self, id_):\n return self.find_elements(by=By.ID, value=id_)", "response": "Finds multiple elements by id."} {"instruction": "Implement a function in Python 3 to\nfind an element by xpath.", "response": "def find_element_by_xpath(self, xpath):\n \"\"\"\n Finds an element by xpath.\n\n :Args:\n - xpath - The xpath locator of the element to find.\n\n :Returns:\n - WebElement - the element if it was found\n\n :Raises:\n - NoSuchElementException - if the element wasn't found\n\n :Usage:\n ::\n\n element = driver.find_element_by_xpath('//div/td[1]')\n \"\"\"\n return self.find_element(by=By.XPATH, value=xpath)"} {"instruction": "Write a Python 3 script for\nfinding multiple elements by xpath.", "response": "def find_elements_by_xpath(self, xpath):\n \"\"\"\n Finds multiple elements by xpath.\n\n :Args:\n - xpath - The xpath locator of the elements to be found.\n\n :Returns:\n - list of WebElement - a list with elements if any was found. 
An\n empty list if not\n\n :Usage:\n ::\n\n elements = driver.find_elements_by_xpath(\"//div[contains(@class, 'foo')]\")\n \"\"\"\n return self.find_elements(by=By.XPATH, value=xpath)"} {"instruction": "Here you have a function in Python 3, explain what it does\ndef find_element_by_link_text(self, link_text):\n return self.find_element(by=By.LINK_TEXT, value=link_text)", "response": "Finds an element by link text."} {"instruction": "Write a Python 3 function for\nfinding elements by link text.", "response": "def find_elements_by_link_text(self, text):\n \"\"\"\n Finds elements by link text.\n\n :Args:\n - link_text: The text of the elements to be found.\n\n :Returns:\n - list of webelement - a list with elements if any was found. an\n empty list if not\n\n :Usage:\n ::\n\n elements = driver.find_elements_by_link_text('Sign In')\n \"\"\"\n return self.find_elements(by=By.LINK_TEXT, value=text)"} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef find_element_by_partial_link_text(self, link_text):\n return self.find_element(by=By.PARTIAL_LINK_TEXT, value=link_text)", "response": "Finds an element by a partial match of its link text."} {"instruction": "Write a Python 3 script to\nfind elements by a partial match of their link text.", "response": "def find_elements_by_partial_link_text(self, link_text):\n \"\"\"\n Finds elements by a partial match of their link text.\n\n :Args:\n - link_text: The text of the element to partial match on.\n\n :Returns:\n - list of webelement - a list with elements if any was found. 
an\n empty list if not\n\n :Usage:\n ::\n\n elements = driver.find_elements_by_partial_link_text('Sign')\n \"\"\"\n return self.find_elements(by=By.PARTIAL_LINK_TEXT, value=link_text)"} {"instruction": "How would you implement a function in Python 3 that\nfinds an element by name.", "response": "def find_element_by_name(self, name):\n \"\"\"\n Finds an element by name.\n\n :Args:\n - name: The name of the element to find.\n\n :Returns:\n - WebElement - the element if it was found\n\n :Raises:\n - NoSuchElementException - if the element wasn't found\n\n :Usage:\n ::\n\n element = driver.find_element_by_name('foo')\n \"\"\"\n return self.find_element(by=By.NAME, value=name)"} {"instruction": "Given the following Python 3 function, write the documentation\ndef find_elements_by_name(self, name):\n return self.find_elements(by=By.NAME, value=name)", "response": "Finds elements by name."} {"instruction": "Write a Python 3 script for\nfinding an element by tag name.", "response": "def find_element_by_tag_name(self, name):\n \"\"\"\n Finds an element by tag name.\n\n :Args:\n - name - name of html tag (eg: h1, a, span)\n\n :Returns:\n - WebElement - the element if it was found\n\n :Raises:\n - NoSuchElementException - if the element wasn't found\n\n :Usage:\n ::\n\n element = driver.find_element_by_tag_name('h1')\n \"\"\"\n return self.find_element(by=By.TAG_NAME, value=name)"} {"instruction": "Can you implement a function in Python 3 that\nfinds elements by tag name.", "response": "def find_elements_by_tag_name(self, name):\n \"\"\"\n Finds elements by tag name.\n\n :Args:\n - name - name of html tag (eg: h1, a, span)\n\n :Returns:\n - list of WebElement - a list with elements if any was found. 
An\n empty list if not\n\n :Usage:\n ::\n\n elements = driver.find_elements_by_tag_name('h1')\n \"\"\"\n return self.find_elements(by=By.TAG_NAME, value=name)"} {"instruction": "Can you create a Python 3 function that\nfinds an element by class name.", "response": "def find_element_by_class_name(self, name):\n \"\"\"\n Finds an element by class name.\n\n :Args:\n - name: The class name of the element to find.\n\n :Returns:\n - WebElement - the element if it was found\n\n :Raises:\n - NoSuchElementException - if the element wasn't found\n\n :Usage:\n ::\n\n element = driver.find_element_by_class_name('foo')\n \"\"\"\n return self.find_element(by=By.CLASS_NAME, value=name)"} {"instruction": "Implement a Python 3 function for\nfinding elements by class name.", "response": "def find_elements_by_class_name(self, name):\n \"\"\"\n Finds elements by class name.\n\n :Args:\n - name: The class name of the elements to find.\n\n :Returns:\n - list of WebElement - a list with elements if any was found. 
An\n empty list if not\n\n :Usage:\n ::\n\n elements = driver.find_elements_by_class_name('foo')\n \"\"\"\n return self.find_elements(by=By.CLASS_NAME, value=name)"} {"instruction": "Write a Python 3 function that can\nfind an element by css selector.", "response": "def find_element_by_css_selector(self, css_selector):\n \"\"\"\n Finds an element by css selector.\n\n :Args:\n - css_selector - CSS selector string, ex: 'a.nav#home'\n\n :Returns:\n - WebElement - the element if it was found\n\n :Raises:\n - NoSuchElementException - if the element wasn't found\n\n :Usage:\n ::\n\n element = driver.find_element_by_css_selector('#foo')\n \"\"\"\n return self.find_element(by=By.CSS_SELECTOR, value=css_selector)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef find_elements_by_css_selector(self, css_selector):\n return self.find_elements(by=By.CSS_SELECTOR, value=css_selector)", "response": "Find elements by css selector."} {"instruction": "Can you generate a brief explanation for the following Python 3 code\ndef execute_async_script(self, script, *args):\n converted_args = list(args)\n if self.w3c:\n command = Command.W3C_EXECUTE_SCRIPT_ASYNC\n else:\n command = Command.EXECUTE_ASYNC_SCRIPT\n\n return self.execute(command, {\n 'script': script,\n 'args': converted_args})['value']", "response": "Asynchronously Executes a JavaScript script in the current window or frame."} {"instruction": "Write a Python 3 function for\nquitting the driver and closes every associated window.", "response": "def quit(self):\n \"\"\"\n Quits the driver and closes every associated window.\n\n :Usage:\n ::\n\n driver.quit()\n \"\"\"\n try:\n self.execute(Command.QUIT)\n finally:\n self.stop_client()\n self.command_executor.close()"} {"instruction": "How would you implement a function in Python 3 that\nreturns the handle of the current window.", "response": "def current_window_handle(self):\n \"\"\"\n Returns the handle of the current window.\n\n :Usage:\n ::\n\n 
driver.current_window_handle\n \"\"\"\n if self.w3c:\n return self.execute(Command.W3C_GET_CURRENT_WINDOW_HANDLE)['value']\n else:\n return self.execute(Command.GET_CURRENT_WINDOW_HANDLE)['value']"} {"instruction": "Can you write a function in Python 3 where it\nreturns the handles of all windows within the current session.", "response": "def window_handles(self):\n \"\"\"\n Returns the handles of all windows within the current session.\n\n :Usage:\n ::\n\n driver.window_handles\n \"\"\"\n if self.w3c:\n return self.execute(Command.W3C_GET_WINDOW_HANDLES)['value']\n else:\n return self.execute(Command.GET_WINDOW_HANDLES)['value']"} {"instruction": "Write a Python 3 function for\nmaximizing the current window that webdriver is using", "response": "def maximize_window(self):\n \"\"\"\n Maximizes the current window that webdriver is using\n \"\"\"\n params = None\n command = Command.W3C_MAXIMIZE_WINDOW\n if not self.w3c:\n command = Command.MAXIMIZE_WINDOW\n params = {'windowHandle': 'current'}\n self.execute(command, params)"} {"instruction": "Can you tell what is the following Python 3 function doing\ndef get_cookie(self, name):\n if self.w3c:\n try:\n return self.execute(Command.GET_COOKIE, {'name': name})['value']\n except NoSuchCookieException:\n return None\n else:\n cookies = self.get_cookies()\n for cookie in cookies:\n if cookie['name'] == name:\n return cookie\n return None", "response": "Get a single cookie by name. 
Returns None if not found."} {"instruction": "Can you tell what is the following Python 3 function doing\ndef implicitly_wait(self, time_to_wait):\n if self.w3c:\n self.execute(Command.SET_TIMEOUTS, {\n 'implicit': int(float(time_to_wait) * 1000)})\n else:\n self.execute(Command.IMPLICIT_WAIT, {\n 'ms': float(time_to_wait) * 1000})", "response": "Sets a sticky timeout to implicitly wait for an element to be found or a command to complete."} {"instruction": "How would you implement a function in Python 3 that\nsets the amount of time that the script should wait during execute_async_script call before throwing an error.", "response": "def set_script_timeout(self, time_to_wait):\n \"\"\"\n Set the amount of time that the script should wait during an\n execute_async_script call before throwing an error.\n\n :Args:\n - time_to_wait: The amount of time to wait (in seconds)\n\n :Usage:\n ::\n\n driver.set_script_timeout(30)\n \"\"\"\n if self.w3c:\n self.execute(Command.SET_TIMEOUTS, {\n 'script': int(float(time_to_wait) * 1000)})\n else:\n self.execute(Command.SET_SCRIPT_TIMEOUT, {\n 'ms': float(time_to_wait) * 1000})"} {"instruction": "Write a Python 3 script to\nset the amount of time to wait for a page load to complete before throwing an error.", "response": "def set_page_load_timeout(self, time_to_wait):\n \"\"\"\n Set the amount of time to wait for a page load to complete\n before throwing an error.\n\n :Args:\n - time_to_wait: The amount of time to wait\n\n :Usage:\n ::\n\n driver.set_page_load_timeout(30)\n \"\"\"\n try:\n self.execute(Command.SET_TIMEOUTS, {\n 'pageLoad': int(float(time_to_wait) * 1000)})\n except WebDriverException:\n self.execute(Command.SET_TIMEOUTS, {\n 'ms': float(time_to_wait) * 1000,\n 'type': 'page load'})"} {"instruction": "How would you implement a function in Python 3 that\nfinds an element given a By strategy and locator.", "response": "def find_element(self, by=By.ID, value=None):\n \"\"\"\n Find an element given a By strategy and 
locator. Prefer the find_element_by_* methods when\n possible.\n\n :Usage:\n ::\n\n element = driver.find_element(By.ID, 'foo')\n\n :rtype: WebElement\n \"\"\"\n if self.w3c:\n if by == By.ID:\n by = By.CSS_SELECTOR\n value = '[id=\"%s\"]' % value\n elif by == By.TAG_NAME:\n by = By.CSS_SELECTOR\n elif by == By.CLASS_NAME:\n by = By.CSS_SELECTOR\n value = \".%s\" % value\n elif by == By.NAME:\n by = By.CSS_SELECTOR\n value = '[name=\"%s\"]' % value\n return self.execute(Command.FIND_ELEMENT, {\n 'using': by,\n 'value': value})['value']"}