Spark / SPARK-45981 Improve Python language test coverage / SPARK-46148

Fix pyspark.pandas.mlflow.load_model test (Python 3.12)


Details

    Description

      **********************************************************************
      File "/__w/spark/spark/python/pyspark/pandas/mlflow.py", line 172, in pyspark.pandas.mlflow.load_model
      Failed example:
          prediction_df
      Exception raised:
          Traceback (most recent call last):
            File "/usr/lib/python3.10/doctest.py", line 1350, in __run
              exec(compile(example.source, filename, "single",
            File "<doctest pyspark.pandas.mlflow.load_model[18]>", line 1, in <module>
              prediction_df
            File "/__w/spark/spark/python/pyspark/pandas/frame.py", line 13291, in __repr__
              pdf = cast("DataFrame", self._get_or_create_repr_pandas_cache(max_display_count))
            File "/__w/spark/spark/python/pyspark/pandas/frame.py", line 13282, in _get_or_create_repr_pandas_cache
              self, "_repr_pandas_cache", {n: self.head(n + 1)._to_internal_pandas()}
            File "/__w/spark/spark/python/pyspark/pandas/frame.py", line 13277, in _to_internal_pandas
              return self._internal.to_pandas_frame
            File "/__w/spark/spark/python/pyspark/pandas/utils.py", line 599, in wrapped_lazy_property
              setattr(self, attr_name, fn(self))
            File "/__w/spark/spark/python/pyspark/pandas/internal.py", line 1110, in to_pandas_frame
              pdf = sdf.toPandas()
            File "/__w/spark/spark/python/pyspark/sql/pandas/conversion.py", line 213, in toPandas
              rows = self.collect()
            File "/__w/spark/spark/python/pyspark/sql/dataframe.py", line 1369, in collect
              sock_info = self._jdf.collectToPython()
            File "/__w/spark/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1322, in __call__
              return_value = get_return_value(
            File "/__w/spark/spark/python/pyspark/errors/exceptions/captured.py", line 188, in deco
              raise converted from None
          pyspark.errors.exceptions.captured.PythonException: 
            An exception was thrown from the Python worker. Please see the stack trace below.
          Traceback (most recent call last):
            File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1523, in main
              process()
            File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1515, in process
              serializer.dump_stream(out_iter, outfile)
            File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 485, in dump_stream
              return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream)
            File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 101, in dump_stream
              for batch in iterator:
            File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 478, in init_stream_yield_batches
              for series in iterator:
            File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1284, in func
              for result_batch, result_type in result_iter:
            File "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line 1619, in udf
              yield _predict_row_batch(batch_predict_fn, row_batch_args)
            File "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line 1383, in _predict_row_batch
              result = predict_fn(pdf, params)
            File "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line 1601, in batch_predict_fn
              return loaded_model.predict(pdf, params=params)
            File "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line 491, in predict
              return _predict()
            File "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line 477, in _predict
              return self._predict_fn(data, params=params)
            File "/usr/local/lib/python3.10/dist-packages/mlflow/sklearn/__init__.py", line 517, in predict
              return self.sklearn_model.predict(data)
            File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_base.py", line 386, in predict
              return self._decision_function(X)
            File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_base.py", line 369, in _decision_function
              X = self._validate_data(X, accept_sparse=["csr", "csc", "coo"], reset=False)
            File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 580, in _validate_data
              self._check_feature_names(X, reset=reset)
            File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 507, in _check_feature_names
              raise ValueError(message)
          ValueError: The feature names should match those that were passed during fit.
          Feature names unseen at fit time:
          - 0
          - 1
          Feature names seen at fit time, yet now missing:
          - x1
          - x2
      
      
      
          JVM stacktrace:
          org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 1.0 failed 1 times, most recent failure: Lost task 2.0 in stage 1.0 (TID 3) (localhost executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
            File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1523, in main
              process()
            File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1515, in process
              serializer.dump_stream(out_iter, outfile)
            File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 485, in dump_stream
              return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream)
            File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 101, in dump_stream
              for batch in iterator:
            File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 478, in init_stream_yield_batches
              for series in iterator:
            File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1284, in func
              for result_batch, result_type in result_iter:
            File "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line 1619, in udf
              yield _predict_row_batch(batch_predict_fn, row_batch_args)
            File "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line 1383, in _predict_row_batch
          	at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:583)
          	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
          	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:57)
          	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:111)
          	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
          	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
          	at org.apache.spark.scheduler.Task.run(Task.scala:141)
          	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:628)
          	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
          	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
          	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:96)
          	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:631)
          	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
          	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
          	at java.base/java.lang.Thread.run(Thread.java:840)
      
      **********************************************************************
      

      See https://github.com/apache/spark/actions/runs/7020654429/job/19100965399
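      A minimal, Spark-free sketch of the root cause (the sample data, the
      "0"/"1" column names, and LinearRegression here are illustrative
      assumptions, not the project's fix). Per the traceback, the mlflow
      pyfunc UDF hands the sklearn model a pandas batch whose columns are
      the stringified positions "0" and "1", while the doctest's estimator
      was fit on columns named x1 and x2; recent sklearn versions reject
      that mismatch with exactly the ValueError above.

          import pandas as pd
          from sklearn.linear_model import LinearRegression

          # Fit on a DataFrame with named columns: sklearn records
          # feature_names_in_ = ["x1", "x2"].
          train = pd.DataFrame({"x1": [1.0, 2.0, 3.0], "x2": [2.0, 4.0, 6.0]})
          lr = LinearRegression().fit(train, train["x1"] + 2 * train["x2"])

          # What the UDF effectively passes: same values, positional string names.
          batch = pd.DataFrame([[1.0, 2.0]], columns=["0", "1"])
          try:
              lr.predict(batch)
          except ValueError as exc:
              print(exc)  # "The feature names should match those that were passed during fit."

          # One hedged way out: fit on an unnamed array so sklearn records no
          # feature_names_in_, and the name check is skipped at predict time.
          lr2 = LinearRegression().fit(train.to_numpy(), train["x1"] + 2 * train["x2"])
          print(lr2.predict(batch.to_numpy()))

      Whether the doctest should drop feature names at fit time or align the
      batch's column names instead is a choice for the fix; the sketch only
      demonstrates why the Python worker raises.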

People

    Assignee: Hyukjin Kwon (gurwls223)
    Reporter: Hyukjin Kwon (gurwls223)
    Votes: 0
    Watchers: 1
