Uploaded image for project: 'Spark'
  1. Spark
  2. SPARK-28952

Getting error using 'LinearRegression' in Spark 2.4.3

    XMLWordPrintableJSON

Details

    • Bug
    • Status: Resolved
    • Major
    • Resolution: Not A Problem
    • 2.4.3
    • None
    • ML
    • None
    • Hide
      # Load the CSV data into an RDD of raw text lines and cache it,
      # since it is scanned multiple times below.
      autoData= sc.textFile("auto-miles-per-gallon.csv")
      autoData.cache()

      # Remove the header line -- it is the only line containing the
      # literal column name "CYLINDERS".
      dataLines = autoData.filter(lambda x: "CYLINDERS" not in x)
      dataLines.count()

      # Convert the RDD into dense vectors. As part of this exercise we'll:
      # 1. Remove unwanted columns
      # 2. Change non-numeric values to numeric

      import math
      from pyspark.mllib.linalg import Vectors

      # Broadcast a default value used to replace missing ("?") horsepower.
      avgHP=sc.broadcast(80.0)

      def transformToNumeric(inputStr):
          """Parse one CSV line into a dense vector of
          [MPG, CYLINDERS, HORSEPOWER, ACCELERATION, MODELYEAR].

          A missing horsepower value ("?") is replaced with the
          broadcast default in ``avgHP``. All kept columns are
          converted to float explicitly instead of relying on implicit
          numpy coercion of the raw string inside Vectors.dense.
          """
          attList = inputStr.split(",")

          # Replace "?" with the broadcast default; otherwise convert
          # the raw string to float. (No `global` needed: the broadcast
          # variable is only read, never rebound.)
          if attList[3] == "?":
              hpValue = avgHP.value
          else:
              hpValue = float(attList[3])

          # Keep only columns 0, 1, 3, 5 and 6
          # (MPG, CYLINDERS, HP, ACCELERATION, MODELYEAR).
          return Vectors.dense([float(attList[0]),
                                float(attList[1]),
                                hpValue,
                                float(attList[5]),
                                float(attList[6])])

      # Keep only MPG, CYLINDERS, HP, ACCELERATION and MODEL YEAR.
      autoVectors = dataLines.map(transformToNumeric)
      autoVectors.collect()

      # Perform statistical analysis: column-wise summary statistics
      # over the vector RDD (mllib Statistics works on mllib vectors).
      from pyspark.mllib.stat import Statistics
      autoStats = Statistics.colStats(autoVectors)
      autoStats.mean()
      autoStats.variance()
      autoStats.min()
      autoStats.max()

      Statistics.corr(autoVectors)

      # Transform to a DataFrame for input to machine learning.
      # Drop columns that are not required (low correlation).

      from pyspark.sql import SQLContext
      sqlContext = SQLContext(sc)

      def transformToLabeledPoint(inStr):
          """Convert a stats vector into a (label, features) pair.

          The label is MPG (column 0); the features are CYLINDERS, HP
          and MODEL YEAR (columns 1, 2, 4). Column 3 is skipped because
          of its low correlation with the label.

          The features MUST be built with pyspark.ml.linalg.Vectors,
          not pyspark.mllib.linalg.Vectors: the DataFrame-based
          LinearRegression estimator only accepts the ml vector UDT.
          Mixing the two raises the confusing IllegalArgumentException
          "Column features must be of type struct<...> but was actually
          struct<...>" reported in this ticket, because both UDTs print
          the same schema.
          """
          from pyspark.ml.linalg import Vectors as MLVectors
          return (float(inStr[0]),
                  MLVectors.dense([inStr[1], inStr[2], inStr[4]]))

      autoLp = autoVectors.map (transformToLabeledPoint) # new RDD of (label, features) pairs


      # Build a DataFrame with the (label, features) schema expected by
      # the pyspark.ml estimators.
      autoDF = sqlContext.createDataFrame(autoLp,["label","features"])
      autoDF.select("label","features").show(10)

      # Find correlations between the label and each feature column.

      numFeatures = autoDF.take(1)[0].features.size
      labelRDD =autoDF.rdd.map(lambda lp: float(lp.label))

      for i in range(numFeatures):
          # NOTE(review): the lambda captures `i` by reference (late
          # binding), but Statistics.corr forces evaluation within the
          # same iteration, so the current value of `i` is used.
          featureRDD = autoDF.rdd.map(lambda lp: lp.features[i])
          corr = Statistics.corr(labelRDD,featureRDD, 'pearson')
          print("%d\t%g" % (i,corr))

      # Split into training and testing data (90/10; unseeded, so the
      # split is not reproducible across runs).

      (trainingData,testData) =autoDF.randomSplit([0.9,0.1])
      trainingData.count()
      testData.count()

      # Build the model on the training data. LinearRegression requires
      # the "features" column to hold pyspark.ml.linalg vectors.
      from pyspark.ml.regression import LinearRegression
      lr = LinearRegression(maxIter =10)
      lrModel = lr.fit(trainingData)
      Show
      #Load the csv data into a RDD autoData= sc.textFile("auto-miles-per-gallon.csv") autoData.cache() #Remove the first line that contains header dataLines = autoData.filter(lambda x: "CYLINDERS" not in x) dataLines.count() # Convert the RDD into a Dense Vector. As a part of this excercise we'll # 1. Remove unwanted columns # 2. change non-numeric values to numeric import math from pyspark.mllib.linalg import Vectors #Use default for average HP avgHP=sc.broadcast(80.0) def transformToNumeric(inputStr):     global avgHP     attList= inputStr.split(",")          #Replace ? values with a normal value     hpvalue= attList[3]     if hpvalue =="?":         hpvalue= avgHP.value     # Filterout the columns which are not needed at this stage     values = Vectors.dense([float(attList[0]), float(attList[1]), \                             hpvalue, \                             float(attList[5]),                             float(attList[6])])     return values #Keep only MPG, CYLINDERS, HP, ACCELERATION and MODEL YEAR autoVectors = dataLines.map(transformToNumeric) autoVectors.collect() # Perform statistical analysis from pyspark.mllib.stat import Statistics autoStats = Statistics.colStats(autoVectors) autoStats.mean() autoStats.variance() autoStats.min() autoStats.max() Statistics.corr(autoVectors) #Transform to a dtatframe for input to machine learning # Drop columns that are nor required (Low-correlation) from pyspark.sql import SQLContext sqlContext = SQLContext(sc) def transformToLabeledPoint(inStr):     # We're ignoring 3rd column because of low co-relation value     lp= ( float(inStr[0]), Vectors.dense([inStr[1],inStr[2],inStr[4]]))     return lp autoLp = autoVectors.map (transformToLabeledPoint) #this will be a new RDD autoDF = sqlContext.createDataFrame(autoLp,["label","features"]) autoDF.select("label","features").show(10) #Find Corrrelations numFeatures = autoDF.take(1)[0].features.size labelRDD =autoDF.rdd.map(lambda lp: float(lp.label)) for i in 
range(numFeatures):     featureRDD = autoDF.rdd.map(lambda lp: lp.features[i])     corr = Statistics.corr(labelRDD,featureRDD, 'pearson')     print("%d\t%g" % (i,corr))      # Split into Traning and Testing data (trainingData,testData) =autoDF.randomSplit([0.9,0.1]) trainingData.count() testData.count() # Build the Model on training data from pyspark.ml.regression import LinearRegression lr = LinearRegression(maxIter =10) lrModel = lr.fit(trainingData)

    Description

      Getting the following error while fitting the 'LinearRegression':

       

      File "C:\Spark\spark-2.4.3-bin-hadoop2.7\python\pyspark\sql\utils.py", line 79, in deco
      raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)

      IllegalArgumentException: 'requirement failed: Column features must be of type struct<type:tinyint,size:int,indices:array<int>,values:array<double>> but was actually struct<type:tinyint,size:int,indices:array<int>,values:array<double>>.'

      Attachments

        Activity

          People

            Unassigned Unassigned
            sandeep.50g@gmail.com Sandeep Singh
            Votes:
            0 Vote for this issue
            Watchers:
            1 Start watching this issue

            Dates

              Created:
              Updated:
              Resolved: