Uploaded image for project: 'Spark'
  1. Spark
  2. SPARK-28952

Getting error using 'LinearRegression' in Spark 2.4.3

    XMLWordPrintableJSON

Details

    • Bug
    • Status: Resolved
    • Major
    • Resolution: Not A Problem
    • 2.4.3
    • None
    • ML
    • None
    • Hide
      # Load the CSV data into an RDD of raw text lines and cache it,
      # since it is scanned multiple times below.
      autoData= sc.textFile("auto-miles-per-gallon.csv")
      autoData.cache()

      # Remove the header line -- it is the only line containing the
      # literal column name "CYLINDERS".
      dataLines = autoData.filter(lambda x: "CYLINDERS" not in x)
      dataLines.count()

      # Convert the RDD into dense vectors. As part of this exercise we'll:
      # 1. Remove unwanted columns
      # 2. Change non-numeric values to numeric

      import math
      from pyspark.mllib.linalg import Vectors

      # Broadcast a default value used to replace missing ("?") horsepower.
      avgHP=sc.broadcast(80.0)

      def transformToNumeric(inputStr):
          """Parse one CSV line into a dense vector of
          [MPG, CYLINDERS, HORSEPOWER, ACCELERATION, MODELYEAR].

          A missing horsepower value ("?") is replaced with the
          broadcast default in ``avgHP``. All kept columns are
          converted to float explicitly instead of relying on implicit
          numpy coercion of the raw string inside Vectors.dense.
          """
          attList = inputStr.split(",")

          # Replace "?" with the broadcast default; otherwise convert
          # the raw string to float. (No `global` needed: the broadcast
          # variable is only read, never rebound.)
          if attList[3] == "?":
              hpValue = avgHP.value
          else:
              hpValue = float(attList[3])

          # Keep only columns 0, 1, 3, 5 and 6
          # (MPG, CYLINDERS, HP, ACCELERATION, MODELYEAR).
          return Vectors.dense([float(attList[0]),
                                float(attList[1]),
                                hpValue,
                                float(attList[5]),
                                float(attList[6])])

      # Keep only MPG, CYLINDERS, HP, ACCELERATION and MODEL YEAR.
      autoVectors = dataLines.map(transformToNumeric)
      autoVectors.collect()

      # Perform statistical analysis: column-wise summary statistics
      # over the vector RDD (mllib Statistics works on mllib vectors).
      from pyspark.mllib.stat import Statistics
      autoStats = Statistics.colStats(autoVectors)
      autoStats.mean()
      autoStats.variance()
      autoStats.min()
      autoStats.max()

      Statistics.corr(autoVectors)

      # Transform to a DataFrame for input to machine learning.
      # Drop columns that are not required (low correlation).

      from pyspark.sql import SQLContext
      sqlContext = SQLContext(sc)

      def transformToLabeledPoint(inStr):
          """Convert a stats vector into a (label, features) pair.

          The label is MPG (column 0); the features are CYLINDERS, HP
          and MODEL YEAR (columns 1, 2, 4). Column 3 is skipped because
          of its low correlation with the label.

          The features MUST be built with pyspark.ml.linalg.Vectors,
          not pyspark.mllib.linalg.Vectors: the DataFrame-based
          LinearRegression estimator only accepts the ml vector UDT.
          Mixing the two raises the confusing IllegalArgumentException
          "Column features must be of type struct<...> but was actually
          struct<...>" reported in this ticket, because both UDTs print
          the same schema.
          """
          from pyspark.ml.linalg import Vectors as MLVectors
          return (float(inStr[0]),
                  MLVectors.dense([inStr[1], inStr[2], inStr[4]]))

      autoLp = autoVectors.map (transformToLabeledPoint) # new RDD of (label, features) pairs


      # Build a DataFrame with the (label, features) schema expected by
      # the pyspark.ml estimators.
      autoDF = sqlContext.createDataFrame(autoLp,["label","features"])
      autoDF.select("label","features").show(10)

      # Find correlations between the label and each feature column.

      numFeatures = autoDF.take(1)[0].features.size
      labelRDD =autoDF.rdd.map(lambda lp: float(lp.label))

      for i in range(numFeatures):
          # NOTE(review): the lambda captures `i` by reference (late
          # binding), but Statistics.corr forces evaluation within the
          # same iteration, so the current value of `i` is used.
          featureRDD = autoDF.rdd.map(lambda lp: lp.features[i])
          corr = Statistics.corr(labelRDD,featureRDD, 'pearson')
          print("%d\t%g" % (i,corr))

      # Split into training and testing data (90/10; unseeded, so the
      # split is not reproducible across runs).

      (trainingData,testData) =autoDF.randomSplit([0.9,0.1])
      trainingData.count()
      testData.count()

      # Build the model on the training data. LinearRegression requires
      # the "features" column to hold pyspark.ml.linalg vectors.
      from pyspark.ml.regression import LinearRegression
      lr = LinearRegression(maxIter =10)
      lrModel = lr.fit(trainingData)
      Show
      #Load the csv data into a RDD autoData= sc.textFile("auto-miles-per-gallon.csv") autoData.cache() #Remove the first line that contains header dataLines = autoData.filter(lambda x: "CYLINDERS" not in x) dataLines.count() # Convert the RDD into a Dense Vector. As a part of this excercise we'll # 1. Remove unwanted columns # 2. change non-numeric values to numeric import math from pyspark.mllib.linalg import Vectors #Use default for average HP avgHP=sc.broadcast(80.0) def transformToNumeric(inputStr):     global avgHP     attList= inputStr.split(",")          #Replace ? values with a normal value     hpvalue= attList[3]     if hpvalue =="?":         hpvalue= avgHP.value     # Filterout the columns which are not needed at this stage     values = Vectors.dense([float(attList[0]), float(attList[1]), \                             hpvalue, \                             float(attList[5]),                             float(attList[6])])     return values #Keep only MPG, CYLINDERS, HP, ACCELERATION and MODEL YEAR autoVectors = dataLines.map(transformToNumeric) autoVectors.collect() # Perform statistical analysis from pyspark.mllib.stat import Statistics autoStats = Statistics.colStats(autoVectors) autoStats.mean() autoStats.variance() autoStats.min() autoStats.max() Statistics.corr(autoVectors) #Transform to a dtatframe for input to machine learning # Drop columns that are nor required (Low-correlation) from pyspark.sql import SQLContext sqlContext = SQLContext(sc) def transformToLabeledPoint(inStr):     # We're ignoring 3rd column because of low co-relation value     lp= ( float(inStr[0]), Vectors.dense([inStr[1],inStr[2],inStr[4]]))     return lp autoLp = autoVectors.map (transformToLabeledPoint) #this will be a new RDD autoDF = sqlContext.createDataFrame(autoLp,["label","features"]) autoDF.select("label","features").show(10) #Find Corrrelations numFeatures = autoDF.take(1)[0].features.size labelRDD =autoDF.rdd.map(lambda lp: float(lp.label)) for i in 
range(numFeatures):     featureRDD = autoDF.rdd.map(lambda lp: lp.features[i])     corr = Statistics.corr(labelRDD,featureRDD, 'pearson')     print("%d\t%g" % (i,corr))      # Split into Traning and Testing data (trainingData,testData) =autoDF.randomSplit([0.9,0.1]) trainingData.count() testData.count() # Build the Model on training data from pyspark.ml.regression import LinearRegression lr = LinearRegression(maxIter =10) lrModel = lr.fit(trainingData)

    Description

      Getting the following error while fitting the 'LinearRegression':

       

      File "C:\Spark\spark-2.4.3-bin-hadoop2.7\python\pyspark\sql\utils.py", line 79, in deco
      raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)

      IllegalArgumentException: 'requirement failed: Column features must be of type struct<type:tinyint,size:int,indices:array<int>,values:array<double>> but was actually struct<type:tinyint,size:int,indices:array<int>,values:array<double>>.'

      Attachments

        Activity

          People

            Unassigned Unassigned
            sandeep.50g@gmail.com Sandeep Singh
            Votes:
            0 Vote for this issue
            Watchers:
            1 Start watching this issue

            Dates

              Created:
              Updated:
              Resolved: