Spark / SPARK-12110

spark-1.5.1-bin-hadoop2.6; pyspark.ml.feature Exception: "You must build Spark with Hive"

Details

    • Type: Bug
    • Status: Resolved
    • Priority: Major
    • Resolution: Won't Fix
    • Affects Version/s: 1.5.1
    • Fix Version/s: None
    • Component/s: EC2
    • Labels: None
    • Environment: cluster created using spark-1.5.1-bin-hadoop2.6/ec2/spark-ec2

    Description

      I am using spark-1.5.1-bin-hadoop2.6. I used spark-1.5.1-bin-hadoop2.6/ec2/spark-ec2 to create a cluster and configured spark-env to use python3. I cannot run the Tokenizer sample code; it fails while initializing the HiveContext, as shown below. Is there a workaround? (One possibility is sketched after the sample code below.)

      Kind regards

      Andy

      /root/spark/python/pyspark/sql/context.py in _ssql_ctx(self)
          658             raise Exception("You must build Spark with Hive. "
          659                             "Export 'SPARK_HIVE=true' and run "
      --> 660                             "build/sbt assembly", e)
          661 
          662     def _get_hive_ctx(self):
      
      Exception: ("You must build Spark with Hive. Export 'SPARK_HIVE=true' and run build/sbt assembly", Py4JJavaError('An error occurred while calling None.org.apache.spark.sql.hive.HiveContext.\n', JavaObject id=o38))
      
      
      
      
      http://spark.apache.org/docs/latest/ml-features.html#tokenizer
      
      from pyspark.ml.feature import Tokenizer, RegexTokenizer

      # sqlContext is predefined by the PySpark shell; in the released 1.5.x
      # binaries it is a HiveContext, so this createDataFrame call is what
      # forces the failing Hive initialization.
      sentenceDataFrame = sqlContext.createDataFrame([
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic,regression,models,are,neat")
      ], ["label", "sentence"])
      tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
      wordsDataFrame = tokenizer.transform(sentenceDataFrame)
      for words_label in wordsDataFrame.select("words", "label").take(3):
        print(words_label)
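
      A possible workaround (a sketch, untested on this cluster; plainSqlContext is an illustrative name): the Tokenizer sample needs no Hive features, so the DataFrame can be built with a plain SQLContext instead of the shell's default HiveContext, sidestepping Hive session startup entirely:

      from pyspark.ml.feature import Tokenizer
      from pyspark.sql import SQLContext

      # Build a non-Hive SQLContext from the shell's SparkContext (sc) so that
      # createDataFrame never triggers HiveContext initialization.
      plainSqlContext = SQLContext(sc)

      sentenceDataFrame = plainSqlContext.createDataFrame([
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic,regression,models,are,neat")
      ], ["label", "sentence"])
      tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
      for words_label in tokenizer.transform(sentenceDataFrame).select("words", "label").take(3):
        print(words_label)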
      
      ---------------------------------------------------------------------------
      Py4JJavaError                             Traceback (most recent call last)
      /root/spark/python/pyspark/sql/context.py in _ssql_ctx(self)
          654             if not hasattr(self, '_scala_HiveContext'):
      --> 655                 self._scala_HiveContext = self._get_hive_ctx()
          656             return self._scala_HiveContext
      
      /root/spark/python/pyspark/sql/context.py in _get_hive_ctx(self)
          662     def _get_hive_ctx(self):
      --> 663         return self._jvm.HiveContext(self._jsc.sc())
          664 
      
      /root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
          700         return_value = get_return_value(answer, self._gateway_client, None,
      --> 701                 self._fqn)
          702 
      
      /root/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
           35         try:
      ---> 36             return f(*a, **kw)
           37         except py4j.protocol.Py4JJavaError as e:
      
      /root/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
          299                     'An error occurred while calling {0}{1}{2}.\n'.
      --> 300                     format(target_id, '.', name), value)
          301             else:
      
      Py4JJavaError: An error occurred while calling None.org.apache.spark.sql.hive.HiveContext.
      : java.lang.RuntimeException: java.io.IOException: Filesystem closed
      	at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:522)
      	at org.apache.spark.sql.hive.client.ClientWrapper.<init>(ClientWrapper.scala:171)
      	at org.apache.spark.sql.hive.HiveContext.executionHive$lzycompute(HiveContext.scala:162)
      	at org.apache.spark.sql.hive.HiveContext.executionHive(HiveContext.scala:160)
      	at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:167)
      	at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
      	at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
      	at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
      	at java.lang.reflect.Constructor.newInstance(Constructor.java:422)
      	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:234)
      	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
      	at py4j.Gateway.invoke(Gateway.java:214)
      	at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:79)
      	at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:68)
      	at py4j.GatewayConnection.run(GatewayConnection.java:207)
      	at java.lang.Thread.run(Thread.java:745)
      Caused by: java.io.IOException: Filesystem closed
      	at org.apache.hadoop.hdfs.DFSClient.checkOpen(DFSClient.java:323)
      	at org.apache.hadoop.hdfs.DFSClient.getFileInfo(DFSClient.java:1057)
      	at org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:554)
      	at org.apache.hadoop.hive.ql.session.SessionState.createRootHDFSDir(SessionState.java:599)
      	at org.apache.hadoop.hive.ql.session.SessionState.createSessionDirs(SessionState.java:554)
      	at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:508)
      	... 15 more
      
      
      During handling of the above exception, another exception occurred:
      
      Exception                                 Traceback (most recent call last)
      <ipython-input-1-0beb490d573c> in <module>()
            5   (1, "I wish Java could use case classes"),
            6   (2, "Logistic,regression,models,are,neat")
      ----> 7 ], ["label", "sentence"])
            8 tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
            9 wordsDataFrame = tokenizer.transform(sentenceDataFrame)
      
      /root/spark/python/pyspark/sql/context.py in createDataFrame(self, data, schema, samplingRatio)
          406             rdd, schema = self._createFromLocal(data, schema)
          407         jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
      --> 408         jdf = self._ssql_ctx.applySchemaToPythonRDD(jrdd.rdd(), schema.json())
          409         df = DataFrame(jdf, self)
          410         df._schema = schema
      
      /root/spark/python/pyspark/sql/context.py in _ssql_ctx(self)
          658             raise Exception("You must build Spark with Hive. "
          659                             "Export 'SPARK_HIVE=true' and run "
      --> 660                             "build/sbt assembly", e)
          661 
          662     def _get_hive_ctx(self):
      
      Exception: ("You must build Spark with Hive. Export 'SPARK_HIVE=true' and run build/sbt assembly", Py4JJavaError('An error occurred while calling None.org.apache.spark.sql.hive.HiveContext.\n', JavaObject id=o38))
      

      Attachments

        1. launchCluster.sh (0.7 kB, Andrew Davidson)
        2. launchCluster.sh.out (80 kB, Andrew Davidson)
        3. launchingSparkCluster.md (11 kB, Andrew Davidson)


          People

            Assignee: Unassigned
            Reporter: Andrew Davidson (aedwip)
            Votes: 0
            Watchers: 4
