Apache Arrow / ARROW-18276

[Python] Reading from hdfs using pyarrow 10.0.0 throws OSError: [Errno 22] Opening HDFS file


Details

    • Type: Bug
    • Status: Open
    • Priority: Major
    • Resolution: Unresolved
    • Affects Version/s: 10.0.0
    • Fix Version/s: None
    • Component/s: Python
    • Labels: None
    • Environment: pyarrow 10.0.0, fsspec 2022.7.1, pandas 1.3.3, Python 3.8.11

    Description

      Hey!
      I am trying to read a CSV file from HDFS using pyarrow together with fsspec.
      I used to do this with pyarrow 9.0.0 and fsspec 2022.7.1; however, after upgrading to pyarrow 10.0.0, this stopped working.

      I am not quite sure whether this is an incompatibility introduced in the new pyarrow version or a bug in fsspec, so if I am in the wrong place here, please let me know.

      Apart from pyarrow 10.0.0 and fsspec 2022.7.1, I am using pandas 1.3.3 and Python 3.8.11.
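
      For reference, the failing environment should be reproducible with these pins (Python 3.8.11 assumed):

      pip install pyarrow==10.0.0 fsspec==2022.7.1 pandas==1.3.3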

      Here is the full stack trace:

      pd.read_csv("hdfs://10.0.2.15:8020/Projects/testing/testing_Training_Datasets/transactions_view_fraud_batch_fv_1_1/validation/part-00000-42b57ad2-57eb-4a63-bfaa-7375e82863e8-c000.csv")
      ---------------------------------------------------------------------------
      OSError                                   Traceback (most recent call last)
      /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/readers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
          584     kwds.update(kwds_defaults)
          585 
      --> 586     return _read(filepath_or_buffer, kwds)
          587 
          588 
      /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/readers.py in _read(filepath_or_buffer, kwds)
          480 
          481     # Create the parser.
      --> 482     parser = TextFileReader(filepath_or_buffer, **kwds)
          483 
          484     if chunksize or iterator:
      /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/readers.py in __init__(self, f, engine, **kwds)
          809             self.options["has_index_names"] = kwds["has_index_names"]
          810 
      --> 811         self._engine = self._make_engine(self.engine)
          812 
          813     def close(self):
      /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/readers.py in _make_engine(self, engine)
         1038             )
         1039         # error: Too many arguments for "ParserBase"
      -> 1040         return mapping[engine](self.f, **self.options)  # type: ignore[call-arg]
         1041 
         1042     def _failover_to_python(self):
      /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/c_parser_wrapper.py in __init__(self, src, **kwds)
           49 
           50         # open handles
      ---> 51         self._open_handles(src, kwds)
           52         assert self.handles is not None
           53 
      /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/base_parser.py in _open_handles(self, src, kwds)
          220         Let the readers open IOHandles after they are done with their potential raises.
          221         """
      --> 222         self.handles = get_handle(
          223             src,
          224             "r",
      /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/common.py in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
          607 
          608     # open URLs
      --> 609     ioargs = _get_filepath_or_buffer(
          610         path_or_buf,
          611         encoding=encoding,
      /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/common.py in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
          356 
          357         try:
      --> 358             file_obj = fsspec.open(
          359                 filepath_or_buffer, mode=fsspec_mode, **(storage_options or {})
          360             ).open()
      /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/fsspec/core.py in open(self)
          133         during the life of the file-like it generates.
          134         """
      --> 135         return self.__enter__()
          136 
          137     def close(self):
      /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/fsspec/core.py in __enter__(self)
          101         mode = self.mode.replace("t", "").replace("b", "") + "b"
          102 
      --> 103         f = self.fs.open(self.path, mode=mode)
          104 
          105         self.fobjects = [f]
      /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/fsspec/spec.py in open(self, path, mode, block_size, cache_options, compression, **kwargs)
         1092         else:
         1093             ac = kwargs.pop("autocommit", not self._intrans)
      -> 1094             f = self._open(
         1095                 path,
         1096                 mode=mode,
      /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/fsspec/implementations/arrow.py in wrapper(*args, **kwargs)
           15     def wrapper(*args, **kwargs):
           16         try:
      ---> 17             return func(*args, **kwargs)
           18         except OSError as exception:
           19             if not exception.args:
      /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/fsspec/implementations/arrow.py in _open(self, path, mode, block_size, **kwargs)
          157             # disable compression auto-detection
          158             _kwargs["compression"] = None
      --> 159         stream = method(path, **_kwargs)
          160 
          161         return ArrowFile(self, stream, path, mode, block_size, **kwargs)
      /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pyarrow/_fs.pyx in pyarrow._fs.FileSystem.open_input_stream()
      /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.pyarrow_internal_check_status()
      /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.check_status()
      OSError: [Errno 22] Opening HDFS file '10.0.2.15:8020/Projects/test/test_Training_Datasets/td_1/td/part-00000-acb66eca-636e-4917-9673-e6646f0b4486-c000.csv' failed. Detail: [errno 22] Invalid argument
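
      If I read the traceback correctly, fsspec hands the namenode authority ("10.0.2.15:8020") through to pyarrow as part of the path, so the failure can presumably be reproduced at the pyarrow layer alone. A minimal sketch (untested; host, port and path taken from the call above):

      import pyarrow.fs

      fs = pyarrow.fs.HadoopFileSystem(host="10.0.2.15", port=8020)

      # Mirrors the path that reaches open_input_stream in the traceback:
      # the authority is still glued onto the path, which HDFS rejects.
      fs.open_input_stream("10.0.2.15:8020/Projects/testing/testing_Training_Datasets/transactions_view_fraud_batch_fv_1_1/validation/part-00000-42b57ad2-57eb-4a63-bfaa-7375e82863e8-c000.csv")  # OSError: [Errno 22]

      # A bare absolute path should open fine:
      with fs.open_input_stream("/Projects/testing/testing_Training_Datasets/transactions_view_fraud_batch_fv_1_1/validation/part-00000-42b57ad2-57eb-4a63-bfaa-7375e82863e8-c000.csv") as f:
          f.read(100)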
      

      However, if I leave out the namenode IP and port, it works as expected:

      pd.read_csv("hdfs:///Projects/testing/testing_Training_Datasets/transactions_view_fraud_batch_fv_1_1/validation/part-00000-42b57ad2-57eb-4a63-bfaa-7375e82863e8-c000.csv")
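
      Another workaround that should side-step fsspec's path handling (untested sketch, using the same host, port and path as above) is to open the file through pyarrow directly and hand the file object to pandas:

      import pandas as pd
      import pyarrow.fs

      # Connect to the namenode explicitly, then open with a plain absolute path.
      fs = pyarrow.fs.HadoopFileSystem(host="10.0.2.15", port=8020)
      path = "/Projects/testing/testing_Training_Datasets/transactions_view_fraud_batch_fv_1_1/validation/part-00000-42b57ad2-57eb-4a63-bfaa-7375e82863e8-c000.csv"

      # pyarrow's NativeFile is file-like, so pandas can read from it directly.
      with fs.open_input_stream(path) as f:
          df = pd.read_csv(f)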

      Any help is appreciated, thank you!


People

    Assignee: Unassigned
    Reporter: Moritz Meister (moritzmeister)
    Votes: 0
    Watchers: 4
