Details
-
Bug
-
Status: Open
-
Major
-
Resolution: Unresolved
-
10.0.0
-
None
-
None
-
pyarrow 10.0.0
fsspec 2022.7.1
pandas 1.3.3
python 3.8.11
Description
Hey!
I am trying to read a CSV file using pyarrow together with fsspec from HDFS.
I used to do this with pyarrow 9.0.0 and fsspec 2022.7.1, however, after I upgraded to pyarrow 10.0.0 this stopped working.
I am not quite sure if this is an incompatibility introduced in the new pyarrow version or if it is a bug in fsspec. So if I am in the wrong place here, please let me know.
Apart from pyarrow 10.0.0 and fsspec 2022.7.1, I am using pandas version 1.3.3 and python 3.8.11.
Here is the full stack trace:
pd.read_csv("hdfs://10.0.2.15:8020/Projects/testing/testing_Training_Datasets/transactions_view_fraud_batch_fv_1_1/validation/part-00000-42b57ad2-57eb-4a63-bfaa-7375e82863e8-c000.csv") --------------------------------------------------------------------------- OSError Traceback (most recent call last) /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/readers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options) 584 kwds.update(kwds_defaults) 585 --> 586 return _read(filepath_or_buffer, kwds) 587 588 /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/readers.py in _read(filepath_or_buffer, kwds) 480 481 # Create the parser. 
--> 482 parser = TextFileReader(filepath_or_buffer, **kwds) 483 484 if chunksize or iterator: /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/readers.py in _init_(self, f, engine, **kwds) 809 self.options["has_index_names"] = kwds["has_index_names"] 810 --> 811 self._engine = self._make_engine(self.engine) 812 813 def close(self): /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/readers.py in _make_engine(self, engine) 1038 ) 1039 # error: Too many arguments for "ParserBase" -> 1040 return mapping[engine](self.f, **self.options) # type: ignore[call-arg] 1041 1042 def _failover_to_python(self): /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/c_parser_wrapper.py in _init_(self, src, **kwds) 49 50 # open handles ---> 51 self._open_handles(src, kwds) 52 assert self.handles is not None 53 /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/base_parser.py in _open_handles(self, src, kwds) 220 Let the readers open IOHandles after they are done with their potential raises. 221 """ --> 222 self.handles = get_handle( 223 src, 224 "r", /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/common.py in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options) 607 608 # open URLs --> 609 ioargs = _get_filepath_or_buffer( 610 path_or_buf, 611 encoding=encoding, /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/common.py in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options) 356 357 try: --> 358 file_obj = fsspec.open( 359 filepath_or_buffer, mode=fsspec_mode, **(storage_options or {}) 360 ).open() /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/fsspec/core.py in open(self) 133 during the life of the file-like it generates. 
134 """ --> 135 return self._enter_() 136 137 def close(self): /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/fsspec/core.py in _enter_(self) 101 mode = self.mode.replace("t", "").replace("b", "") + "b" 102 --> 103 f = self.fs.open(self.path, mode=mode) 104 105 self.fobjects = [f] /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/fsspec/spec.py in open(self, path, mode, block_size, cache_options, compression, **kwargs) 1092 else: 1093 ac = kwargs.pop("autocommit", not self._intrans) -> 1094 f = self._open( 1095 path, 1096 mode=mode, /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/fsspec/implementations/arrow.py in wrapper(*args, **kwargs) 15 def wrapper(*args, **kwargs): 16 try: ---> 17 return func(*args, **kwargs) 18 except OSError as exception: 19 if not exception.args: /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/fsspec/implementations/arrow.py in _open(self, path, mode, block_size, **kwargs) 157 # disable compression auto-detection 158 _kwargs["compression"] = None --> 159 stream = method(path, **_kwargs) 160 161 return ArrowFile(self, stream, path, mode, block_size, **kwargs) /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pyarrow/_fs.pyx in pyarrow._fs.FileSystem.open_input_stream() /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.pyarrow_internal_check_status() /srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.check_status() OSError: [Errno 22] Opening HDFS file '10.0.2.15:8020/Projects/test/test_Training_Datasets/td_1/td/part-00000-acb66eca-636e-4917-9673-e6646f0b4486-c000.csv' failed. Detail: [errno 22] Invalid argument
However, if I leave out the namenode IP and port, it works as expected:
pd.read_csv("hdfs:///Projects/testing/testing_Training_Datasets/transactions_view_fraud_batch_fv_1_1/validation/part-00000-42b57ad2-57eb-4a63-bfaa-7375e82863e8-c000.csv")
Any help is appreciated, thank you!