Skip to main content

object_store/path/
parts.rs

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
17
18use percent_encoding::{AsciiSet, CONTROLS, percent_encode};
19use std::{
20    borrow::Cow,
21    iter::{self, FusedIterator},
22    str::SplitTerminator,
23};
24
25use crate::path::DELIMITER_BYTE;
26
27/// Error returned by [`PathPart::parse`]
28#[derive(Debug, thiserror::Error)]
29#[error(
30    "Encountered illegal character sequence \"{}\" whilst parsing path segment \"{}\"",
31    illegal,
32    segment
33)]
34#[allow(missing_copy_implementations)]
35pub struct InvalidPart {
36    segment: String,
37    illegal: String,
38}
39
40/// The PathPart type exists to validate the directory/file names that form part
41/// of a path.
42///
43/// A [`PathPart`] is guaranteed to:
44///
45/// * Contain no ASCII control characters or `/`
46/// * Not be a relative path segment, i.e. `.` or `..`
47#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Default, Hash)]
48pub struct PathPart<'a> {
49    pub(super) raw: Cow<'a, str>,
50}
51
52impl<'a> PathPart<'a> {
53    /// Parse the provided path segment as a [`PathPart`] returning an error if invalid
54    pub fn parse(segment: &'a str) -> Result<Self, InvalidPart> {
55        if segment == "." || segment == ".." {
56            return Err(InvalidPart {
57                segment: segment.to_string(),
58                illegal: segment.to_string(),
59            });
60        }
61
62        for c in segment.chars() {
63            if c.is_ascii_control() || c == '/' {
64                return Err(InvalidPart {
65                    segment: segment.to_string(),
66                    // This is correct as only single byte characters up to this point
67                    illegal: c.to_string(),
68                });
69            }
70        }
71
72        Ok(Self {
73            raw: segment.into(),
74        })
75    }
76}
77
78/// Characters we want to encode.
79const INVALID: &AsciiSet = &CONTROLS
80    // The delimiter we are reserving for internal hierarchy
81    .add(DELIMITER_BYTE)
82    // Characters AWS recommends avoiding for object keys
83    // https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingMetadata.html
84    .add(b'\\')
85    .add(b'{')
86    .add(b'^')
87    .add(b'}')
88    .add(b'%')
89    .add(b'`')
90    .add(b']')
91    .add(b'"') // " <-- my editor is confused about double quotes within single quotes
92    .add(b'>')
93    .add(b'[')
94    .add(b'~')
95    .add(b'<')
96    .add(b'#')
97    .add(b'|')
98    // Characters Google Cloud Storage recommends avoiding for object names
99    // https://cloud.google.com/storage/docs/naming-objects
100    .add(b'\r')
101    .add(b'\n')
102    .add(b'*')
103    .add(b'?');
104
105impl<'a> From<&'a [u8]> for PathPart<'a> {
106    fn from(v: &'a [u8]) -> Self {
107        let inner = match v {
108            // We don't want to encode `.` generally, but we do want to disallow parts of paths
109            // to be equal to `.` or `..` to prevent file system traversal shenanigans.
110            b"." => "%2E".into(),
111            b".." => "%2E%2E".into(),
112            other => percent_encode(other, INVALID).into(),
113        };
114        Self { raw: inner }
115    }
116}
117
118impl<'a> From<&'a str> for PathPart<'a> {
119    fn from(v: &'a str) -> Self {
120        Self::from(v.as_bytes())
121    }
122}
123
124impl From<String> for PathPart<'static> {
125    fn from(s: String) -> Self {
126        Self {
127            raw: Cow::Owned(PathPart::from(s.as_str()).raw.into_owned()),
128        }
129    }
130}
131
132impl AsRef<str> for PathPart<'_> {
133    fn as_ref(&self) -> &str {
134        self.raw.as_ref()
135    }
136}
137
138/// See [`Path::parts`](super::Path::parts)
139#[derive(Debug, Clone)]
140pub struct PathParts<'a>(iter::Map<SplitTerminator<'a, char>, fn(&str) -> PathPart<'_>>);
141
142impl<'a> PathParts<'a> {
143    /// Create an iterator over the parts of the provided raw [`Path`](super::Path).
144    pub(super) fn new(raw: &'a str) -> Self {
145        Self(
146            raw.split_terminator(super::DELIMITER_CHAR)
147                .map(|s| PathPart { raw: s.into() }),
148        )
149    }
150}
151
152impl<'a> Iterator for PathParts<'a> {
153    type Item = PathPart<'a>;
154
155    fn next(&mut self) -> Option<Self::Item> {
156        self.0.next()
157    }
158}
159
160impl<'a> FusedIterator for PathParts<'a> {}
161
162impl<'a> DoubleEndedIterator for PathParts<'a> {
163    fn next_back(&mut self) -> Option<Self::Item> {
164        self.0.next_back()
165    }
166}
167
#[cfg(test)]
mod tests {
    use super::*;

    /// The delimiter must never survive inside a single part.
    #[test]
    fn path_part_delimiter_gets_encoded() {
        let part: PathPart<'_> = "foo/bar".into();
        assert_eq!(part.raw, "foo%2Fbar");
    }

    /// `%` is itself encoded, so already-encoded input is encoded again.
    #[test]
    fn path_part_given_already_encoded_string() {
        let part: PathPart<'_> = "foo%2Fbar".into();
        assert_eq!(part.raw, "foo%252Fbar");
    }

    #[test]
    fn path_part_cant_be_one_dot() {
        let part: PathPart<'_> = ".".into();
        assert_eq!(part.raw, "%2E");
    }

    #[test]
    fn path_part_cant_be_two_dots() {
        let part: PathPart<'_> = "..".into();
        assert_eq!(part.raw, "%2E%2E");
    }

    #[test]
    fn path_part_parse() {
        PathPart::parse("foo").unwrap();
        PathPart::parse("foo/bar").unwrap_err();

        // Relative segments and control characters are rejected outright
        PathPart::parse(".").unwrap_err();
        PathPart::parse("..").unwrap_err();
        PathPart::parse("foo\nbar").unwrap_err();

        // Test percent-encoded path
        PathPart::parse("foo%2Fbar").unwrap();
        PathPart::parse("L%3ABC.parquet").unwrap();

        // Test path containing bad escape sequence
        PathPart::parse("%Z").unwrap();
        PathPart::parse("%%").unwrap();
    }
}