Skip to main content

object_store/
parse.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::ObjectStore;
19#[cfg(all(feature = "fs", not(target_arch = "wasm32")))]
20use crate::local::LocalFileSystem;
21use crate::memory::InMemory;
22use crate::path::Path;
23use url::Url;
24
25#[derive(Debug, thiserror::Error)]
26pub enum Error {
27    #[error("Unable to recognise URL \"{}\"", url)]
28    Unrecognised { url: Url },
29
30    #[error(transparent)]
31    Path {
32        #[from]
33        source: crate::path::Error,
34    },
35}
36
37impl From<Error> for super::Error {
38    fn from(e: Error) -> Self {
39        Self::Generic {
40            store: "URL",
41            source: Box::new(e),
42        }
43    }
44}
45
/// The kind of [`ObjectStore`] that a URL refers to
///
/// See [`ObjectStoreScheme::parse`] for how a [`Url`] is classified
///
/// # Supported formats:
/// - `file:///path/to/my/file` -> [`LocalFileSystem`]
/// - `memory:///` -> [`InMemory`]
/// - `s3://bucket/path` -> [`AmazonS3`](crate::aws::AmazonS3) (also supports `s3a`)
/// - `gs://bucket/path` -> [`GoogleCloudStorage`](crate::gcp::GoogleCloudStorage)
/// - `[az|adl|azure|abfs[s]]://container[@<account>.<host>]/path` -> [`MicrosoftAzure`](crate::azure::MicrosoftAzure)
/// - `http://mydomain/path` -> [`HttpStore`](crate::http::HttpStore)
/// - `https://mydomain/path` -> [`HttpStore`](crate::http::HttpStore)
///
/// Some `https://{host}/path` URLs are attributed to a cloud provider based on how the host ends:
/// - `dfs.core.windows.net`, `blob.core.windows.net`, `dfs.fabric.microsoft.com`, `blob.fabric.microsoft.com` -> [`MicrosoftAzure`](crate::azure::MicrosoftAzure)
/// - `amazonaws.com` -> [`AmazonS3`](crate::aws::AmazonS3)
/// - `r2.cloudflarestorage.com` -> [`AmazonS3`](crate::aws::AmazonS3)
#[non_exhaustive] // further variants may be added without a breaking change
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ObjectStoreScheme {
    /// Url corresponding to [`LocalFileSystem`]
    Local,
    /// Url corresponding to [`InMemory`]
    Memory,
    /// Url corresponding to [`AmazonS3`](crate::aws::AmazonS3)
    AmazonS3,
    /// Url corresponding to [`GoogleCloudStorage`](crate::gcp::GoogleCloudStorage)
    GoogleCloudStorage,
    /// Url corresponding to [`MicrosoftAzure`](crate::azure::MicrosoftAzure)
    MicrosoftAzure,
    /// Url corresponding to [`HttpStore`](crate::http::HttpStore)
    Http,
}
80
81impl ObjectStoreScheme {
82    /// Create an [`ObjectStoreScheme`] from the provided [`Url`]
83    ///
84    /// Returns the [`ObjectStoreScheme`] and the remaining [`Path`]
85    ///
86    /// # Example
87    /// ```
88    /// # use url::Url;
89    /// # use object_store::ObjectStoreScheme;
90    /// let url: Url = "file:///path/to/my/file".parse().unwrap();
91    /// let (scheme, path) = ObjectStoreScheme::parse(&url).unwrap();
92    /// assert_eq!(scheme, ObjectStoreScheme::Local);
93    /// assert_eq!(path.as_ref(), "path/to/my/file");
94    ///
95    /// let url: Url = "https://blob.core.windows.net/container/path/to/my/file".parse().unwrap();
96    /// let (scheme, path) = ObjectStoreScheme::parse(&url).unwrap();
97    /// assert_eq!(scheme, ObjectStoreScheme::MicrosoftAzure);
98    /// assert_eq!(path.as_ref(), "path/to/my/file");
99    ///
100    /// let url: Url = "https://example.com/path/to/my/file".parse().unwrap();
101    /// let (scheme, path) = ObjectStoreScheme::parse(&url).unwrap();
102    /// assert_eq!(scheme, ObjectStoreScheme::Http);
103    /// assert_eq!(path.as_ref(), "path/to/my/file");
104    /// ```
105    pub fn parse(url: &Url) -> Result<(Self, Path), Error> {
106        let strip_bucket = || Some(url.path().strip_prefix('/')?.split_once('/')?.1);
107
108        let (scheme, path) = match (url.scheme(), url.host_str()) {
109            ("file", None) => (Self::Local, url.path()),
110            ("memory", None) => (Self::Memory, url.path()),
111            ("s3" | "s3a", Some(_)) => (Self::AmazonS3, url.path()),
112            ("gs", Some(_)) => (Self::GoogleCloudStorage, url.path()),
113            ("az" | "adl" | "azure" | "abfs" | "abfss", Some(_)) => {
114                (Self::MicrosoftAzure, url.path())
115            }
116            ("http", Some(_)) => (Self::Http, url.path()),
117            ("https", Some(host)) => {
118                if host.ends_with("dfs.core.windows.net")
119                    || host.ends_with("blob.core.windows.net")
120                    || host.ends_with("dfs.fabric.microsoft.com")
121                    || host.ends_with("blob.fabric.microsoft.com")
122                {
123                    (Self::MicrosoftAzure, strip_bucket().unwrap_or_default())
124                } else if host.ends_with("amazonaws.com") {
125                    match host.starts_with("s3") {
126                        true => (Self::AmazonS3, strip_bucket().unwrap_or_default()),
127                        false => (Self::AmazonS3, url.path()),
128                    }
129                } else if host.ends_with("r2.cloudflarestorage.com") {
130                    (Self::AmazonS3, strip_bucket().unwrap_or_default())
131                } else {
132                    (Self::Http, url.path())
133                }
134            }
135            _ => return Err(Error::Unrecognised { url: url.clone() }),
136        };
137
138        Ok((scheme, Path::from_url_path(path)?))
139    }
140}
141
/// Builds a boxed store from `$builder`, seeded with `$url` and the given `$options`.
///
/// Each option key is lowercased and parsed into the builder's configuration
/// key type; keys that fail to parse are skipped rather than reported.
#[cfg(feature = "cloud")]
macro_rules! builder_opts {
    ($builder:ty, $url:expr, $options:expr) => {{
        let options = $options.into_iter();
        let mut builder = <$builder>::new().with_url($url.to_string());
        for (key, value) in options {
            if let Ok(config_key) = key.as_ref().to_ascii_lowercase().parse() {
                builder = builder.with_config(config_key, value);
            }
        }
        Box::new(builder.build()?) as _
    }};
}
155
156/// Create an [`ObjectStore`] based on the provided `url`
157///
158/// Returns
159/// - An [`ObjectStore`] of the corresponding type
160/// - The [`Path`] into the [`ObjectStore`] of the addressed resource
161pub fn parse_url(url: &Url) -> Result<(Box<dyn ObjectStore>, Path), super::Error> {
162    parse_url_opts(url, std::iter::empty::<(&str, &str)>())
163}
164
165/// Create an [`ObjectStore`] based on the provided `url` and options
166///
167/// This method can be used to create an instance of one of the provided
168/// `ObjectStore` implementations based on the URL scheme (see
169/// [`ObjectStoreScheme`] for more details).
170///
171/// For example
172/// * `file:///path/to/my/file` will return a [`LocalFileSystem`] instance
173/// * `s3://bucket/path` will return an [`AmazonS3`] instance if the `aws` feature is enabled.
174///
175/// Arguments:
176/// * `url`: The URL to parse
177/// * `options`: A list of key-value pairs to pass to the [`ObjectStore`] builder.
178///   Note different object stores accept different configuration options, so
179///   the options that are read depends on the `url` value. One common pattern
180///   is to pass configuration information via process variables using [`std::env::vars`].
181///
182/// Returns
183/// - An [`ObjectStore`] of the corresponding type
184/// - The [`Path`] into the [`ObjectStore`] of the addressed resource
185///
186/// [`AmazonS3`]: https://docs.rs/object_store/0.12.0/object_store/aws/struct.AmazonS3.html
187pub fn parse_url_opts<I, K, V>(
188    url: &Url,
189    options: I,
190) -> Result<(Box<dyn ObjectStore>, Path), super::Error>
191where
192    I: IntoIterator<Item = (K, V)>,
193    K: AsRef<str>,
194    V: Into<String>,
195{
196    let _options = options;
197    let (scheme, path) = ObjectStoreScheme::parse(url)?;
198    let path = Path::parse(path)?;
199
200    let store = match scheme {
201        #[cfg(all(feature = "fs", not(target_arch = "wasm32")))]
202        ObjectStoreScheme::Local => Box::new(LocalFileSystem::new()) as _,
203        ObjectStoreScheme::Memory => Box::new(InMemory::new()) as _,
204        #[cfg(feature = "aws")]
205        ObjectStoreScheme::AmazonS3 => {
206            builder_opts!(crate::aws::AmazonS3Builder, url, _options)
207        }
208        #[cfg(feature = "gcp")]
209        ObjectStoreScheme::GoogleCloudStorage => {
210            builder_opts!(crate::gcp::GoogleCloudStorageBuilder, url, _options)
211        }
212        #[cfg(feature = "azure")]
213        ObjectStoreScheme::MicrosoftAzure => {
214            builder_opts!(crate::azure::MicrosoftAzureBuilder, url, _options)
215        }
216        #[cfg(feature = "http")]
217        ObjectStoreScheme::Http => {
218            let url = &url[..url::Position::BeforePath];
219            builder_opts!(crate::http::HttpBuilder, url, _options)
220        }
221        #[cfg(not(all(
222            feature = "fs",
223            feature = "aws",
224            feature = "azure",
225            feature = "gcp",
226            feature = "http",
227            not(target_arch = "wasm32")
228        )))]
229        s => {
230            return Err(super::Error::Generic {
231                store: "parse_url",
232                source: format!("feature for {s:?} not enabled").into(),
233            });
234        }
235    };
236
237    Ok((store, path))
238}
239
#[cfg(test)]
mod tests {
    use super::*;

    /// Table-driven check of [`ObjectStoreScheme::parse`]: every URL must yield
    /// the expected scheme and path, and every negative case must be rejected.
    #[test]
    fn test_parse() {
        let cases = [
            ("file:/path", (ObjectStoreScheme::Local, "path")),
            ("file:///path", (ObjectStoreScheme::Local, "path")),
            ("memory:/path", (ObjectStoreScheme::Memory, "path")),
            ("memory:///", (ObjectStoreScheme::Memory, "")),
            ("s3://bucket/path", (ObjectStoreScheme::AmazonS3, "path")),
            ("s3a://bucket/path", (ObjectStoreScheme::AmazonS3, "path")),
            (
                "https://s3.region.amazonaws.com/bucket",
                (ObjectStoreScheme::AmazonS3, ""),
            ),
            (
                "https://s3.region.amazonaws.com/bucket/path",
                (ObjectStoreScheme::AmazonS3, "path"),
            ),
            (
                "https://bucket.s3.region.amazonaws.com",
                (ObjectStoreScheme::AmazonS3, ""),
            ),
            (
                "https://ACCOUNT_ID.r2.cloudflarestorage.com/bucket",
                (ObjectStoreScheme::AmazonS3, ""),
            ),
            (
                "https://ACCOUNT_ID.r2.cloudflarestorage.com/bucket/path",
                (ObjectStoreScheme::AmazonS3, "path"),
            ),
            (
                "abfs://file_system@account_name.dfs.core.windows.net/path",
                (ObjectStoreScheme::MicrosoftAzure, "path"),
            ),
            (
                "abfss://file_system@account_name.dfs.core.windows.net/path",
                (ObjectStoreScheme::MicrosoftAzure, "path"),
            ),
            (
                "https://account.dfs.core.windows.net",
                (ObjectStoreScheme::MicrosoftAzure, ""),
            ),
            (
                "https://account.dfs.core.windows.net/container/path",
                (ObjectStoreScheme::MicrosoftAzure, "path"),
            ),
            (
                "https://account.blob.core.windows.net",
                (ObjectStoreScheme::MicrosoftAzure, ""),
            ),
            (
                "https://account.blob.core.windows.net/container/path",
                (ObjectStoreScheme::MicrosoftAzure, "path"),
            ),
            // Every Azure scheme, with and without an account in the authority
            (
                "az://container/path",
                (ObjectStoreScheme::MicrosoftAzure, "path"),
            ),
            (
                "az://container@account/path",
                (ObjectStoreScheme::MicrosoftAzure, "path"),
            ),
            (
                "abfs://container/path",
                (ObjectStoreScheme::MicrosoftAzure, "path"),
            ),
            (
                "abfs://container@account/path",
                (ObjectStoreScheme::MicrosoftAzure, "path"),
            ),
            (
                "abfss://container/path",
                (ObjectStoreScheme::MicrosoftAzure, "path"),
            ),
            (
                "abfss://container@account/path",
                (ObjectStoreScheme::MicrosoftAzure, "path"),
            ),
            (
                "adl://container/path",
                (ObjectStoreScheme::MicrosoftAzure, "path"),
            ),
            (
                "adl://container@account/path",
                (ObjectStoreScheme::MicrosoftAzure, "path"),
            ),
            (
                "azure://container/path",
                (ObjectStoreScheme::MicrosoftAzure, "path"),
            ),
            (
                "azure://container@account/path",
                (ObjectStoreScheme::MicrosoftAzure, "path"),
            ),
            (
                "gs://bucket/path",
                (ObjectStoreScheme::GoogleCloudStorage, "path"),
            ),
            (
                "gs://test.example.com/path",
                (ObjectStoreScheme::GoogleCloudStorage, "path"),
            ),
            ("http://mydomain/path", (ObjectStoreScheme::Http, "path")),
            ("https://mydomain/path", (ObjectStoreScheme::Http, "path")),
            // Percent-encoding and non-ASCII handling
            (
                "s3://bucket/foo%20bar",
                (ObjectStoreScheme::AmazonS3, "foo bar"),
            ),
            (
                "s3://bucket/foo bar",
                (ObjectStoreScheme::AmazonS3, "foo bar"),
            ),
            ("s3://bucket/😀", (ObjectStoreScheme::AmazonS3, "😀")),
            (
                "s3://bucket/%F0%9F%98%80",
                (ObjectStoreScheme::AmazonS3, "😀"),
            ),
            (
                "https://foo/bar%20baz",
                (ObjectStoreScheme::Http, "bar baz"),
            ),
            (
                "file:///bar%252Efoo",
                (ObjectStoreScheme::Local, "bar%2Efoo"),
            ),
            // Microsoft Fabric endpoints
            (
                "abfss://file_system@account.dfs.fabric.microsoft.com/",
                (ObjectStoreScheme::MicrosoftAzure, ""),
            ),
            (
                "abfs://file_system@account.dfs.fabric.microsoft.com/",
                (ObjectStoreScheme::MicrosoftAzure, ""),
            ),
            (
                "https://account.dfs.fabric.microsoft.com/",
                (ObjectStoreScheme::MicrosoftAzure, ""),
            ),
            (
                "https://account.dfs.fabric.microsoft.com/container",
                (ObjectStoreScheme::MicrosoftAzure, ""),
            ),
            (
                "https://account.dfs.fabric.microsoft.com/container/path",
                (ObjectStoreScheme::MicrosoftAzure, "path"),
            ),
            (
                "https://account.blob.fabric.microsoft.com/",
                (ObjectStoreScheme::MicrosoftAzure, ""),
            ),
            (
                "https://account.blob.fabric.microsoft.com/container",
                (ObjectStoreScheme::MicrosoftAzure, ""),
            ),
            (
                "https://account.blob.fabric.microsoft.com/container/path",
                (ObjectStoreScheme::MicrosoftAzure, "path"),
            ),
        ];

        for (s, (expected_scheme, expected_path)) in cases {
            let url = Url::parse(s).unwrap();
            let (scheme, path) = ObjectStoreScheme::parse(&url).unwrap();

            assert_eq!(scheme, expected_scheme, "{s}");
            assert_eq!(path, Path::parse(expected_path).unwrap(), "{s}");
        }

        // Unknown scheme, and `file` / `memory` URLs that carry a host
        let neg_cases = [
            "unix:/run/foo.socket",
            "file://remote/path",
            "memory://remote/",
        ];
        for s in neg_cases {
            let url = Url::parse(s).unwrap();
            assert!(ObjectStoreScheme::parse(&url).is_err(), "{s}");
        }
    }

    /// Spaces are percent-encoded by [`Url`] and decoded again in the returned path.
    #[test]
    fn test_url_spaces() {
        let url = Url::parse("file:///my file with spaces").unwrap();
        assert_eq!(url.path(), "/my%20file%20with%20spaces");
        let (_, path) = parse_url(&url).unwrap();
        assert_eq!(path.as_ref(), "my file with spaces");
    }

    /// Options given to [`parse_url_opts`] reach the HTTP store: the mock server
    /// asserts on the request path and on the configured user agent.
    #[tokio::test]
    #[cfg(all(feature = "http", not(target_arch = "wasm32")))]
    async fn test_url_http() {
        use crate::{ObjectStoreExt, client::mock_server::MockServer};
        use http::{Response, header::USER_AGENT};

        let server = MockServer::new().await;

        server.push_fn(|r| {
            assert_eq!(r.uri().path(), "/foo/bar");
            assert_eq!(r.headers().get(USER_AGENT).unwrap(), "test_url");
            Response::new(String::from("result"))
        });

        let test = format!("{}/foo/bar", server.url());
        // The upper-case key exercises the case-insensitive option lookup
        let opts = [("USER_AGENT", "test_url"), ("allow_http", "true")];
        let url = test.parse().unwrap();
        let (store, path) = parse_url_opts(&url, opts).unwrap();
        assert_eq!(path.as_ref(), "foo/bar");

        let res = store.get(&path).await.unwrap();
        let body = res.bytes().await.unwrap();
        let body = str::from_utf8(&body).unwrap();
        assert_eq!(body, "result");

        server.shutdown().await;
    }
}