Module gatenlp.urlfileutils

Module for functions that help reading binary and textual data from either URLs or local files.

Expand source code
"""
Module for functions that help reading binary and textual data from either URLs or local files.
"""

from typing import Optional, Union
from io import TextIOWrapper
from pathlib import Path
import asyncio
from urllib.parse import ParseResult
from urllib.request import urlopen
# Flag read by the URL helpers in this module: True when the pyodide (in-browser) fallbacks
# have to be used because the requests library could not be imported.
have_pyodide = False
try:
    import requests
except Exception as ex:
    # maybe importing requests failed because we are running in a browser using pyodide?
    try:
        import pyodide
        from pyodide import open_url
        have_pyodide = True
    except Exception:
        # nope, re-raise the original exception
        raise ex

def is_url(ext: Union[str, Path, ParseResult, None]):
    """
    Returns a tuple (True, urlstring) if ext should be interpreted as a (HTTP(s)) URL, otherwise
    (False, pathstring). If ext is None, returns (None, None).

    If ext is a string, it only gets interpreted as a URL if it starts with http:// or https://,
    and if it starts with file:// the remaining path is used (the latter probably only works on
    non-Windows systems). Otherwise the string is interpreted as a file path.
    If ext is a ParseResult it is always interpreted as a URL, if it is a Path object it is always
    interpreted as a path.

    Args:
        ext: something that represents an external resource: a string, a ParseResult i.e.
            the result of urllib.parse.urlparse(..), a Path i.e. the result of Path(somepathstring).

    Returns:
        a tuple (True, urlstring) or (False, pathstring), or (None, None) if ext is None

    Raises:
        Exception: if ext is neither None nor a str, Path or ParseResult
    """
    if ext is None:
        return None, None
    if isinstance(ext, str):
        if ext.startswith(("http://", "https://")):
            return True, ext
        # for now, if we have ext starting with file:// we just remove that part and assume the
        # rest is supposed to be a proper file path
        if ext.startswith("file://"):
            ext = ext[7:]
        return False, ext
    if isinstance(ext, Path):
        return False, str(ext)
    if isinstance(ext, ParseResult):
        return True, ext.geturl()
    raise Exception(f"Odd type: {ext}")

def get_str_from_url(url: Union[str, ParseResult], encoding=None):  # pragma: no cover
    """
    Read a string from the URL.

    Args:
      url: some URL, either as a string or as a ParseResult
      encoding: override the encoding that would have been determined automatically
          (Default value = None). Only applied on the requests code path; the pyodide
          code path returns before the encoding is looked at.

    Returns:
        the string
    """
    if isinstance(url, ParseResult):
        url = url.geturl()
    if have_pyodide:
        # running in the browser: requests is not available, use the pyodide helper instead
        with open_url(url) as infp:
            text = infp.read()
        return text
    req = requests.get(url, allow_redirects=True)
    if encoding is not None:
        req.encoding = encoding
    return req.text

async def get_bytes_from_url_pyodide(url):
    """
    Fetches bytes from the URL using GET.

    Args:
        url: url to use

    Returns:
        the result of awaiting response.bytes() on the pyfetch response
    """
    response = await pyodide.http.pyfetch(url, method="GET")
    # deliberately not named "bytes": that would shadow the builtin type
    data = await response.bytes()
    return data

def get_bytes_from_url(url):  # pragma: no cover
    """
    Reads bytes from url.

    Args:
      url: the URL

    Returns:
        the bytes
    """
    if have_pyodide:
        # NOTE(review): the body of this branch was missing; running the async pyodide helper
        # to completion is assumed here -- confirm this works with the pyodide event loop.
        return asyncio.run(get_bytes_from_url_pyodide(url))
    req = requests.get(url, allow_redirects=True)
    return req.content

def yield_lines_from(url_or_file: Union[str, Path, ParseResult], encoding: str = "utf-8"):  # pragma: no cover
    """
    Yields lines of text from either a file or an URL

    Args:
        url_or_file: either a file path or URL. If this is a string, then it is interpreted as an URL
            only if it starts with http:// or https://, otherwise it can be a parsed urllib url
            or a pathlib path
        encoding: the encoding to use

    Yields:
        the lines as strings; nothing is yielded if url_or_file is None
    """
    isurl, extstr = is_url(url_or_file)
    if isurl is None:
        return
    if isurl:
        # the context manager closes the HTTP response once all lines are consumed
        # or the generator is closed early
        with urlopen(extstr) as infp:
            for raw in infp:
                yield raw.decode(encoding)
    else:
        with open(extstr, "rt", encoding=encoding) as infp:
            yield from infp

def stream_from(url_or_file: Union[str, Path, ParseResult], encoding: Optional[str] = "utf-8"):  # pragma: no cover
    """
    Return an open stream from either the URL or the file, if encoding is None, in binary mode, otherwise
    in text mode with the given encoding.

    The stream is returned open, so closing it is left to the caller.

    Args:
        url_or_file: URL or file
        encoding: if None, open in binary mode, otherwise in text mode with this encoding

    Returns:
        open stream or None if we cannot determine if it is an URL or file
    """
    isurl, extstr = is_url(url_or_file)
    if isurl is None:
        return None
    if isurl:
        tmpfp = urlopen(extstr)
        if encoding is not None:
            # urlopen gives a binary stream, wrap it to get decoded text
            return TextIOWrapper(tmpfp, encoding=encoding)
        return tmpfp
    if encoding is not None:
        return open(extstr, "rt", encoding=encoding)
    return open(extstr, "rb")


def get_bytes_from_url(url)

Reads bytes from url.


Args:
    url: the URL

Returns:
    the bytes

Expand source code
def get_bytes_from_url(url):  # pragma: no cover
    """
    Reads bytes from url.

    Args:
      url: the URL

    Returns:
        the bytes
    """
    if have_pyodide:
        # NOTE(review): the body of this branch was missing; running the async pyodide helper
        # to completion is assumed here -- confirm this works with the pyodide event loop.
        return asyncio.run(get_bytes_from_url_pyodide(url))
    req = requests.get(url, allow_redirects=True)
    return req.content
async def get_bytes_from_url_pyodide(url)

Fetches bytes from the URL using GET.


Args:
    url: url to use



Expand source code
async def get_bytes_from_url_pyodide(url):
    """
    Fetches bytes from the URL using GET.

    Args:
        url: url to use

    Returns:
        the result of awaiting response.bytes() on the pyfetch response
    """
    response = await pyodide.http.pyfetch(url, method="GET")
    # deliberately not named "bytes": that would shadow the builtin type
    data = await response.bytes()
    return data
def get_str_from_url(url: Union[str, urllib.parse.ParseResult], encoding=None)

Read a string from the URL.


Args:
    url: some URL
    encoding: override the encoding that would have been determined automatically (Default value = None)

Returns:
    the string

Expand source code
def get_str_from_url(url: Union[str, ParseResult], encoding=None):  # pragma: no cover
    """
    Read a string from the URL.

    Args:
      url: some URL, either as a string or as a ParseResult
      encoding: override the encoding that would have been determined automatically
          (Default value = None). Only applied on the requests code path; the pyodide
          code path returns before the encoding is looked at.

    Returns:
        the string
    """
    if isinstance(url, ParseResult):
        url = url.geturl()
    if have_pyodide:
        # running in the browser: requests is not available, use the pyodide helper instead
        with open_url(url) as infp:
            text = infp.read()
        return text
    req = requests.get(url, allow_redirects=True)
    if encoding is not None:
        req.encoding = encoding
    return req.text
def is_url(ext: Union[str, pathlib.Path, urllib.parse.ParseResult, None])

Returns a tuple (True, urlstring) if ext should be interpreted as a (HTTP(s)) URL, otherwise (False, pathstring). If ext is None, returns (None, None). If ext is a string, it only gets interpreted as a URL if it starts with http:// or https://, and if it starts with file:// the remaining path is used (the latter probably only works on non-Windows systems). Otherwise the string is interpreted as a file path. If ext is a ParseResult, it is always interpreted as a URL; if it is a Path object, it is always interpreted as a path.


Args:
    ext: something that represents an external resource: a string, a ParseResult i.e. the result of urllib.parse.urlparse(..), a Path i.e. the result of Path(somepathstring).

Returns:
    a tuple (True, urlstring) or (False, pathstring), or (None, None) if ext is None

Expand source code
def is_url(ext: Union[str, Path, ParseResult, None]):
    """
    Returns a tuple (True, urlstring) if ext should be interpreted as a (HTTP(s)) URL, otherwise
    (False, pathstring). If ext is None, returns (None, None).

    If ext is a string, it only gets interpreted as a URL if it starts with http:// or https://,
    and if it starts with file:// the remaining path is used (the latter probably only works on
    non-Windows systems). Otherwise the string is interpreted as a file path.
    If ext is a ParseResult it is always interpreted as a URL, if it is a Path object it is always
    interpreted as a path.

    Args:
        ext: something that represents an external resource: a string, a ParseResult i.e.
            the result of urllib.parse.urlparse(..), a Path i.e. the result of Path(somepathstring).

    Returns:
        a tuple (True, urlstring) or (False, pathstring), or (None, None) if ext is None

    Raises:
        Exception: if ext is neither None nor a str, Path or ParseResult
    """
    if ext is None:
        return None, None
    if isinstance(ext, str):
        if ext.startswith(("http://", "https://")):
            return True, ext
        # for now, if we have ext starting with file:// we just remove that part and assume the
        # rest is supposed to be a proper file path
        if ext.startswith("file://"):
            ext = ext[7:]
        return False, ext
    if isinstance(ext, Path):
        return False, str(ext)
    if isinstance(ext, ParseResult):
        return True, ext.geturl()
    raise Exception(f"Odd type: {ext}")
def stream_from(url_or_file: Union[str, pathlib.Path, urllib.parse.ParseResult], encoding: str = 'utf-8')

Return an open stream from either the URL or the file, if encoding is None, in binary mode, otherwise in text mode with the given encoding.


Args:
    url_or_file: URL or file
    encoding: if None, open in binary mode, otherwise in text mode with this encoding

Returns:
    open stream or None if we cannot determine if it is an URL or file

Expand source code
def stream_from(url_or_file: Union[str, Path, ParseResult], encoding: Optional[str] = "utf-8"):  # pragma: no cover
    """
    Return an open stream from either the URL or the file, if encoding is None, in binary mode, otherwise
    in text mode with the given encoding.

    The stream is returned open, so closing it is left to the caller.

    Args:
        url_or_file: URL or file
        encoding: if None, open in binary mode, otherwise in text mode with this encoding

    Returns:
        open stream or None if we cannot determine if it is an URL or file
    """
    isurl, extstr = is_url(url_or_file)
    if isurl is None:
        return None
    if isurl:
        tmpfp = urlopen(extstr)
        if encoding is not None:
            # urlopen gives a binary stream, wrap it to get decoded text
            return TextIOWrapper(tmpfp, encoding=encoding)
        return tmpfp
    if encoding is not None:
        return open(extstr, "rt", encoding=encoding)
    return open(extstr, "rb")
def yield_lines_from(url_or_file: Union[str, pathlib.Path, urllib.parse.ParseResult], encoding: str = 'utf-8')

Yields lines of text from either a file or an URL


Args:
    url_or_file: either a file path or URL. If this is a string, then it is interpreted as an URL only if it starts with http:// or https://, otherwise it can be a parsed urllib url or a pathlib path
    encoding: the encoding to use
Expand source code
def yield_lines_from(url_or_file: Union[str, Path, ParseResult], encoding: str = "utf-8"):  # pragma: no cover
    """
    Yields lines of text from either a file or an URL

    Args:
        url_or_file: either a file path or URL. If this is a string, then it is interpreted as an URL
            only if it starts with http:// or https://, otherwise it can be a parsed urllib url
            or a pathlib path
        encoding: the encoding to use

    Yields:
        the lines as strings; nothing is yielded if url_or_file is None
    """
    isurl, extstr = is_url(url_or_file)
    if isurl is None:
        return
    if isurl:
        # the context manager closes the HTTP response once all lines are consumed
        # or the generator is closed early
        with urlopen(extstr) as infp:
            for raw in infp:
                yield raw.decode(encoding)
    else:
        with open(extstr, "rt", encoding=encoding) as infp:
            yield from infp