From 7115d1ec162b4059b3e8e8f84bd8d5898c1db025 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Wed, 14 May 2014 19:41:05 -0400 Subject: moved source code to src This is to work around OSX's brain-damange regarding filename case insensitivity. Avoided moving config.hs, because it's a config file. Put in a symlink to make build work. --- src/Utility/FileSystemEncoding.hs | 132 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 src/Utility/FileSystemEncoding.hs (limited to 'src/Utility/FileSystemEncoding.hs') diff --git a/src/Utility/FileSystemEncoding.hs b/src/Utility/FileSystemEncoding.hs new file mode 100644 index 00000000..b81fdc53 --- /dev/null +++ b/src/Utility/FileSystemEncoding.hs @@ -0,0 +1,132 @@ +{- GHC File system encoding handling. + - + - Copyright 2012-2014 Joey Hess + - + - License: BSD-2-clause + -} + +{-# LANGUAGE CPP #-} + +module Utility.FileSystemEncoding ( + fileEncoding, + withFilePath, + md5FilePath, + decodeBS, + decodeW8, + encodeW8, + truncateFilePath, +) where + +import qualified GHC.Foreign as GHC +import qualified GHC.IO.Encoding as Encoding +import Foreign.C +import System.IO +import System.IO.Unsafe +import qualified Data.Hash.MD5 as MD5 +import Data.Word +import Data.Bits.Utils +import qualified Data.ByteString.Lazy as L +#ifdef mingw32_HOST_OS +import qualified Data.ByteString.Lazy.UTF8 as L8 +#endif + +{- Sets a Handle to use the filesystem encoding. This causes data + - written or read from it to be encoded/decoded the same + - as ghc 7.4 does to filenames etc. This special encoding + - allows "arbitrary undecodable bytes to be round-tripped through it". + -} +fileEncoding :: Handle -> IO () +#ifndef mingw32_HOST_OS +fileEncoding h = hSetEncoding h =<< Encoding.getFileSystemEncoding +#else +{- The file system encoding does not work well on Windows, + - and Windows only has utf FilePaths anyway. -} +fileEncoding h = hSetEncoding h Encoding.utf8 +#endif + +{- Marshal a Haskell FilePath into a NUL terminated C string using temporary + - storage. The FilePath is encoded using the filesystem encoding, + - reversing the decoding that should have been done when the FilePath + - was obtained. -} +withFilePath :: FilePath -> (CString -> IO a) -> IO a +withFilePath fp f = Encoding.getFileSystemEncoding + >>= \enc -> GHC.withCString enc fp f + +{- Encodes a FilePath into a String, applying the filesystem encoding. + - + - There are very few things it makes sense to do with such an encoded + - string. It's not a legal filename; it should not be displayed. + - So this function is not exported, but instead used by the few functions + - that can usefully consume it. + - + - This use of unsafePerformIO is belived to be safe; GHC's interface + - only allows doing this conversion with CStrings, and the CString buffer + - is allocated, used, and deallocated within the call, with no side + - effects. + -} +{-# NOINLINE _encodeFilePath #-} +_encodeFilePath :: FilePath -> String +_encodeFilePath fp = unsafePerformIO $ do + enc <- Encoding.getFileSystemEncoding + GHC.withCString enc fp $ GHC.peekCString Encoding.char8 + +{- Encodes a FilePath into a Md5.Str, applying the filesystem encoding. -} +md5FilePath :: FilePath -> MD5.Str +md5FilePath = MD5.Str . _encodeFilePath + +{- Decodes a ByteString into a FilePath, applying the filesystem encoding. -} +decodeBS :: L.ByteString -> FilePath +#ifndef mingw32_HOST_OS +decodeBS = encodeW8 . L.unpack +#else +{- On Windows, we assume that the ByteString is utf-8, since Windows + - only uses unicode for filenames. -} +decodeBS = L8.toString +#endif + +{- Converts a [Word8] to a FilePath, encoding using the filesystem encoding. + - + - w82c produces a String, which may contain Chars that are invalid + - unicode. From there, this is really a simple matter of applying the + - file system encoding, only complicated by GHC's interface to doing so. + -} +{-# NOINLINE encodeW8 #-} +encodeW8 :: [Word8] -> FilePath +encodeW8 w8 = unsafePerformIO $ do + enc <- Encoding.getFileSystemEncoding + GHC.withCString Encoding.char8 (w82s w8) $ GHC.peekCString enc + +{- Useful when you want the actual number of bytes that will be used to + - represent the FilePath on disk. -} +decodeW8 :: FilePath -> [Word8] +decodeW8 = s2w8 . _encodeFilePath + +{- Truncates a FilePath to the given number of bytes (or less), + - as represented on disk. + - + - Avoids returning an invalid part of a unicode byte sequence, at the + - cost of efficiency when running on a large FilePath. + -} +truncateFilePath :: Int -> FilePath -> FilePath +#ifndef mingw32_HOST_OS +truncateFilePath n = go . reverse + where + go f = + let bytes = decodeW8 f + in if length bytes <= n + then reverse f + else go (drop 1 f) +#else +{- On Windows, count the number of bytes used by each utf8 character. -} +truncateFilePath n = reverse . go [] n . L8.fromString + where + go coll cnt bs + | cnt <= 0 = coll + | otherwise = case L8.decode bs of + Just (c, x) | c /= L8.replacement_char -> + let x' = fromIntegral x + in if cnt - x' < 0 + then coll + else go (c:coll) (cnt - x') (L8.drop 1 bs) + _ -> coll +#endif -- cgit v1.3-2-g0d8e