# Source code for sphinx_c_autodoc.loader

"""
Load the c file objects
"""

import json
import os
import re
import textwrap

from collections import OrderedDict, namedtuple
from itertools import takewhile
from typing import Any, List, Optional, Union, Dict, Tuple, Sequence

from bs4 import BeautifulSoup
from bs4.element import Tag
from clang import cindex
from clang.cindex import Cursor, Token, StorageClass

from sphinx_c_autodoc.clang.patches import patch_clang

#: Cursor kinds which clang doesn't auto-populate with the associated comment.
#: :func:`comment_nodes` looks for the comments of these kinds itself.
UNDOCUMENTED_NODES = (cindex.CursorKind.MACRO_DEFINITION,)

#: The first few characters of a comment which indicate a documentation comment.
DOCUMENTATION_COMMENT_START = ("/**", "/*!", "///")

#: The start of a comment that is meant to document the previous item.
TRAILING_COMMENT_START = ("/**<", "/*!<", "///<")

#: Cursor kinds which can be anonymous and still need to be documented.
#: Anonymous structures are usually part of another node, whereas anonymous
#: enums are often used to provide the enumerators without forcing the use of
#: the enum type.
ALLOWED_ANONYMOUS = (cindex.CursorKind.ENUM_DECL,)

# Must do this prior to calling into clang
patch_clang()

#: A light container to mimic a :class:`cindex.Token` for comments. It carries
#: only the comment text (``spelling``) and its source range (``extent``).
#: The misspelled name is kept as-is because it is a module level name.
PsuedoToken = namedtuple("PsuedoToken", ["spelling", "extent"])



[docs]
class DocumentedObject:
    """
    A representation of a c object for documentation purposes.

    Arguments:
        node (:class:`~clang.cindex.Cursor`): The node representing this object.

    Attributes:
        type_ (str): The type of this item one of:

            - object: unknown/unsupported object.
            - file: Should be the root object of a documentation tree.
            - member: Member or field of a struct or union
            - function: A function
            - type: A typedef
            - struct: A structure
            - union: A Union
            - enum: An enumeration
            - enumerator: An enumerator constant
            - macro: A macro
            - variable: A file level variable

        doc (str): The default documentation of the object. This is usually
            the comment with leading '*' removed.
        name (str): The name of the object. For example functions this would
            be *only* the name of the function.
        node (:class:`~clang.cindex.Cursor`): The node representing this object.
        _children: The children of the object. For
            example for structs this would be the members or fields.
        _soup (:class:`~bs4.BeautifulSoup`): The soupified version of
            :attr:`node`'s clang xml comment.
        _declaration (str): The declaration string. For most things this is
            the type as well as the name.
        _line_range (Tuple[int, int]): The line range of the C construct,
            this will include any leading or trailing comments that may be
            part of the construct's documentation.
    """

    type_ = "object"

    def __init__(self, node: Cursor) -> None:
        self.doc = ""
        self.name = ""
        self.node = node
        # The remaining attributes are lazily computed caches, see the
        # matching properties/methods.
        self._children: Optional[OrderedDict] = None
        self._soup: Optional[BeautifulSoup] = None
        self._declaration: Optional[str] = None
        self._line_range: Optional[Tuple[int, int]] = None

    def line_range(self) -> Tuple[int, int]:
        """
        The lines in the source file that this object covers.

        This will include any leading or trailing comments that may be part
        of the construct's documentation.

        Returns:
            Tuple[int, int]: The first and last line of the construct. The
            value is computed once and cached.
        """
        if self._line_range is None:
            node_extent = self.node.extent
            comment_extent = self.node.comment_extent

            # A comment extent without a file means there is no comment
            # associated with the node, so only the node itself counts.
            if comment_extent.start.file is None:
                comment_extent = node_extent

            self._line_range = (
                min(node_extent.start.line, comment_extent.start.line),
                max(node_extent.end.line, comment_extent.end.line),
            )

        return self._line_range

    @property
    def type(self) -> str:
        """
        The type of object, this is the class level :attr:`type_`.
        """
        return self.type_

    @property
    def children(self) -> dict:
        """
        The child objects of this object.

        The mapping is created empty on first access and cached.
        """
        if self._children is None:
            self._children = OrderedDict()

        return self._children

    # pylint: disable=unused-argument
    def format_args(self, **kwargs: Any) -> str:
        """
        Creates the parenthesis version of the function signature.  i.e. this
        will be the `(int hello, int what)` portion of a function.

        The base implementation has no arguments and returns the empty string.
        """
        return ""

    def format_name(self) -> str:
        """
        The name of the object.

        For things like functions and others this will include the return type.
        """
        return self.declaration

    def __str__(self) -> str:
        """
        Will turn this instance into a JSON representation.

        The children are serialized recursively through their own
        :meth:`__str__`.
        """
        obj_dict: Dict[str, Any] = {}
        obj_dict["doc"] = self.doc
        obj_dict["type"] = self.type
        obj_dict["name"] = self.name

        line_range = self.line_range()
        obj_dict["start_line"] = line_range[0]
        obj_dict["end_line"] = line_range[1]

        obj_dict["children"] = [
            json.loads(str(child)) for child in self.children.values()
        ]

        return json.dumps(obj_dict)

    def get_doc(self) -> str:
        """
        Get the documentation paragraph of the item.

        Returns:
            str: The abstract and discussion paragraphs of the clang xml
            comment when one is available, otherwise :attr:`doc`.
        """
        soup = self.soup
        if soup is not None:
            root = soup.contents[0]
            body = self.get_paragraph(root.find("abstract", recursive=False))
            body += self.get_paragraph(root.find("discussion", recursive=False))
            return body

        return self.doc

    @property
    def declaration(self) -> str:
        """
        Declaration for this object. For things like functions it will be
        `void foo(int a, int b)` for variables it will be the type and name
        `char my_var`.
        """
        if self._declaration is None:
            # First try to utilize the clang comment's version as it is assumed
            # to be the more correct.
            self._declaration = self.get_soup_declaration()

        if self._declaration is None:
            # soup failed so fall back to manual parsing
            self._declaration = self.get_parsed_declaration()

        return self._declaration

    def get_parsed_declaration(self) -> str:
        """
        Get the declaration as parsed from the :attr:`node`. This may be
        specific to each :attr:`type`.
        """
        # Declarations are object specific so default to something sane in the
        # off chance an object fails to implement this.
        return self.name

    def get_soup_declaration(self) -> Optional[str]:
        """
        Get the declaration element from :attr:`soup`. If there is no soup or
        no declaration this will return None.
        """
        soup = self.soup
        if soup is None:
            return None

        # Attribute access yields None when the comment xml has no
        # declaration element; report that as "no declaration" so the caller
        # can fall back to the parsed declaration.
        declaration = soup.contents[0].declaration
        if declaration is None:
            return None

        # It seems with different versions of clang the newlines will at times
        # be kept around from some declarations. This causes problems with
        # sphinx as the signature should remain all in one line.
        lines = declaration.text.splitlines()
        return " ".join(line.strip() for line in lines)

    @property
    def soup(self) -> Optional[BeautifulSoup]:
        """
        Get the beautifulsoup representation of this object's comment.

        Returns:
            BeautifulSoup: The xml comment for this C object turned into soup,
            or None when the node has no xml comment.
        """
        if self._soup is None:
            comment = self.node.getParsedComment().as_xml()
            if comment is not None:
                self._soup = BeautifulSoup(comment, features="html.parser")

        return self._soup

    @staticmethod
    def get_paragraph(tag: Optional[Tag]) -> str:
        """
        Get the paragraph contents from `tag`.

        Args:
            tag (:class:`~BeautifulSoup.tag`): The tag to get the paragraph
                contents from.

        Returns:
            str: One of two things:
                - An empty string if `tag` is None.
                - All of the paragraph contents of `tag` with newlines
                  between them along with a trailing newline, otherwise.
        """
        if tag is None:
            return ""

        paragraph = "\n".join(p.text.strip() for p in tag.find_all("para"))
        paragraph += "\n"
        return paragraph

    def is_public(self) -> bool:
        """
        Determines if this item is public.

        C doesn't actually have public/private namespace so we're going to
        make up some rules.

        Constructs are public if:

            - They are in a header file. By nature header files are meant to
              be included in other files, thus they are deemed public.

            - They can be visible outside of the compilation unit. These are
              things the linker can get access to. Mainly this means functions
              and variables that are not static.
        """
        # Here we'll do the most common logic, and let specific constructs that
        # can be public do special logic.
        source_file = self.node.location.file
        if source_file is None:
            # No file is associated with the location, so there is no header
            # to make the construct public.
            return False

        return source_file.name.endswith(".h")





[docs]
class DocumentedFile(DocumentedObject):
    """
    The documented representation of a C file.

    Behaves exactly like :class:`DocumentedObject`, only the reported
    :attr:`type` differs.
    """

    type_ = "file"




[docs]
class DocumentedMacro(DocumentedObject):
    """
    A documented macro
    """

    type_ = "macro"

    def format_args(self, **kwargs: Any) -> str:
        """
        If the macro is function like, gets the parenthesis version of the
        function signature. i.e. this will be the `(_x, _y)` portion of the
        macro function header.
        Otherwise this is the empty string.

        Returns:
            The argument string for this macro.
        """
        decl = self.declaration

        # The logic allows this to be used for both function like and non
        # function like macros.
        # 'SOME_DEFINE'.partition('(')
        # >>>  'SOME_DEFINE', '', ''
        #
        # 'FUNCTION_LIKE(_a, _b)'.partition('(')
        # >>>  'FUNCTION_LIKE', '(', '_a, _b)'
        _, part, args = decl.partition("(")
        return part + args

    def format_name(self) -> str:
        """Format the name for use in sphinx.

        This is the portion of the declaration before the first `(`, so for
        function like macros the argument list is left out.
        """
        decl = self.declaration
        name, _, _ = decl.partition("(")
        return name

    def get_parsed_declaration(self) -> str:
        """
        Creates the full declaration of the macro. For function like macros
        this will include the parenthesised arguments.
        """
        if not self.node.is_macro_function_like():
            return f"{self.name}"

        # We know this must be a function like macro, which means the first 2
        # tokens are `MACRO_NAME` followed by `(`.
        token_iter = self.node.get_tokens()
        next(token_iter)
        next(token_iter)

        # Consume all identifier tokens until the first closing `)`. This will
        # skip over `,` as well as any inline comments.
        # This may have some false positives if there are extra parens in the
        # argument list
        ident_iter = takewhile(lambda x: x.spelling != ")", token_iter)
        tokens = [
            i.spelling for i in ident_iter if i.kind == cindex.TokenKind.IDENTIFIER
        ]

        return f"{self.name}({', '.join(tokens)})"





[docs]
class DocumentedEnumerator(DocumentedObject):
    """
    An enumerator, the constant values in an enum
    """

    type_ = "enumerator"

    def format_name(self) -> str:
        """
        The name of the object.

        Enumerators are shown with only their name, not their declaration.
        """
        return self.name





[docs]
class DocumentedMember(DocumentedObject):
    """
    A documented member of a struct or union.
    """

    type_ = "member"

    def get_parsed_declaration(self) -> str:
        """
        Build up the name from the node. This should be the member's type and
        its name. i.e. for::

            struct foo
            {
                int bar;
                float hello;
            };

        The parsed declaration of `bar` would be `int bar`.
        """
        type_ = self.node.type.spelling

        # Libclang will return the type as `float [20]` when looking at
        # `float foo[20]`.  We could look at the kind `TypeKind.CONSTANTARRAY`
        # but partitioning on the "[" just seems more straight forward.
        type_, *(array) = type_.partition("[")
        array_contents = "".join(array)
        return f"{type_} {self.name} {array_contents}"

    def is_public(self) -> bool:
        """
        Members are always public, because it's their parents that determine
        public versus private.
        """
        return True





[docs]
class DocumentedFunction(DocumentedObject):
    """
    A function specific documented object.
    """

    type_ = "function"

    def get_soup_doc(self) -> Optional[str]:
        """
        Gets the documentation from the :attr:`_soup`.

        Returns:
            The documentation body followed by `:param:` and `:returns:`
            fields, or None when there is no soup or the comment has neither
            parameters nor a result discussion.
        """
        if self.soup is None:
            return None
        root = self.soup.contents[0]

        if not root.find("parameters", recursive=False) and not root.find(
            "resultdiscussion"
        ):
            return None

        body = self.get_paragraph(root.find("abstract", recursive=False))
        body += self.get_paragraph(root.find("discussion", recursive=False))

        # Single newlines are treated as the same paragraph in restructured
        # text, providing 2 results in separate paragraphs.
        body = body.replace("\n", "\n\n")

        for param in root.find_all("parameter"):
            name = param.find("name").text
            param_doc = self.get_paragraph(param.discussion)
            body += f"\n:param {name}: {param_doc}"

        returns = self.get_paragraph(root.find("resultdiscussion", recursive=False))
        if returns:
            body += f"\n:returns: {returns}"

        return body

    def get_doc(self) -> str:
        """
        Get the documentation paragraph of the item.

        The soup based documentation is preferred, :attr:`doc` is the
        fallback.
        """
        soup_doc = self.get_soup_doc()
        if soup_doc is not None:
            return soup_doc

        return self.doc

    def format_args(self, **kwargs: Any) -> str:
        """
        Creates the parenthesis version of the function signature.  i.e. this
        will be the `(int hello, int what)` portion of the function header.
        """
        decl = self.declaration
        _, args = decl.split("(", 1)

        # To keep consistency between clang versions we force the "void"
        # argument. Earlier versions of clang has 'foo()' and newer ones
        # have 'foo(void)'
        if args == ")":  # pragma: no cover
            args = "void)"
        return "(" + args

    def format_name(self) -> str:
        """Format the name of *self.object*.

        This normally should be something that can be parsed by the generated
        directive, but doesn't need to be (Sphinx will display it unparsed
        then).

        This is the portion of the declaration before the first `(`, so it
        includes the return type.
        """
        decl = self.declaration
        name, _ = decl.split("(", 1)
        return name

    def _get_arguments(self) -> str:
        """
        Get the arguments for this instance.

        Returns:
            str: The arguments for use in the function signature.
        """
        func = self.node

        # Early logic used to iterate over, `func.get_arguments()`, however when there
        # is an unknown type clang will sometimes fail to provide tokens for that
        # argument. For example in "unknown_type foo[]" the brackets will cause clang
        # to return back no tokens for the argument.
        start = func.location
        end = func.extent.end
        if func.is_definition():
            # When a function is a definition the last child is the compound statement
            # so we need to move prior to the compound statement
            children = list(func.get_children())
            body_start = children[-1].extent.start.offset
            end = cindex.SourceLocation.from_offset(func.tu, start.file, body_start - 1)

        extent = cindex.SourceRange.from_locations(start, end)
        non_comment_tokens = (
            t
            for t in cindex.TokenGroup.get_tokens(func.tu, extent=extent)
            if t.kind != cindex.TokenKind.COMMENT
        )

        # Even though this will place spaces around all the tokens, the sphinx C domain
        # will provide some formatting to make it look nicer in the final output.
        full_signature = " ".join(t.spelling for t in non_comment_tokens)

        # Everything after the first `(` is the argument list. The tokens are
        # space separated, so stripping `)` only removes the closing paren of
        # the signature and not one belonging to a nested declarator.
        _, _, arguments = full_signature.partition("(")
        arguments = arguments.rstrip(")")
        arguments = arguments.strip()

        return arguments

    def get_parsed_declaration(self) -> str:
        """
        Creates the full declaration of the function from its tokens. i.e.
        this will be `<return type> <name>(<arguments>)`.
        """
        args = self._get_arguments()

        func = self.node
        tu = func.tu

        # For functions the extent encompasses the return value, and the
        # location is the beginning of the functions name.  So we can consume
        # all tokens in between.
        end = cindex.SourceLocation.from_offset(
            tu, func.location.file, func.location.offset - 1
        )
        extent = cindex.SourceRange.from_locations(func.extent.start, end)

        return_type = " ".join(
            t.spelling for t in cindex.TokenGroup.get_tokens(tu, extent=extent)
        )

        return f"{return_type} {func.spelling}({args})"

    def is_public(self) -> bool:
        """
        Functions are public as long as they are not static.
        """
        if self.node.storage_class == StorageClass.STATIC:
            return False

        return True





[docs]
class DocumentedType(DocumentedObject):
    """
    A documented type(def)
    """

    type_ = "type"

    def get_parsed_declaration(self) -> str:
        """
        Build the `typedef ...` declaration from the :attr:`node`.

        For function typedefs the name is spliced into the spelling of the
        underlying type just before its first `)`. All other typedefs are
        `typedef <underlying type> <name>`.
        """
        parent_type = self.node.underlying_typedef_type.spelling

        # Function prototypes need to be handled different. When clang can't
        # successfully parse the file it falls back to naming the return type
        # as the display name.
        # Unfortunately some versions of clang behave a little differently, some
        # will return a `POINTER` while others will return `FUNCTIONNOPROTO`. The
        # `POINTER`s are easy to derive the real type from, but the test
        # environment doesn't use that version of clang.
        type_ = self.node.underlying_typedef_type
        if type_.kind == cindex.TypeKind.POINTER:  # pragma: no cover
            type_ = type_.get_pointee()

        if type_.kind in (
            cindex.TypeKind.FUNCTIONPROTO,
            cindex.TypeKind.FUNCTIONNOPROTO,
        ):
            ret_value, paren, signature = parent_type.partition(")")
            signature = "".join((ret_value, self.name, paren, signature))

            return f"typedef {signature}"

        return f"typedef {parent_type} {self.name}"





[docs]
class DocumentedStructure(DocumentedObject):
    """
    A documented structure
    """

    type_ = "struct"

    @property
    def soup(self) -> None:
        """
        Since structures like objects use the "Members:" and
        "Enumerators:" sections do *not* use the clang xml comments as they
        don't preserve newlines, so the sections get lost.
        """
        return None

    def get_parsed_declaration(self) -> str:
        """
        Structures, and similar, are just name.
        """
        return f"{self.name}"

    @property
    def children(self) -> dict:
        """
        Gets the children, members, of the structure.

        The members are collected from the node on first access and cached.
        They are keyed by the spelling of the member's cursor.
        """
        if self._children is None:
            # Get the first level of the structures members.
            struct = self.node
            self._children = OrderedDict()
            for member in struct.get_children():
                item = object_from_cursor(member)
                if item:
                    self._children[member.spelling] = item

        return self._children




[docs]
class DocumentedUnion(DocumentedStructure):
    """
    A documented union.

    Unions behave exactly like structures, only the :attr:`type` differs.
    """

    type_ = "union"




[docs]
class DocumentedEnum(DocumentedStructure):
    """
    A documented enum.

    Enums behave exactly like structures, only the :attr:`type` differs.
    """

    type_ = "enum"




[docs]
class DocumentedVariable(DocumentedObject):
    """
    Class for file level variables
    """

    type_ = "variable"

    def format_name(self) -> str:
        """Format the name of *self.object*.

        This normally should be something that can be parsed by the generated
        directive, but doesn't need to be (Sphinx will display it unparsed
        then).

        This is the declaration with any assignment removed.
        """
        decl = self.declaration

        # variables which are declared and assigned at the same location will
        # include the assignment in the clang declaration, so strip it out for
        # documentation.
        name, _, _ = decl.partition("=")
        return name

    def is_public(self) -> bool:
        """
        Variables are public as long as they are not static.
        """
        if self.node.storage_class == StorageClass.STATIC:
            return False

        return True

    def get_parsed_declaration(self) -> str:
        """
        Get the declaration as parsed from the :attr:`node`.
        """
        # Libclang will return the type as `float [20]` when looking at
        # `float foo[20]`.  We could look at the kind `TypeKind.CONSTANTARRAY`
        # but partitioning on the "[" just seems more straight forward.
        type_ = self.node.type.spelling
        type_, *(array) = type_.partition("[")
        array_contents = "".join(array)

        # Swap the `int` type for the type found in the tokens. Only the whole
        # word is replaced, a plain substring replace would also rewrite the
        # `int` inside of identifiers such as `uint8_t` or `intptr_t`.
        real_type = self._find_declaration_type()
        type_ = re.sub(r"\bint\b", lambda _: real_type, type_)

        return f"{type_} {self.name} {array_contents}"

    def _find_declaration_type(self) -> str:
        """
        Makes an attempt to try and find the identifier, and storage class,
        representing the variable type.

        Returns:
            str: The type of the variable.  If this can't be derived falls back to
                `int`.
        """
        type_ = "int"
        tokens = [
            t for t in self.node.get_tokens() if t.kind == cindex.TokenKind.IDENTIFIER
        ]
        try:
            # The second to last identifier is taken as the type name.
            type_ = tokens[-2].spelling
        except IndexError:  # pragma: no cover
            # For versions 16 and before, libclang fails to provide the tokens for array
            # variables with unknown types
            pass

        # clang doesn't provide the storage class in the type name, so we'll add it here
        storage_keyword = ""
        storage_class = self.node.storage_class
        if storage_class == cindex.StorageClass.STATIC:
            storage_keyword = "static"
        if storage_class == cindex.StorageClass.EXTERN:
            storage_keyword = "extern"

        return f"{storage_keyword} {type_}"



#: Maps a clang cursor kind to the class used to document it. Kinds which are
#: not listed here are documented with the generic :class:`DocumentedObject`,
#: see :func:`object_from_cursor`.
CURSORKIND_TO_OBJECT_CLASS = {
    cindex.CursorKind.TRANSLATION_UNIT: DocumentedFile,
    cindex.CursorKind.FUNCTION_DECL: DocumentedFunction,
    cindex.CursorKind.STRUCT_DECL: DocumentedStructure,
    cindex.CursorKind.UNION_DECL: DocumentedUnion,
    cindex.CursorKind.ENUM_DECL: DocumentedEnum,
    cindex.CursorKind.FIELD_DECL: DocumentedMember,
    cindex.CursorKind.MACRO_DEFINITION: DocumentedMacro,
    cindex.CursorKind.ENUM_CONSTANT_DECL: DocumentedEnumerator,
    cindex.CursorKind.VAR_DECL: DocumentedVariable,
    cindex.CursorKind.TYPEDEF_DECL: DocumentedType,
}



[docs]
def object_from_cursor(cursor: Cursor) -> Optional[DocumentedObject]:
    """
    Build the documented object which represents `cursor`.

    Args:
        cursor (:class:`cindex.Cursor`): The cursor to create the documented
            object for.

    Returns:
        The documented object, or None when `cursor` is an anonymous
        construct which should not be documented.
    """
    # Prior to clang 16, anonymous constructs would have an empty `spelling` and
    # `displayname`. Clang 16 began naming anonymous constructs as:
    #   "<construct> (anonymous at # <path_to_c_file>:<lineno>)"
    # So the type spelling is what tells whether a construct is anonymous.
    name = cursor.spelling
    anonymous_type = any(
        marker in cursor.type.spelling for marker in ("anonymous at", "unnamed at")
    )

    if not name or anonymous_type:
        # Only anonymous constructs which are not wrapped in a typedef have
        # the "(anonymous at ...)" type spelling, typedef'd ones are resolved
        # by get_nested_node(). Anything not explicitly allowed to be
        # anonymous is left undocumented.
        if not (cursor.kind in ALLOWED_ANONYMOUS and anonymous_type):
            return None

        # The '.' of the extension is not valid in a c identifier. splitext
        # only drops the last extension, which fails for multi dotted file
        # names, so keep everything before the first '.' instead.
        stem = os.path.basename(cursor.location.file.name).partition(".")[0]
        name = f"anon_{stem}_{cursor.hash}"

    target = get_nested_node(cursor)
    documented_class = CURSORKIND_TO_OBJECT_CLASS.get(target.kind, DocumentedObject)

    documented = documented_class(target)
    documented.name = name
    documented.doc = parse_comment(
        PsuedoToken(target.raw_comment, target.comment_extent)
    )

    return documented




[docs]
def get_nested_node(cursor: Cursor) -> Cursor:
    """
    Resolve the struct, union or enum declaration `cursor` may be shadowing.

    Args:
        cursor (:class:`cindex.Cursor`): The cursor to resolve.

    Returns:
        The first child of `cursor` when `cursor` is a typedef, field or
        variable whose first child is a struct, union or enum declaration,
        otherwise `cursor` itself.
    """
    shadowing_kinds = (
        cindex.CursorKind.TYPEDEF_DECL,
        cindex.CursorKind.FIELD_DECL,
        cindex.CursorKind.VAR_DECL,
    )
    if cursor.kind not in shadowing_kinds:
        return cursor

    # There are no children for typedefs of native types, i.e.
    # `typedef int some_int;`
    first_child = next(cursor.get_children(), None)
    if first_child is None:
        return cursor

    nested_kinds = (
        cindex.CursorKind.STRUCT_DECL,
        cindex.CursorKind.UNION_DECL,
        cindex.CursorKind.ENUM_DECL,
    )
    return first_child if first_child.kind in nested_kinds else cursor




[docs]
def get_file_comment(cursor: Cursor, child: Optional[Cursor]) -> str:
    """
    Attempts to get the comment at the top of the file.

    Args:
        cursor (:class:`cindex.Cursor`): The root cursor of the file.
        child (:class:`cindex.Cursor`): The first child node in the file.
            This can be None.

    Returns:
        str: The parsed file level comment. The empty string when the file
        does not start with a comment, or when that comment is the
        documentation comment of `child`.
    """
    # A completely empty file has no tokens at all.
    first_token = next(cursor.get_tokens(), None)
    if first_token is None:
        return ""

    if first_token.kind != cindex.TokenKind.COMMENT:
        return ""

    child_comment = "" if child is None else child.raw_comment

    # When the first comment is the documentation comment of the first child
    # it can't be the file comment as well.
    if child_comment == first_token.spelling:
        return ""

    return parse_comment(first_token)




[docs]
def get_compilation_args(
    filename: str, compilation_database: Optional[str] = None
) -> List[str]:
    """
    Look up the compilation arguments of `filename` in a compilation database.

    The database is loaded from the directory containing
    `compilation_database`.

    Args:
        filename (str): The file to get the compilation arguments for.
        compilation_database (str): The compilation database.

    Returns:
        list[str]: The compilation arguments. This is empty when no database
        is given or the database has no commands for `filename`.
    """
    if not compilation_database:
        return []

    database = cindex.CompilationDatabase.fromDirectory(
        os.path.dirname(compilation_database)
    )
    commands = database.getCompileCommands(filename)
    if not commands:
        return []

    # For now only the first instance of the file in the database is handled.
    command = commands[0]

    # The first argument is the compiler path and the last one is the file to
    # compile, neither is wanted. Includes and defines could be relative, so
    # the working directory of the command is forced as well.
    return [
        *list(command.arguments)[1:-1],
        f"-working-directory={command.directory}",
    ]




[docs]
def load(
    filename: str,
    contents: str,
    compilation_database: Optional[str] = None,
    compilation_args: Optional[Sequence[str]] = None,
) -> DocumentedObject:
    """
    Parse a C file and build its tree of :class:`DocumentedObject` instances.

    Args:
        filename (str): The c file to load into a documented item
        contents (str): The contents of `filename`
        compilation_database (str): The compilation database.
        compilation_args (Sequence[str]): Compilation arguments.  Will be
            applied *after* the ones from the compilation database.

    Returns:
        :class:`DocumentedObject`: The documented version of `filename`.

    """
    clang_args = get_compilation_args(filename, compilation_database)
    if compilation_args:
        clang_args.extend(compilation_args)

    # The contents are handed over as an unsaved file, so clang parses what
    # the caller provided for `filename`.
    translation_unit = cindex.TranslationUnit.from_source(
        filename,
        args=clang_args,
        unsaved_files=[(filename, contents)],
        options=cindex.TranslationUnit.PARSE_DETAILED_PROCESSING_RECORD,
    )
    cursor = translation_unit.cursor

    root_document = DocumentedFile(cursor)

    # Nodes from header includes and compiler defines are not part of the
    # file, so they are skipped. Macro instantiations are the locations where
    # macros get expanded, those don't need documentation either.
    skipped_kinds = (
        cindex.CursorKind.MACRO_INSTANTIATION,
        cindex.CursorKind.INCLUSION_DIRECTIVE,
    )
    nodes = [
        node
        for node in cursor.get_children()
        if node.location.isFromMainFile() and node.kind not in skipped_kinds
    ]

    # Macro definitions always come first in the child list, which may not
    # match their position in the file, so order everything by location.
    nodes.sort(key=lambda node: node.extent.start.offset)

    comment_nodes(cursor, nodes)

    # TODO need to consider a better way to build this up, taking the
    # dictionary and modifying in place isn't ideal.
    children = root_document.children
    for node in nodes:
        documented = object_from_cursor(node)
        if documented:
            children[documented.name] = documented

    root_document.doc = cursor.raw_comment
    root_document.name = os.path.basename(cursor.spelling)
    root_document.node = cursor

    return root_document




[docs]
def comment_nodes(cursor: Cursor, children: List[Cursor]) -> None:
    """
    Attach documentation comments to every node in `children` whose kind is
    listed in :data:`UNDOCUMENTED_NODES`, then set the comment of `cursor`
    itself.

    For each such node the tokens located between the end of the preceding
    node (or the start of `cursor` when there is none) and the line above
    the node are collected.  The last token is offered to the node as a
    leading comment and the first token is offered to the preceding node as
    a trailing comment; :func:`comment_node` decides whether either one is
    actually applied.

    All nodes are modified in place.

    Args:
        cursor (cindex.Cursor): The parent cursor of the child nodes
            which may need to be commented. This is assumed to be the root
            file cursor. Its ``raw_comment`` is assigned from
            :func:`get_file_comment`.

        children (Sequence(cindex.Cursor)): The child nodes which may need to
            be commented, iterated in the order given.
    """
    translation_unit = cursor.tu
    previous = None

    for current in children:
        # :func:`comment_node` repeats this kind check, doing it up front
        # avoids fetching tokens for nodes that will never be commented.
        if current.kind in UNDOCUMENTED_NODES:
            # The extent of a macro definition starts at the macro name, not
            # at the `#define`, so stop the search range at the first column
            # of the line above the node (the end of a range is exclusive).
            #
            #                             <-- range stops on this line
            #     #define SOME_MACRO 23
            #             ^            ^
            #             +-- extent --+
            #
            # This may not be 100% accurate.
            node_start = current.extent.start
            range_end = cindex.SourceLocation.from_position(
                translation_unit, node_start.file, node_start.line - 1, 1
            )

            if previous:
                range_start = previous.extent.end
            else:
                range_start = cursor.extent.start

            gap = cindex.SourceRange.from_locations(range_start, range_end)
            gap_tokens = list(
                cindex.TokenGroup.get_tokens(translation_unit, extent=gap)
            )

            if gap_tokens:
                # Closest token above the node is its leading comment
                # candidate, the first token after the previous node is that
                # node's trailing comment candidate.
                comment_node(current, gap_tokens[-1])
                comment_node(previous, gap_tokens[0], trailing=True)

        previous = current

    leading_child = children[0] if children else None
    cursor.raw_comment = get_file_comment(cursor, leading_child)




[docs]
def comment_node(node: Optional[Cursor], token: Token, trailing: bool = False) -> None:
    """
    Attach `token` to `node` as its documentation comment.

    `node` is left unmodified if any of the following is true:

        - `node` is ``None`` or is not one of the :data:`UNDOCUMENTED_NODES`
        - `token` is not a documentation comment.
        - `token` being a trailing comment does not agree with `trailing`.

    On success ``node.raw_comment`` is set to the token's spelling and
    ``node.comment_extent`` to the token's extent.

    Args:
        node (Cursor): The node to attempt to comment.
        token (Token): The token to use for the commenting.
        trailing (bool): Only comment `node` if `token` is a trailing
            comment, :data:`TRAILING_COMMENT_START`. If False then only
            comment `node` if `token` is a non trailing documentation
            comment.
    """
    if node is None:
        return
    if node.kind not in UNDOCUMENTED_NODES:
        return

    if token.kind != cindex.TokenKind.COMMENT:
        return

    # Every TRAILING_COMMENT_START begins with one of the
    # DOCUMENTATION_COMMENT_START prefixes, so this single check rejects
    # plain comments for both the leading and the trailing case.
    text = token.spelling
    if not text.startswith(DOCUMENTATION_COMMENT_START):
        return

    is_trailing = text.startswith(TRAILING_COMMENT_START)
    if is_trailing != trailing:
        return

    node.raw_comment = text
    node.comment_extent = token.extent




[docs]
def parse_comment(comment: Union[Token, PsuedoToken]) -> str:
    """
    Clean up a C comment such that it no longer has leading `/**`, leading lines
    of `*` or trailing `*/`

    The comment markers are first overwritten with the same number of spaces
    and the common indentation is removed afterwards, so the relative
    indentation of the text inside the comment is kept.

    Args:
        comment (:class:`cindex.Token`): A c comment token from clang. Only
            its ``spelling`` and ``extent.start.column`` are read, and the
            latter only when ``spelling`` is not ``None``.

    Returns:
        str: The comment with the c comment syntax removed. An empty string
        when the spelling of `comment` is ``None``.
    """
    # Happens when there is no documentation comment in the source file for the
    # item.
    spelling = comment.spelling
    if spelling is None:
        return ""

    # Comments from clang start at the '/*' portion, but if the comment itself
    # is indented subsequent lines will have too much indent.
    # Transform::
    #
    #      "/**\n     * hello some comment\n     * on multiple lines\n     */"
    #
    # into::
    #
    #      "/**\n * hello some comment\n * on multiple lines\n */"
    indent = " " * (comment.extent.start.column - 1)
    dedented_comment = textwrap.dedent(indent + spelling)

    # Notes on the regex here.
    #   Option 1 '^[ \t]?\*/?'
    #       This piece will match comment lines that start with '*' or ' *'.
    #       This will also match a trailing '*/' for the end of a comment.
    #       The optional leading character is restricted to a space or tab
    #       on purpose: '\s' also matches a newline, which let an empty line
    #       followed by a line starting with '*' be swallowed as one match,
    #       merging the two lines and losing the paragraph break.
    #
    #   Option 2 '^/\*+<?'
    #       This will match the start of a comment '/*' and consume any
    #       subsequent '*'. This is also meant to catch '/**<' for trailing comments.
    #
    #   Option 3 '\*+/'
    #       Matches any and all '*' up to the end of the comment string.
    #
    # Each match is replaced by spaces of the same width so the columns of
    # the remaining text do not shift before the final dedent.
    contents = re.sub(
        r"^[ \t]?\*/?|^/\*+<?|\*+/",
        lambda match: " " * len(match.group(0)),
        dedented_comment,
        flags=re.MULTILINE,
    )

    contents = textwrap.dedent(contents)

    # there may still be left over newlines so only strip those, but leave any
    # whitespaces.
    return contents.strip("\n")