# Source code for code_index.language_processor.impl_c

# code_index/language_processor/impl_c.py

"""C language processor implementation.

This module provides a concrete implementation of the LanguageProcessor protocol
for C source code. It handles C-specific syntax for function definitions and
function calls using tree-sitter.

The processor supports:
- Function definitions with various declaration patterns
- Function calls and references
- Handling of function pointers and complex declarators
"""

from tree_sitter import Node
from tree_sitter_language_pack import get_language

from ..models import (
    CodeLocation,
    Definition,
    Function,
    Reference,
    Symbol,
    SymbolReference,
)
from .base import BaseLanguageProcessor, QueryContext



[docs]
class CProcessor(BaseLanguageProcessor):
    """Language processor for C source code.

    Registers itself for ``.c`` and ``.h`` files and works on two kinds of
    tree-sitter nodes: ``function_definition`` nodes (definitions) and
    ``call_expression`` nodes (references).

    Function definitions are recognized in these forms, among others:
    - Simple function definitions
    - Functions with storage class specifiers
    - Functions returning pointers

    NOTE(review): a call is only recorded when its callee is a plain
    identifier, so calls made through function pointers held in structs or
    other non-identifier expressions are skipped -- confirm this is intended.
    """


[docs]
    def __init__(self):
        """Configure the base processor for C.

        Passes the language name, the tree-sitter grammar, the handled file
        extensions, and the two query strings to the base class. Definitions
        are captured as whole ``function_definition`` nodes and references as
        whole ``call_expression`` nodes.
        """
        definition_query = """
                (function_definition) @function.definition
            """
        reference_query = """
                (call_expression) @function.call
            """
        super().__init__(
            name="c",
            language=get_language("c"),
            extensions=[".c", ".h"],
            def_query_str=definition_query,
            ref_query_str=reference_query,
        )



[docs]
    def handle_definition(
        self,
        node: Node,
        ctx: QueryContext,
    ) -> tuple[Symbol, Definition] | None:
        """Build the symbol and definition record for one C function definition.

        The name comes from ``_extract_function_name``; when that yields
        nothing the node is skipped. Otherwise the comments in front of the
        function become its documentation, and every reference node found in
        the ``body`` field that ``handle_reference`` accepts is recorded as an
        outgoing call.

        Args:
            node: A function_definition syntax tree node.
            ctx: Query context supplying the file path and source bytes.

        Returns:
            A ``(Function, Definition)`` pair whose location spans the whole
            definition node, or None when no function name could be extracted.
        """
        name = self._extract_function_name(node, ctx)
        if not name:
            return None

        documentation = self._extract_preceding_comment(node, ctx)

        outgoing = []
        body = node.child_by_field_name("body")
        if body:
            resolved = (
                self.handle_reference(call_site, ctx)
                for call_site in self.get_reference_nodes(body)
            )
            # Calls that handle_reference rejects come back as None and are dropped.
            outgoing = [
                SymbolReference(symbol=callee, reference=ref.to_pure())
                for callee, ref in filter(None, resolved)
            ]

        symbol = Function(name=name)
        first, last = node.start_point, node.end_point
        span = CodeLocation(
            file_path=ctx.file_path,
            start_lineno=first[0] + 1,  # row index shifted to a 1-based line number
            start_col=first[1],
            end_lineno=last[0] + 1,
            end_col=last[1],
            start_byte=node.start_byte,
            end_byte=node.end_byte,
        )
        return symbol, Definition(location=span, doc=documentation, calls=outgoing)



[docs]
    def _extract_function_name(self, function_def_node: Node, ctx: QueryContext) -> str | None:
        """Extract function name from a function_definition node.

        Handles various C function declaration patterns by traversing the
        declarator field which may be either a function_declarator or
        pointer_declarator containing a function_declarator.

        Args:
            function_def_node: The function_definition node to process.
            ctx: Query context for accessing source bytes.

        Returns:
            The function name as a string, or None if extraction fails.
        """
        # Find the declarator field, could be function_declarator or pointer_declarator
        declarator_node = function_def_node.child_by_field_name("declarator")
        if not declarator_node:
            return None

        # If it's a pointer_declarator, search for nested function_declarator
        if declarator_node.type == "pointer_declarator":
            # Look for function_declarator in pointer_declarator children
            for child in declarator_node.children:
                if child.type == "function_declarator":
                    declarator_node = child
                    break
            else:
                return None

        # Now declarator_node should be function_declarator
        if declarator_node.type != "function_declarator":
            return None

        # Extract function name from function_declarator
        name_node = declarator_node.child_by_field_name("declarator")
        if not name_node or name_node.type != "identifier":
            return None

        return ctx.source_bytes[name_node.start_byte : name_node.end_byte].decode("utf8")



[docs]
    def handle_reference(
        self,
        node: Node,
        ctx: QueryContext,
    ) -> tuple[Symbol, Reference] | None:
        """Process a C function call expression.

        Handles call_expression nodes to extract the called function name.
        Uses the entire call_expression range including function name,
        parentheses, and arguments for accurate location tracking.

        Args:
            node: A call_expression syntax tree node.
            ctx: Query context containing file information.

        Returns:
            A tuple of (Function, PureReference) if successful, None if the call
            expression doesn't have a recognizable function identifier.
        """
        name_node = node.child_by_field_name("function")
        if not name_node or name_node.type != "identifier":
            return None

        func_name = ctx.source_bytes[name_node.start_byte : name_node.end_byte].decode("utf8")

        # Use the entire call_expression node range, including function name, parentheses and arguments
        return (
            Function(name=func_name),
            Reference(
                location=CodeLocation(
                    file_path=ctx.file_path,
                    start_lineno=node.start_point[0] + 1,
                    start_col=node.start_point[1],
                    end_lineno=node.end_point[0] + 1,
                    end_col=node.end_point[1],
                    start_byte=node.start_byte,
                    end_byte=node.end_byte,
                ),
            ),
        )



[docs]
    def _extract_preceding_comment(self, node: Node, ctx: QueryContext) -> str | None:
        """Extract the preceding comment/documentation for a C function definition.

        Looks for comment nodes that appear immediately before the function definition.
        Handles both single-line (``//``) and multi-line (``/* */``) comment styles.

        Args:
            node: A function_definition syntax tree node.
            ctx: Query context containing file information.

        Returns:
            The comment text as a string, or None if not present.
        """
        # Look for comment nodes that precede this function definition
        current = node.prev_sibling
        comments = []

        # Traverse backwards through siblings to find comments
        while current:
            if current.type == "comment":
                comment_text = ctx.source_bytes[current.start_byte : current.end_byte].decode(
                    "utf8"
                )
                comments.append(self._clean_c_comment(comment_text))
            elif current.type not in [
                "preproc_include",
                "preproc_def",
                "preproc_ifdef",
                "preproc_ifndef",
                "preproc_endif",
                "preproc_else",
                "preproc_elif",
            ]:
                # Stop if we hit a non-comment, non-preprocessor node
                break
            current = current.prev_sibling

        if comments:
            # Reverse to get comments in the original order
            comments.reverse()
            return "\n".join(comments)

        return None



[docs]
    def _clean_c_comment(self, raw_comment: str) -> str:
        """Clean up a C comment by removing comment delimiters and normalizing whitespace.

        Args:
            raw_comment: The raw comment text including delimiters.

        Returns:
            The cleaned comment text.
        """
        # Remove comment delimiters
        if raw_comment.startswith("/*") and raw_comment.endswith("*/"):
            content = raw_comment[2:-2]
        elif raw_comment.startswith("//"):
            content = raw_comment[2:]
        else:
            content = raw_comment

        # Clean up whitespace and common comment formatting
        lines = content.split("\n")
        cleaned_lines = []
        for line in lines:
            # Remove leading whitespace and common comment prefixes
            line = line.strip()
            if line.startswith("* "):
                line = line[2:]
            elif line.startswith("*"):
                line = line[1:]
            cleaned_lines.append(line)

        return "\n".join(cleaned_lines).strip()